Actual source code: sbaij2.c
1: #include <../src/mat/impls/baij/seq/baij.h>
2: #include <../src/mat/impls/dense/seq/dense.h>
3: #include <../src/mat/impls/sbaij/seq/sbaij.h>
4: #include <petsc/private/kernels/blockinvert.h>
5: #include <petscbt.h>
6: #include <petscblaslapack.h>
8: PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov)
9: {
10: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
11: PetscInt brow, i, j, k, l, mbs, n, *nidx, isz, bcol, bcol_max, start, end, *ai, *aj, bs;
12: const PetscInt *idx;
13: PetscBT table_out, table_in;
15: PetscFunctionBegin;
16: PetscCheck(ov >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
17: mbs = a->mbs;
18: ai = a->i;
19: aj = a->j;
20: bs = A->rmap->bs;
21: PetscCall(PetscBTCreate(mbs, &table_out));
22: PetscCall(PetscMalloc1(mbs + 1, &nidx));
23: PetscCall(PetscBTCreate(mbs, &table_in));
25: for (i = 0; i < is_max; i++) { /* for each is */
26: isz = 0;
27: PetscCall(PetscBTMemzero(mbs, table_out));
29: /* Extract the indices, assume there can be duplicate entries */
30: PetscCall(ISGetIndices(is[i], &idx));
31: PetscCall(ISGetLocalSize(is[i], &n));
33: /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */
34: bcol_max = 0;
35: for (j = 0; j < n; ++j) {
36: brow = idx[j] / bs; /* convert the indices into block indices */
37: PetscCheck(brow < mbs, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "index greater than mat-dim");
38: if (!PetscBTLookupSet(table_out, brow)) {
39: nidx[isz++] = brow;
40: if (bcol_max < brow) bcol_max = brow;
41: }
42: }
43: PetscCall(ISRestoreIndices(is[i], &idx));
44: PetscCall(ISDestroy(&is[i]));
46: k = 0;
47: for (j = 0; j < ov; j++) { /* for each overlap */
48: /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
49: PetscCall(PetscBTMemzero(mbs, table_in));
50: for (l = k; l < isz; l++) PetscCall(PetscBTSet(table_in, nidx[l]));
52: n = isz; /* length of the updated is[i] */
53: for (brow = 0; brow < mbs; brow++) {
54: start = ai[brow];
55: end = ai[brow + 1];
56: if (PetscBTLookup(table_in, brow)) { /* brow is on nidx - row search: collect all bcol in this brow */
57: for (l = start; l < end; l++) {
58: bcol = aj[l];
59: if (!PetscBTLookupSet(table_out, bcol)) {
60: nidx[isz++] = bcol;
61: if (bcol_max < bcol) bcol_max = bcol;
62: }
63: }
64: k++;
65: if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
66: } else { /* brow is not on nidx - col search: add brow onto nidx if there is a bcol in nidx */
67: for (l = start; l < end; l++) {
68: bcol = aj[l];
69: if (bcol > bcol_max) break;
70: if (PetscBTLookup(table_in, bcol)) {
71: if (!PetscBTLookupSet(table_out, brow)) nidx[isz++] = brow;
72: break; /* for l = start; l<end ; l++) */
73: }
74: }
75: }
76: }
77: } /* for each overlap */
78: PetscCall(ISCreateBlock(PETSC_COMM_SELF, bs, isz, nidx, PETSC_COPY_VALUES, is + i));
79: } /* for each is */
80: PetscCall(PetscBTDestroy(&table_out));
81: PetscCall(PetscFree(nidx));
82: PetscCall(PetscBTDestroy(&table_in));
83: PetscFunctionReturn(PETSC_SUCCESS);
84: }
86: /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc.
87: Zero some ops' to avoid invalid use */
88: PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq)
89: {
90: PetscFunctionBegin;
91: PetscCall(MatSetOption(Bseq, MAT_SYMMETRIC, PETSC_FALSE));
92: Bseq->ops->mult = NULL;
93: Bseq->ops->multadd = NULL;
94: Bseq->ops->multtranspose = NULL;
95: Bseq->ops->multtransposeadd = NULL;
96: Bseq->ops->lufactor = NULL;
97: Bseq->ops->choleskyfactor = NULL;
98: Bseq->ops->lufactorsymbolic = NULL;
99: Bseq->ops->choleskyfactorsymbolic = NULL;
100: Bseq->ops->getinertia = NULL;
101: PetscFunctionReturn(PETSC_SUCCESS);
102: }
104: /* same as MatCreateSubMatrices_SeqBAIJ(), except cast Mat_SeqSBAIJ */
105: static PetscErrorCode MatCreateSubMatrix_SeqSBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B, PetscBool sym)
106: {
107: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *c = NULL;
108: Mat_SeqBAIJ *d = NULL;
109: PetscInt *smap, i, k, kstart, kend, oldcols = a->nbs, *lens;
110: PetscInt row, mat_i, *mat_j, tcol, *mat_ilen;
111: const PetscInt *irow, *icol;
112: PetscInt nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2;
113: PetscInt *aj = a->j, *ai = a->i;
114: MatScalar *mat_a;
115: Mat C;
116: PetscBool flag;
118: PetscFunctionBegin;
119: PetscCall(ISGetIndices(isrow, &irow));
120: PetscCall(ISGetIndices(iscol, &icol));
121: PetscCall(ISGetLocalSize(isrow, &nrows));
122: PetscCall(ISGetLocalSize(iscol, &ncols));
124: PetscCall(PetscCalloc1(1 + oldcols, &smap));
125: ssmap = smap;
126: PetscCall(PetscMalloc1(1 + nrows, &lens));
127: for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1;
128: /* determine lens of each row */
129: for (i = 0; i < nrows; i++) {
130: kstart = ai[irow[i]];
131: kend = kstart + a->ilen[irow[i]];
132: lens[i] = 0;
133: for (k = kstart; k < kend; k++) {
134: if (ssmap[aj[k]]) lens[i]++;
135: }
136: }
137: /* Create and fill new matrix */
138: if (scall == MAT_REUSE_MATRIX) {
139: if (sym) {
140: c = (Mat_SeqSBAIJ *)((*B)->data);
142: PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size");
143: PetscCall(PetscArraycmp(c->ilen, lens, c->mbs, &flag));
144: PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong number of nonzeros");
145: PetscCall(PetscArrayzero(c->ilen, c->mbs));
146: } else {
147: d = (Mat_SeqBAIJ *)((*B)->data);
149: PetscCheck(d->mbs == nrows && d->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size");
150: PetscCall(PetscArraycmp(d->ilen, lens, d->mbs, &flag));
151: PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong number of nonzeros");
152: PetscCall(PetscArrayzero(d->ilen, d->mbs));
153: }
154: C = *B;
155: } else {
156: PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &C));
157: PetscCall(MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE));
158: if (sym) {
159: PetscCall(MatSetType(C, ((PetscObject)A)->type_name));
160: PetscCall(MatSeqSBAIJSetPreallocation(C, bs, 0, lens));
161: } else {
162: PetscCall(MatSetType(C, MATSEQBAIJ));
163: PetscCall(MatSeqBAIJSetPreallocation(C, bs, 0, lens));
164: }
165: }
166: if (sym) c = (Mat_SeqSBAIJ *)C->data;
167: else d = (Mat_SeqBAIJ *)C->data;
168: for (i = 0; i < nrows; i++) {
169: row = irow[i];
170: kstart = ai[row];
171: kend = kstart + a->ilen[row];
172: if (sym) {
173: mat_i = c->i[i];
174: mat_j = PetscSafePointerPlusOffset(c->j, mat_i);
175: mat_a = PetscSafePointerPlusOffset(c->a, mat_i * bs2);
176: mat_ilen = c->ilen + i;
177: } else {
178: mat_i = d->i[i];
179: mat_j = PetscSafePointerPlusOffset(d->j, mat_i);
180: mat_a = PetscSafePointerPlusOffset(d->a, mat_i * bs2);
181: mat_ilen = d->ilen + i;
182: }
183: for (k = kstart; k < kend; k++) {
184: if ((tcol = ssmap[a->j[k]])) {
185: *mat_j++ = tcol - 1;
186: PetscCall(PetscArraycpy(mat_a, a->a + k * bs2, bs2));
187: mat_a += bs2;
188: (*mat_ilen)++;
189: }
190: }
191: }
192: /* sort */
193: {
194: MatScalar *work;
196: PetscCall(PetscMalloc1(bs2, &work));
197: for (i = 0; i < nrows; i++) {
198: PetscInt ilen;
199: if (sym) {
200: mat_i = c->i[i];
201: mat_j = PetscSafePointerPlusOffset(c->j, mat_i);
202: mat_a = PetscSafePointerPlusOffset(c->a, mat_i * bs2);
203: ilen = c->ilen[i];
204: } else {
205: mat_i = d->i[i];
206: mat_j = PetscSafePointerPlusOffset(d->j, mat_i);
207: mat_a = PetscSafePointerPlusOffset(d->a, mat_i * bs2);
208: ilen = d->ilen[i];
209: }
210: PetscCall(PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work));
211: }
212: PetscCall(PetscFree(work));
213: }
215: /* Free work space */
216: PetscCall(ISRestoreIndices(iscol, &icol));
217: PetscCall(PetscFree(smap));
218: PetscCall(PetscFree(lens));
219: PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
220: PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
222: PetscCall(ISRestoreIndices(isrow, &irow));
223: *B = C;
224: PetscFunctionReturn(PETSC_SUCCESS);
225: }
227: PetscErrorCode MatCreateSubMatrix_SeqSBAIJ(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
228: {
229: Mat C[2];
230: IS is1, is2, intersect = NULL;
231: PetscInt n1, n2, ni;
232: PetscBool sym = PETSC_TRUE;
234: PetscFunctionBegin;
235: PetscCall(ISCompressIndicesGeneral(A->rmap->N, A->rmap->n, A->rmap->bs, 1, &isrow, &is1));
236: if (isrow == iscol) {
237: is2 = is1;
238: PetscCall(PetscObjectReference((PetscObject)is2));
239: } else {
240: PetscCall(ISCompressIndicesGeneral(A->cmap->N, A->cmap->n, A->cmap->bs, 1, &iscol, &is2));
241: PetscCall(ISIntersect(is1, is2, &intersect));
242: PetscCall(ISGetLocalSize(intersect, &ni));
243: PetscCall(ISDestroy(&intersect));
244: if (ni == 0) sym = PETSC_FALSE;
245: else if (PetscDefined(USE_DEBUG)) {
246: PetscCall(ISGetLocalSize(is1, &n1));
247: PetscCall(ISGetLocalSize(is2, &n2));
248: PetscCheck(ni == n1 && ni == n2, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot create such a submatrix");
249: }
250: }
251: if (sym) PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A, is1, is2, scall, B, sym));
252: else {
253: PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A, is1, is2, MAT_INITIAL_MATRIX, C, sym));
254: PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A, is2, is1, MAT_INITIAL_MATRIX, C + 1, sym));
255: PetscCall(MatTranspose(C[1], MAT_INPLACE_MATRIX, C + 1));
256: PetscCall(MatAXPY(C[0], 1.0, C[1], DIFFERENT_NONZERO_PATTERN));
257: PetscCheck(scall != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
258: if (scall == MAT_REUSE_MATRIX) PetscCall(MatCopy(C[0], *B, SAME_NONZERO_PATTERN));
259: else if (A->rmap->bs == 1) PetscCall(MatConvert(C[0], MATAIJ, MAT_INITIAL_MATRIX, B));
260: else PetscCall(MatCopy(C[0], *B, SAME_NONZERO_PATTERN));
261: PetscCall(MatDestroy(C));
262: PetscCall(MatDestroy(C + 1));
263: }
264: PetscCall(ISDestroy(&is1));
265: PetscCall(ISDestroy(&is2));
267: if (sym && isrow != iscol) {
268: PetscBool isequal;
269: PetscCall(ISEqual(isrow, iscol, &isequal));
270: if (!isequal) PetscCall(MatSeqSBAIJZeroOps_Private(*B));
271: }
272: PetscFunctionReturn(PETSC_SUCCESS);
273: }
275: PetscErrorCode MatCreateSubMatrices_SeqSBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[])
276: {
277: PetscInt i;
279: PetscFunctionBegin;
280: if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(n, B));
282: for (i = 0; i < n; i++) PetscCall(MatCreateSubMatrix_SeqSBAIJ(A, irow[i], icol[i], scall, &(*B)[i]));
283: PetscFunctionReturn(PETSC_SUCCESS);
284: }
286: /* Should check that shapes of vectors and matrices match */
287: PetscErrorCode MatMult_SeqSBAIJ_2(Mat A, Vec xx, Vec zz)
288: {
289: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
290: PetscScalar *z, x1, x2, zero = 0.0;
291: const PetscScalar *x, *xb;
292: const MatScalar *v;
293: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
294: const PetscInt *aj = a->j, *ai = a->i, *ib;
295: PetscInt nonzerorow = 0;
297: PetscFunctionBegin;
298: PetscCall(VecSet(zz, zero));
299: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
300: PetscCall(VecGetArrayRead(xx, &x));
301: PetscCall(VecGetArray(zz, &z));
303: v = a->a;
304: xb = x;
306: for (i = 0; i < mbs; i++) {
307: n = ai[1] - ai[0]; /* length of i_th block row of A */
308: x1 = xb[0];
309: x2 = xb[1];
310: ib = aj + *ai;
311: jmin = 0;
312: nonzerorow += (n > 0);
313: if (*ib == i) { /* (diag of A)*x */
314: z[2 * i] += v[0] * x1 + v[2] * x2;
315: z[2 * i + 1] += v[2] * x1 + v[3] * x2;
316: v += 4;
317: jmin++;
318: }
319: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
320: PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
321: for (j = jmin; j < n; j++) {
322: /* (strict lower triangular part of A)*x */
323: cval = ib[j] * 2;
324: z[cval] += v[0] * x1 + v[1] * x2;
325: z[cval + 1] += v[2] * x1 + v[3] * x2;
326: /* (strict upper triangular part of A)*x */
327: z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1];
328: z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1];
329: v += 4;
330: }
331: xb += 2;
332: ai++;
333: }
335: PetscCall(VecRestoreArrayRead(xx, &x));
336: PetscCall(VecRestoreArray(zz, &z));
337: PetscCall(PetscLogFlops(8.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
338: PetscFunctionReturn(PETSC_SUCCESS);
339: }
341: PetscErrorCode MatMult_SeqSBAIJ_3(Mat A, Vec xx, Vec zz)
342: {
343: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
344: PetscScalar *z, x1, x2, x3, zero = 0.0;
345: const PetscScalar *x, *xb;
346: const MatScalar *v;
347: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
348: const PetscInt *aj = a->j, *ai = a->i, *ib;
349: PetscInt nonzerorow = 0;
351: PetscFunctionBegin;
352: PetscCall(VecSet(zz, zero));
353: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
354: PetscCall(VecGetArrayRead(xx, &x));
355: PetscCall(VecGetArray(zz, &z));
357: v = a->a;
358: xb = x;
360: for (i = 0; i < mbs; i++) {
361: n = ai[1] - ai[0]; /* length of i_th block row of A */
362: x1 = xb[0];
363: x2 = xb[1];
364: x3 = xb[2];
365: ib = aj + *ai;
366: jmin = 0;
367: nonzerorow += (n > 0);
368: if (*ib == i) { /* (diag of A)*x */
369: z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3;
370: z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3;
371: z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
372: v += 9;
373: jmin++;
374: }
375: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
376: PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
377: for (j = jmin; j < n; j++) {
378: /* (strict lower triangular part of A)*x */
379: cval = ib[j] * 3;
380: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3;
381: z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3;
382: z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
383: /* (strict upper triangular part of A)*x */
384: z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2];
385: z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2];
386: z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2];
387: v += 9;
388: }
389: xb += 3;
390: ai++;
391: }
393: PetscCall(VecRestoreArrayRead(xx, &x));
394: PetscCall(VecRestoreArray(zz, &z));
395: PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
396: PetscFunctionReturn(PETSC_SUCCESS);
397: }
399: PetscErrorCode MatMult_SeqSBAIJ_4(Mat A, Vec xx, Vec zz)
400: {
401: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
402: PetscScalar *z, x1, x2, x3, x4, zero = 0.0;
403: const PetscScalar *x, *xb;
404: const MatScalar *v;
405: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
406: const PetscInt *aj = a->j, *ai = a->i, *ib;
407: PetscInt nonzerorow = 0;
409: PetscFunctionBegin;
410: PetscCall(VecSet(zz, zero));
411: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
412: PetscCall(VecGetArrayRead(xx, &x));
413: PetscCall(VecGetArray(zz, &z));
415: v = a->a;
416: xb = x;
418: for (i = 0; i < mbs; i++) {
419: n = ai[1] - ai[0]; /* length of i_th block row of A */
420: x1 = xb[0];
421: x2 = xb[1];
422: x3 = xb[2];
423: x4 = xb[3];
424: ib = aj + *ai;
425: jmin = 0;
426: nonzerorow += (n > 0);
427: if (*ib == i) { /* (diag of A)*x */
428: z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
429: z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
430: z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4;
431: z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
432: v += 16;
433: jmin++;
434: }
435: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
436: PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
437: for (j = jmin; j < n; j++) {
438: /* (strict lower triangular part of A)*x */
439: cval = ib[j] * 4;
440: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
441: z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
442: z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
443: z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
444: /* (strict upper triangular part of A)*x */
445: z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3];
446: z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3];
447: z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3];
448: z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3];
449: v += 16;
450: }
451: xb += 4;
452: ai++;
453: }
455: PetscCall(VecRestoreArrayRead(xx, &x));
456: PetscCall(VecRestoreArray(zz, &z));
457: PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
458: PetscFunctionReturn(PETSC_SUCCESS);
459: }
461: PetscErrorCode MatMult_SeqSBAIJ_5(Mat A, Vec xx, Vec zz)
462: {
463: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
464: PetscScalar *z, x1, x2, x3, x4, x5, zero = 0.0;
465: const PetscScalar *x, *xb;
466: const MatScalar *v;
467: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
468: const PetscInt *aj = a->j, *ai = a->i, *ib;
469: PetscInt nonzerorow = 0;
471: PetscFunctionBegin;
472: PetscCall(VecSet(zz, zero));
473: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
474: PetscCall(VecGetArrayRead(xx, &x));
475: PetscCall(VecGetArray(zz, &z));
477: v = a->a;
478: xb = x;
480: for (i = 0; i < mbs; i++) {
481: n = ai[1] - ai[0]; /* length of i_th block row of A */
482: x1 = xb[0];
483: x2 = xb[1];
484: x3 = xb[2];
485: x4 = xb[3];
486: x5 = xb[4];
487: ib = aj + *ai;
488: jmin = 0;
489: nonzerorow += (n > 0);
490: if (*ib == i) { /* (diag of A)*x */
491: z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
492: z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
493: z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
494: z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5;
495: z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
496: v += 25;
497: jmin++;
498: }
499: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
500: PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
501: for (j = jmin; j < n; j++) {
502: /* (strict lower triangular part of A)*x */
503: cval = ib[j] * 5;
504: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
505: z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
506: z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
507: z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
508: z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
509: /* (strict upper triangular part of A)*x */
510: z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4];
511: z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4];
512: z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4];
513: z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4];
514: z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4];
515: v += 25;
516: }
517: xb += 5;
518: ai++;
519: }
521: PetscCall(VecRestoreArrayRead(xx, &x));
522: PetscCall(VecRestoreArray(zz, &z));
523: PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
524: PetscFunctionReturn(PETSC_SUCCESS);
525: }
527: PetscErrorCode MatMult_SeqSBAIJ_6(Mat A, Vec xx, Vec zz)
528: {
529: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
530: PetscScalar *z, x1, x2, x3, x4, x5, x6, zero = 0.0;
531: const PetscScalar *x, *xb;
532: const MatScalar *v;
533: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
534: const PetscInt *aj = a->j, *ai = a->i, *ib;
535: PetscInt nonzerorow = 0;
537: PetscFunctionBegin;
538: PetscCall(VecSet(zz, zero));
539: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
540: PetscCall(VecGetArrayRead(xx, &x));
541: PetscCall(VecGetArray(zz, &z));
543: v = a->a;
544: xb = x;
546: for (i = 0; i < mbs; i++) {
547: n = ai[1] - ai[0]; /* length of i_th block row of A */
548: x1 = xb[0];
549: x2 = xb[1];
550: x3 = xb[2];
551: x4 = xb[3];
552: x5 = xb[4];
553: x6 = xb[5];
554: ib = aj + *ai;
555: jmin = 0;
556: nonzerorow += (n > 0);
557: if (*ib == i) { /* (diag of A)*x */
558: z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
559: z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
560: z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
561: z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
562: z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6;
563: z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
564: v += 36;
565: jmin++;
566: }
567: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
568: PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
569: for (j = jmin; j < n; j++) {
570: /* (strict lower triangular part of A)*x */
571: cval = ib[j] * 6;
572: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6;
573: z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6;
574: z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6;
575: z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6;
576: z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6;
577: z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
578: /* (strict upper triangular part of A)*x */
579: z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5];
580: z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5];
581: z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5];
582: z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5];
583: z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5];
584: z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5];
585: v += 36;
586: }
587: xb += 6;
588: ai++;
589: }
591: PetscCall(VecRestoreArrayRead(xx, &x));
592: PetscCall(VecRestoreArray(zz, &z));
593: PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
594: PetscFunctionReturn(PETSC_SUCCESS);
595: }
597: PetscErrorCode MatMult_SeqSBAIJ_7(Mat A, Vec xx, Vec zz)
598: {
599: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
600: PetscScalar *z, x1, x2, x3, x4, x5, x6, x7, zero = 0.0;
601: const PetscScalar *x, *xb;
602: const MatScalar *v;
603: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
604: const PetscInt *aj = a->j, *ai = a->i, *ib;
605: PetscInt nonzerorow = 0;
607: PetscFunctionBegin;
608: PetscCall(VecSet(zz, zero));
609: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
610: PetscCall(VecGetArrayRead(xx, &x));
611: PetscCall(VecGetArray(zz, &z));
613: v = a->a;
614: xb = x;
616: for (i = 0; i < mbs; i++) {
617: n = ai[1] - ai[0]; /* length of i_th block row of A */
618: x1 = xb[0];
619: x2 = xb[1];
620: x3 = xb[2];
621: x4 = xb[3];
622: x5 = xb[4];
623: x6 = xb[5];
624: x7 = xb[6];
625: ib = aj + *ai;
626: jmin = 0;
627: nonzerorow += (n > 0);
628: if (*ib == i) { /* (diag of A)*x */
629: z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
630: z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
631: z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
632: z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
633: z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
634: z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7;
635: z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
636: v += 49;
637: jmin++;
638: }
639: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
640: PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
641: for (j = jmin; j < n; j++) {
642: /* (strict lower triangular part of A)*x */
643: cval = ib[j] * 7;
644: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7;
645: z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7;
646: z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7;
647: z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7;
648: z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7;
649: z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7;
650: z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
651: /* (strict upper triangular part of A)*x */
652: z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6];
653: z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6];
654: z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6];
655: z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6];
656: z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6];
657: z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6];
658: z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6];
659: v += 49;
660: }
661: xb += 7;
662: ai++;
663: }
664: PetscCall(VecRestoreArrayRead(xx, &x));
665: PetscCall(VecRestoreArray(zz, &z));
666: PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
667: PetscFunctionReturn(PETSC_SUCCESS);
668: }
670: /*
671: This will not work with MatScalar == float because it calls the BLAS
672: */
673: PetscErrorCode MatMult_SeqSBAIJ_N(Mat A, Vec xx, Vec zz)
674: {
675: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
676: PetscScalar *z, *z_ptr, *zb, *work, *workt, zero = 0.0;
677: const PetscScalar *x, *x_ptr, *xb;
678: const MatScalar *v;
679: PetscInt mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k;
680: const PetscInt *idx, *aj, *ii;
681: PetscInt nonzerorow = 0;
683: PetscFunctionBegin;
684: PetscCall(VecSet(zz, zero));
685: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
686: PetscCall(VecGetArrayRead(xx, &x));
687: PetscCall(VecGetArray(zz, &z));
689: x_ptr = x;
690: z_ptr = z;
692: aj = a->j;
693: v = a->a;
694: ii = a->i;
696: if (!a->mult_work) PetscCall(PetscMalloc1(A->rmap->N + 1, &a->mult_work));
697: work = a->mult_work;
699: for (i = 0; i < mbs; i++) {
700: n = ii[1] - ii[0];
701: ncols = n * bs;
702: workt = work;
703: idx = aj + ii[0];
704: nonzerorow += (n > 0);
706: /* upper triangular part */
707: for (j = 0; j < n; j++) {
708: xb = x_ptr + bs * (*idx++);
709: for (k = 0; k < bs; k++) workt[k] = xb[k];
710: workt += bs;
711: }
712: /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
713: PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z);
715: /* strict lower triangular part */
716: idx = aj + ii[0];
717: if (n && *idx == i) {
718: ncols -= bs;
719: v += bs2;
720: idx++;
721: n--;
722: }
724: if (ncols > 0) {
725: workt = work;
726: PetscCall(PetscArrayzero(workt, ncols));
727: PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt);
728: for (j = 0; j < n; j++) {
729: zb = z_ptr + bs * (*idx++);
730: for (k = 0; k < bs; k++) zb[k] += workt[k];
731: workt += bs;
732: }
733: }
734: x += bs;
735: v += n * bs2;
736: z += bs;
737: ii++;
738: }
740: PetscCall(VecRestoreArrayRead(xx, &x));
741: PetscCall(VecRestoreArray(zz, &z));
742: PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow) * bs2 - nonzerorow));
743: PetscFunctionReturn(PETSC_SUCCESS);
744: }
746: PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz)
747: {
748: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
749: PetscScalar *z, x1;
750: const PetscScalar *x, *xb;
751: const MatScalar *v;
752: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
753: const PetscInt *aj = a->j, *ai = a->i, *ib;
754: PetscInt nonzerorow = 0;
755: const int aconj = PetscDefined(USE_COMPLEX) && A->hermitian == PETSC_BOOL3_TRUE ? 1 : 0;
757: PetscFunctionBegin;
758: PetscCall(VecCopy(yy, zz));
759: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
760: PetscCall(VecGetArrayRead(xx, &x));
761: PetscCall(VecGetArray(zz, &z));
762: v = a->a;
763: xb = x;
765: for (i = 0; i < mbs; i++) {
766: n = ai[1] - ai[0]; /* length of i_th row of A */
767: x1 = xb[0];
768: ib = aj + *ai;
769: jmin = 0;
770: nonzerorow += (n > 0);
771: if (n && *ib == i) { /* (diag of A)*x */
772: z[i] += *v++ * x[*ib++];
773: jmin++;
774: }
775: if (aconj) {
776: for (j = jmin; j < n; j++) {
777: cval = *ib;
778: z[cval] += PetscConj(*v) * x1; /* (strict lower triangular part of A)*x */
779: z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */
780: }
781: } else {
782: for (j = jmin; j < n; j++) {
783: cval = *ib;
784: z[cval] += *v * x1; /* (strict lower triangular part of A)*x */
785: z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */
786: }
787: }
788: xb++;
789: ai++;
790: }
792: PetscCall(VecRestoreArrayRead(xx, &x));
793: PetscCall(VecRestoreArray(zz, &z));
795: PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow)));
796: PetscFunctionReturn(PETSC_SUCCESS);
797: }
799: PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz)
800: {
801: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
802: PetscScalar *z, x1, x2;
803: const PetscScalar *x, *xb;
804: const MatScalar *v;
805: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
806: const PetscInt *aj = a->j, *ai = a->i, *ib;
807: PetscInt nonzerorow = 0;
809: PetscFunctionBegin;
810: PetscCall(VecCopy(yy, zz));
811: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
812: PetscCall(VecGetArrayRead(xx, &x));
813: PetscCall(VecGetArray(zz, &z));
815: v = a->a;
816: xb = x;
818: for (i = 0; i < mbs; i++) {
819: n = ai[1] - ai[0]; /* length of i_th block row of A */
820: x1 = xb[0];
821: x2 = xb[1];
822: ib = aj + *ai;
823: jmin = 0;
824: nonzerorow += (n > 0);
825: if (n && *ib == i) { /* (diag of A)*x */
826: z[2 * i] += v[0] * x1 + v[2] * x2;
827: z[2 * i + 1] += v[2] * x1 + v[3] * x2;
828: v += 4;
829: jmin++;
830: }
831: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
832: PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
833: for (j = jmin; j < n; j++) {
834: /* (strict lower triangular part of A)*x */
835: cval = ib[j] * 2;
836: z[cval] += v[0] * x1 + v[1] * x2;
837: z[cval + 1] += v[2] * x1 + v[3] * x2;
838: /* (strict upper triangular part of A)*x */
839: z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1];
840: z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1];
841: v += 4;
842: }
843: xb += 2;
844: ai++;
845: }
846: PetscCall(VecRestoreArrayRead(xx, &x));
847: PetscCall(VecRestoreArray(zz, &z));
849: PetscCall(PetscLogFlops(4.0 * (a->nz * 2.0 - nonzerorow)));
850: PetscFunctionReturn(PETSC_SUCCESS);
851: }
853: PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz)
854: {
855: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
856: PetscScalar *z, x1, x2, x3;
857: const PetscScalar *x, *xb;
858: const MatScalar *v;
859: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
860: const PetscInt *aj = a->j, *ai = a->i, *ib;
861: PetscInt nonzerorow = 0;
863: PetscFunctionBegin;
864: PetscCall(VecCopy(yy, zz));
865: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
866: PetscCall(VecGetArrayRead(xx, &x));
867: PetscCall(VecGetArray(zz, &z));
869: v = a->a;
870: xb = x;
872: for (i = 0; i < mbs; i++) {
873: n = ai[1] - ai[0]; /* length of i_th block row of A */
874: x1 = xb[0];
875: x2 = xb[1];
876: x3 = xb[2];
877: ib = aj + *ai;
878: jmin = 0;
879: nonzerorow += (n > 0);
880: if (n && *ib == i) { /* (diag of A)*x */
881: z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3;
882: z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3;
883: z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
884: v += 9;
885: jmin++;
886: }
887: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
888: PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
889: for (j = jmin; j < n; j++) {
890: /* (strict lower triangular part of A)*x */
891: cval = ib[j] * 3;
892: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3;
893: z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3;
894: z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
895: /* (strict upper triangular part of A)*x */
896: z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2];
897: z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2];
898: z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2];
899: v += 9;
900: }
901: xb += 3;
902: ai++;
903: }
905: PetscCall(VecRestoreArrayRead(xx, &x));
906: PetscCall(VecRestoreArray(zz, &z));
908: PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow)));
909: PetscFunctionReturn(PETSC_SUCCESS);
910: }
912: PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz)
913: {
914: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
915: PetscScalar *z, x1, x2, x3, x4;
916: const PetscScalar *x, *xb;
917: const MatScalar *v;
918: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
919: const PetscInt *aj = a->j, *ai = a->i, *ib;
920: PetscInt nonzerorow = 0;
922: PetscFunctionBegin;
923: PetscCall(VecCopy(yy, zz));
924: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
925: PetscCall(VecGetArrayRead(xx, &x));
926: PetscCall(VecGetArray(zz, &z));
928: v = a->a;
929: xb = x;
931: for (i = 0; i < mbs; i++) {
932: n = ai[1] - ai[0]; /* length of i_th block row of A */
933: x1 = xb[0];
934: x2 = xb[1];
935: x3 = xb[2];
936: x4 = xb[3];
937: ib = aj + *ai;
938: jmin = 0;
939: nonzerorow += (n > 0);
940: if (n && *ib == i) { /* (diag of A)*x */
941: z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
942: z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
943: z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4;
944: z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
945: v += 16;
946: jmin++;
947: }
948: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
949: PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
950: for (j = jmin; j < n; j++) {
951: /* (strict lower triangular part of A)*x */
952: cval = ib[j] * 4;
953: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
954: z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
955: z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
956: z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
957: /* (strict upper triangular part of A)*x */
958: z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3];
959: z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3];
960: z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3];
961: z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3];
962: v += 16;
963: }
964: xb += 4;
965: ai++;
966: }
968: PetscCall(VecRestoreArrayRead(xx, &x));
969: PetscCall(VecRestoreArray(zz, &z));
971: PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow)));
972: PetscFunctionReturn(PETSC_SUCCESS);
973: }
975: PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz)
976: {
977: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
978: PetscScalar *z, x1, x2, x3, x4, x5;
979: const PetscScalar *x, *xb;
980: const MatScalar *v;
981: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
982: const PetscInt *aj = a->j, *ai = a->i, *ib;
983: PetscInt nonzerorow = 0;
985: PetscFunctionBegin;
986: PetscCall(VecCopy(yy, zz));
987: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
988: PetscCall(VecGetArrayRead(xx, &x));
989: PetscCall(VecGetArray(zz, &z));
991: v = a->a;
992: xb = x;
994: for (i = 0; i < mbs; i++) {
995: n = ai[1] - ai[0]; /* length of i_th block row of A */
996: x1 = xb[0];
997: x2 = xb[1];
998: x3 = xb[2];
999: x4 = xb[3];
1000: x5 = xb[4];
1001: ib = aj + *ai;
1002: jmin = 0;
1003: nonzerorow += (n > 0);
1004: if (n && *ib == i) { /* (diag of A)*x */
1005: z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
1006: z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
1007: z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
1008: z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5;
1009: z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
1010: v += 25;
1011: jmin++;
1012: }
1013: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1014: PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1015: for (j = jmin; j < n; j++) {
1016: /* (strict lower triangular part of A)*x */
1017: cval = ib[j] * 5;
1018: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
1019: z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
1020: z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
1021: z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
1022: z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
1023: /* (strict upper triangular part of A)*x */
1024: z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4];
1025: z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4];
1026: z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4];
1027: z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4];
1028: z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4];
1029: v += 25;
1030: }
1031: xb += 5;
1032: ai++;
1033: }
1035: PetscCall(VecRestoreArrayRead(xx, &x));
1036: PetscCall(VecRestoreArray(zz, &z));
1038: PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow)));
1039: PetscFunctionReturn(PETSC_SUCCESS);
1040: }
1042: PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz)
1043: {
1044: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1045: PetscScalar *z, x1, x2, x3, x4, x5, x6;
1046: const PetscScalar *x, *xb;
1047: const MatScalar *v;
1048: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
1049: const PetscInt *aj = a->j, *ai = a->i, *ib;
1050: PetscInt nonzerorow = 0;
1052: PetscFunctionBegin;
1053: PetscCall(VecCopy(yy, zz));
1054: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
1055: PetscCall(VecGetArrayRead(xx, &x));
1056: PetscCall(VecGetArray(zz, &z));
1058: v = a->a;
1059: xb = x;
1061: for (i = 0; i < mbs; i++) {
1062: n = ai[1] - ai[0]; /* length of i_th block row of A */
1063: x1 = xb[0];
1064: x2 = xb[1];
1065: x3 = xb[2];
1066: x4 = xb[3];
1067: x5 = xb[4];
1068: x6 = xb[5];
1069: ib = aj + *ai;
1070: jmin = 0;
1071: nonzerorow += (n > 0);
1072: if (n && *ib == i) { /* (diag of A)*x */
1073: z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
1074: z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
1075: z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
1076: z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
1077: z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6;
1078: z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
1079: v += 36;
1080: jmin++;
1081: }
1082: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1083: PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1084: for (j = jmin; j < n; j++) {
1085: /* (strict lower triangular part of A)*x */
1086: cval = ib[j] * 6;
1087: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6;
1088: z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6;
1089: z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6;
1090: z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6;
1091: z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6;
1092: z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
1093: /* (strict upper triangular part of A)*x */
1094: z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5];
1095: z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5];
1096: z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5];
1097: z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5];
1098: z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5];
1099: z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5];
1100: v += 36;
1101: }
1102: xb += 6;
1103: ai++;
1104: }
1106: PetscCall(VecRestoreArrayRead(xx, &x));
1107: PetscCall(VecRestoreArray(zz, &z));
1109: PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow)));
1110: PetscFunctionReturn(PETSC_SUCCESS);
1111: }
1113: PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz)
1114: {
1115: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1116: PetscScalar *z, x1, x2, x3, x4, x5, x6, x7;
1117: const PetscScalar *x, *xb;
1118: const MatScalar *v;
1119: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
1120: const PetscInt *aj = a->j, *ai = a->i, *ib;
1121: PetscInt nonzerorow = 0;
1123: PetscFunctionBegin;
1124: PetscCall(VecCopy(yy, zz));
1125: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
1126: PetscCall(VecGetArrayRead(xx, &x));
1127: PetscCall(VecGetArray(zz, &z));
1129: v = a->a;
1130: xb = x;
1132: for (i = 0; i < mbs; i++) {
1133: n = ai[1] - ai[0]; /* length of i_th block row of A */
1134: x1 = xb[0];
1135: x2 = xb[1];
1136: x3 = xb[2];
1137: x4 = xb[3];
1138: x5 = xb[4];
1139: x6 = xb[5];
1140: x7 = xb[6];
1141: ib = aj + *ai;
1142: jmin = 0;
1143: nonzerorow += (n > 0);
1144: if (n && *ib == i) { /* (diag of A)*x */
1145: z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
1146: z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
1147: z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
1148: z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
1149: z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
1150: z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7;
1151: z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
1152: v += 49;
1153: jmin++;
1154: }
1155: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1156: PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1157: for (j = jmin; j < n; j++) {
1158: /* (strict lower triangular part of A)*x */
1159: cval = ib[j] * 7;
1160: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7;
1161: z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7;
1162: z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7;
1163: z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7;
1164: z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7;
1165: z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7;
1166: z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
1167: /* (strict upper triangular part of A)*x */
1168: z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6];
1169: z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6];
1170: z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6];
1171: z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6];
1172: z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6];
1173: z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6];
1174: z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6];
1175: v += 49;
1176: }
1177: xb += 7;
1178: ai++;
1179: }
1181: PetscCall(VecRestoreArrayRead(xx, &x));
1182: PetscCall(VecRestoreArray(zz, &z));
1184: PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow)));
1185: PetscFunctionReturn(PETSC_SUCCESS);
1186: }
1188: PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz)
1189: {
1190: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1191: PetscScalar *z, *z_ptr = NULL, *zb, *work, *workt;
1192: const PetscScalar *x, *x_ptr, *xb;
1193: const MatScalar *v;
1194: PetscInt mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k;
1195: const PetscInt *idx, *aj, *ii;
1196: PetscInt nonzerorow = 0;
1198: PetscFunctionBegin;
1199: PetscCall(VecCopy(yy, zz));
1200: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
1201: PetscCall(VecGetArrayRead(xx, &x));
1202: x_ptr = x;
1203: PetscCall(VecGetArray(zz, &z));
1204: z_ptr = z;
1206: aj = a->j;
1207: v = a->a;
1208: ii = a->i;
1210: if (!a->mult_work) PetscCall(PetscMalloc1(A->rmap->n + 1, &a->mult_work));
1211: work = a->mult_work;
1213: for (i = 0; i < mbs; i++) {
1214: n = ii[1] - ii[0];
1215: ncols = n * bs;
1216: workt = work;
1217: idx = aj + ii[0];
1218: nonzerorow += (n > 0);
1220: /* upper triangular part */
1221: for (j = 0; j < n; j++) {
1222: xb = x_ptr + bs * (*idx++);
1223: for (k = 0; k < bs; k++) workt[k] = xb[k];
1224: workt += bs;
1225: }
1226: /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1227: PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z);
1229: /* strict lower triangular part */
1230: idx = aj + ii[0];
1231: if (n && *idx == i) {
1232: ncols -= bs;
1233: v += bs2;
1234: idx++;
1235: n--;
1236: }
1237: if (ncols > 0) {
1238: workt = work;
1239: PetscCall(PetscArrayzero(workt, ncols));
1240: PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt);
1241: for (j = 0; j < n; j++) {
1242: zb = z_ptr + bs * (*idx++);
1243: for (k = 0; k < bs; k++) zb[k] += workt[k];
1244: workt += bs;
1245: }
1246: }
1248: x += bs;
1249: v += n * bs2;
1250: z += bs;
1251: ii++;
1252: }
1254: PetscCall(VecRestoreArrayRead(xx, &x));
1255: PetscCall(VecRestoreArray(zz, &z));
1257: PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow)));
1258: PetscFunctionReturn(PETSC_SUCCESS);
1259: }
1261: PetscErrorCode MatScale_SeqSBAIJ(Mat inA, PetscScalar alpha)
1262: {
1263: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)inA->data;
1264: PetscScalar oalpha = alpha;
1265: PetscBLASInt one = 1, totalnz;
1267: PetscFunctionBegin;
1268: PetscCall(PetscBLASIntCast(a->bs2 * a->nz, &totalnz));
1269: PetscCallBLAS("BLASscal", BLASscal_(&totalnz, &oalpha, a->a, &one));
1270: PetscCall(PetscLogFlops(totalnz));
1271: PetscFunctionReturn(PETSC_SUCCESS);
1272: }
1274: PetscErrorCode MatNorm_SeqSBAIJ(Mat A, NormType type, PetscReal *norm)
1275: {
1276: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1277: const MatScalar *v = a->a;
1278: PetscReal sum_diag = 0.0, sum_off = 0.0, *sum;
1279: PetscInt i, j, k, bs = A->rmap->bs, bs2 = a->bs2, k1, mbs = a->mbs, jmin, jmax, nexti, ik, *jl, *il;
1280: const PetscInt *aj = a->j, *col;
1282: PetscFunctionBegin;
1283: if (!a->nz) {
1284: *norm = 0.0;
1285: PetscFunctionReturn(PETSC_SUCCESS);
1286: }
1287: if (type == NORM_FROBENIUS) {
1288: for (k = 0; k < mbs; k++) {
1289: jmin = a->i[k];
1290: jmax = a->i[k + 1];
1291: col = aj + jmin;
1292: if (jmax - jmin > 0 && *col == k) { /* diagonal block */
1293: for (i = 0; i < bs2; i++) {
1294: sum_diag += PetscRealPart(PetscConj(*v) * (*v));
1295: v++;
1296: }
1297: jmin++;
1298: }
1299: for (j = jmin; j < jmax; j++) { /* off-diagonal blocks */
1300: for (i = 0; i < bs2; i++) {
1301: sum_off += PetscRealPart(PetscConj(*v) * (*v));
1302: v++;
1303: }
1304: }
1305: }
1306: *norm = PetscSqrtReal(sum_diag + 2 * sum_off);
1307: PetscCall(PetscLogFlops(2.0 * bs2 * a->nz));
1308: } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */
1309: PetscCall(PetscMalloc3(bs, &sum, mbs, &il, mbs, &jl));
1310: for (i = 0; i < mbs; i++) jl[i] = mbs;
1311: il[0] = 0;
1313: *norm = 0.0;
1314: for (k = 0; k < mbs; k++) { /* k_th block row */
1315: for (j = 0; j < bs; j++) sum[j] = 0.0;
1316: /*-- col sum --*/
1317: i = jl[k]; /* first |A(i,k)| to be added */
1318: /* jl[k]=i: first nonzero element in row i for submatrix A(1:k,k:n) (active window)
1319: at step k */
1320: while (i < mbs) {
1321: nexti = jl[i]; /* next block row to be added */
1322: ik = il[i]; /* block index of A(i,k) in the array a */
1323: for (j = 0; j < bs; j++) {
1324: v = a->a + ik * bs2 + j * bs;
1325: for (k1 = 0; k1 < bs; k1++) {
1326: sum[j] += PetscAbsScalar(*v);
1327: v++;
1328: }
1329: }
1330: /* update il, jl */
1331: jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1332: jmax = a->i[i + 1];
1333: if (jmin < jmax) {
1334: il[i] = jmin;
1335: j = a->j[jmin];
1336: jl[i] = jl[j];
1337: jl[j] = i;
1338: }
1339: i = nexti;
1340: }
1341: /*-- row sum --*/
1342: jmin = a->i[k];
1343: jmax = a->i[k + 1];
1344: for (i = jmin; i < jmax; i++) {
1345: for (j = 0; j < bs; j++) {
1346: v = a->a + i * bs2 + j;
1347: for (k1 = 0; k1 < bs; k1++) {
1348: sum[j] += PetscAbsScalar(*v);
1349: v += bs;
1350: }
1351: }
1352: }
1353: /* add k_th block row to il, jl */
1354: col = aj + jmin;
1355: if (jmax - jmin > 0 && *col == k) jmin++;
1356: if (jmin < jmax) {
1357: il[k] = jmin;
1358: j = a->j[jmin];
1359: jl[k] = jl[j];
1360: jl[j] = k;
1361: }
1362: for (j = 0; j < bs; j++) {
1363: if (sum[j] > *norm) *norm = sum[j];
1364: }
1365: }
1366: PetscCall(PetscFree3(sum, il, jl));
1367: PetscCall(PetscLogFlops(PetscMax(mbs * a->nz - 1, 0)));
1368: } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet");
1369: PetscFunctionReturn(PETSC_SUCCESS);
1370: }
1372: PetscErrorCode MatEqual_SeqSBAIJ(Mat A, Mat B, PetscBool *flg)
1373: {
1374: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *b = (Mat_SeqSBAIJ *)B->data;
1376: PetscFunctionBegin;
1377: /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */
1378: if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs) || (a->nz != b->nz)) {
1379: *flg = PETSC_FALSE;
1380: PetscFunctionReturn(PETSC_SUCCESS);
1381: }
1383: /* if the a->i are the same */
1384: PetscCall(PetscArraycmp(a->i, b->i, a->mbs + 1, flg));
1385: if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);
1387: /* if a->j are the same */
1388: PetscCall(PetscArraycmp(a->j, b->j, a->nz, flg));
1389: if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);
1391: /* if a->a are the same */
1392: PetscCall(PetscArraycmp(a->a, b->a, a->nz * A->rmap->bs * A->rmap->bs, flg));
1393: PetscFunctionReturn(PETSC_SUCCESS);
1394: }
1396: PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A, Vec v)
1397: {
1398: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1399: PetscInt n;
1400: const PetscInt bs = A->rmap->bs, ambs = a->mbs, bs2 = a->bs2;
1401: PetscScalar *x;
1402: const MatScalar *aa = a->a, *aa_j;
1403: const PetscInt *ai = a->i, *adiag;
1404: PetscBool diagDense;
1406: PetscFunctionBegin;
1407: PetscCheck(!A->factortype || bs <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix with bs>1");
1408: PetscCall(MatGetDiagonalMarkers_SeqSBAIJ(A, &adiag, &diagDense));
1409: if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) {
1410: PetscCall(VecGetArrayWrite(v, &x));
1411: for (PetscInt i = 0; i < ambs; i++) x[i] = 1.0 / aa[adiag[i]];
1412: PetscCall(VecRestoreArrayWrite(v, &x));
1413: PetscFunctionReturn(PETSC_SUCCESS);
1414: }
1416: PetscCall(VecGetLocalSize(v, &n));
1417: PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
1418: PetscCall(VecGetArrayWrite(v, &x));
1420: if (diagDense) {
1421: for (PetscInt i = 0, row = 0; i < ambs; i++) {
1422: aa_j = aa + adiag[i] * bs2;
1423: for (PetscInt k = 0; k < bs2; k += (bs + 1)) x[row++] = aa_j[k];
1424: }
1425: } else {
1426: for (PetscInt i = 0, row = 0; i < ambs; i++) {
1427: const PetscInt j = adiag[i];
1429: if (j != ai[i + 1]) {
1430: aa_j = aa + j * bs2;
1431: for (PetscInt k = 0; k < bs2; k += (bs + 1)) x[row++] = aa_j[k];
1432: } else {
1433: for (PetscInt k = 0; k < bs; k++) x[row++] = 0.0;
1434: }
1435: }
1436: }
1437: PetscCall(VecRestoreArrayWrite(v, &x));
1438: PetscFunctionReturn(PETSC_SUCCESS);
1439: }
1441: PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A, Vec ll, Vec rr)
1442: {
1443: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1444: PetscScalar x;
1445: const PetscScalar *l, *li, *ri;
1446: MatScalar *aa, *v;
1447: PetscInt i, j, k, lm, M, m, mbs, tmp, bs, bs2;
1448: const PetscInt *ai, *aj;
1450: PetscFunctionBegin;
1451: if (!ll) PetscFunctionReturn(PETSC_SUCCESS);
1452: ai = a->i;
1453: aj = a->j;
1454: aa = a->a;
1455: m = A->rmap->N;
1456: bs = A->rmap->bs;
1457: mbs = a->mbs;
1458: bs2 = a->bs2;
1460: PetscCall(VecGetArrayRead(ll, &l));
1461: PetscCall(VecGetLocalSize(ll, &lm));
1462: PetscCheck(lm == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
1463: for (i = 0; i < mbs; i++) { /* for each block row */
1464: M = ai[i + 1] - ai[i];
1465: li = l + i * bs;
1466: v = aa + bs2 * ai[i];
1467: for (j = 0; j < M; j++) { /* for each block */
1468: ri = l + bs * aj[ai[i] + j];
1469: for (k = 0; k < bs; k++) {
1470: x = ri[k];
1471: for (tmp = 0; tmp < bs; tmp++) (*v++) *= li[tmp] * x;
1472: }
1473: }
1474: }
1475: PetscCall(VecRestoreArrayRead(ll, &l));
1476: PetscCall(PetscLogFlops(2.0 * a->nz));
1477: PetscFunctionReturn(PETSC_SUCCESS);
1478: }
1480: PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A, MatInfoType flag, MatInfo *info)
1481: {
1482: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1484: PetscFunctionBegin;
1485: info->block_size = a->bs2;
1486: info->nz_allocated = a->bs2 * a->maxnz; /*num. of nonzeros in upper triangular part */
1487: info->nz_used = a->bs2 * a->nz; /*num. of nonzeros in upper triangular part */
1488: info->nz_unneeded = info->nz_allocated - info->nz_used;
1489: info->assemblies = A->num_ass;
1490: info->mallocs = A->info.mallocs;
1491: info->memory = 0; /* REVIEW ME */
1492: if (A->factortype) {
1493: info->fill_ratio_given = A->info.fill_ratio_given;
1494: info->fill_ratio_needed = A->info.fill_ratio_needed;
1495: info->factor_mallocs = A->info.factor_mallocs;
1496: } else {
1497: info->fill_ratio_given = 0;
1498: info->fill_ratio_needed = 0;
1499: info->factor_mallocs = 0;
1500: }
1501: PetscFunctionReturn(PETSC_SUCCESS);
1502: }
1504: PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
1505: {
1506: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1508: PetscFunctionBegin;
1509: PetscCall(PetscArrayzero(a->a, a->bs2 * a->i[a->mbs]));
1510: PetscFunctionReturn(PETSC_SUCCESS);
1511: }
1513: PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A, Vec v, PetscInt idx[])
1514: {
1515: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1516: PetscInt i, j, n, row, col, bs, mbs;
1517: const PetscInt *ai, *aj;
1518: PetscReal atmp;
1519: const MatScalar *aa;
1520: PetscScalar *x;
1521: PetscInt ncols, brow, bcol, krow, kcol;
1523: PetscFunctionBegin;
1524: PetscCheck(!idx, PETSC_COMM_SELF, PETSC_ERR_SUP, "Send email to petsc-maint@mcs.anl.gov");
1525: PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
1526: bs = A->rmap->bs;
1527: aa = a->a;
1528: ai = a->i;
1529: aj = a->j;
1530: mbs = a->mbs;
1532: PetscCall(VecSet(v, 0.0));
1533: PetscCall(VecGetArray(v, &x));
1534: PetscCall(VecGetLocalSize(v, &n));
1535: PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
1536: for (i = 0; i < mbs; i++) {
1537: ncols = ai[1] - ai[0];
1538: ai++;
1539: brow = bs * i;
1540: for (j = 0; j < ncols; j++) {
1541: bcol = bs * (*aj);
1542: for (kcol = 0; kcol < bs; kcol++) {
1543: col = bcol + kcol; /* col index */
1544: for (krow = 0; krow < bs; krow++) {
1545: atmp = PetscAbsScalar(*aa);
1546: aa++;
1547: row = brow + krow; /* row index */
1548: if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1549: if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1550: }
1551: }
1552: aj++;
1553: }
1554: }
1555: PetscCall(VecRestoreArray(v, &x));
1556: PetscFunctionReturn(PETSC_SUCCESS);
1557: }
1559: PetscErrorCode MatMatMultSymbolic_SeqSBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C)
1560: {
1561: PetscFunctionBegin;
1562: PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C));
1563: C->ops->matmultnumeric = MatMatMultNumeric_SeqSBAIJ_SeqDense;
1564: PetscFunctionReturn(PETSC_SUCCESS);
1565: }
1567: static PetscErrorCode MatMatMult_SeqSBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1568: {
1569: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1570: PetscScalar *z = c;
1571: const PetscScalar *xb;
1572: PetscScalar x1;
1573: const MatScalar *v = a->a, *vv;
1574: PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1575: const int aconj = PetscDefined(USE_COMPLEX) && A->hermitian == PETSC_BOOL3_TRUE ? 1 : 0;
1577: PetscFunctionBegin;
1578: for (i = 0; i < mbs; i++) {
1579: n = ii[1] - ii[0];
1580: ii++;
1581: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1582: PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1583: jj = idx;
1584: vv = v;
1585: for (k = 0; k < cn; k++) {
1586: idx = jj;
1587: v = vv;
1588: for (j = 0; j < n; j++) {
1589: xb = b + (*idx);
1590: x1 = xb[0 + k * bm];
1591: z[0 + k * cm] += v[0] * x1;
1592: if (*idx != i) c[(*idx) + k * cm] += (aconj ? PetscConj(v[0]) : v[0]) * b[i + k * bm];
1593: v += 1;
1594: ++idx;
1595: }
1596: }
1597: z += 1;
1598: }
1599: PetscFunctionReturn(PETSC_SUCCESS);
1600: }
1602: static PetscErrorCode MatMatMult_SeqSBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1603: {
1604: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1605: PetscScalar *z = c;
1606: const PetscScalar *xb;
1607: PetscScalar x1, x2;
1608: const MatScalar *v = a->a, *vv;
1609: PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1611: PetscFunctionBegin;
1612: for (i = 0; i < mbs; i++) {
1613: n = ii[1] - ii[0];
1614: ii++;
1615: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1616: PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1617: jj = idx;
1618: vv = v;
1619: for (k = 0; k < cn; k++) {
1620: idx = jj;
1621: v = vv;
1622: for (j = 0; j < n; j++) {
1623: xb = b + 2 * (*idx);
1624: x1 = xb[0 + k * bm];
1625: x2 = xb[1 + k * bm];
1626: z[0 + k * cm] += v[0] * x1 + v[2] * x2;
1627: z[1 + k * cm] += v[1] * x1 + v[3] * x2;
1628: if (*idx != i) {
1629: c[2 * (*idx) + 0 + k * cm] += v[0] * b[2 * i + k * bm] + v[1] * b[2 * i + 1 + k * bm];
1630: c[2 * (*idx) + 1 + k * cm] += v[2] * b[2 * i + k * bm] + v[3] * b[2 * i + 1 + k * bm];
1631: }
1632: v += 4;
1633: ++idx;
1634: }
1635: }
1636: z += 2;
1637: }
1638: PetscFunctionReturn(PETSC_SUCCESS);
1639: }
1641: static PetscErrorCode MatMatMult_SeqSBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1642: {
1643: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1644: PetscScalar *z = c;
1645: const PetscScalar *xb;
1646: PetscScalar x1, x2, x3;
1647: const MatScalar *v = a->a, *vv;
1648: PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1650: PetscFunctionBegin;
1651: for (i = 0; i < mbs; i++) {
1652: n = ii[1] - ii[0];
1653: ii++;
1654: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1655: PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1656: jj = idx;
1657: vv = v;
1658: for (k = 0; k < cn; k++) {
1659: idx = jj;
1660: v = vv;
1661: for (j = 0; j < n; j++) {
1662: xb = b + 3 * (*idx);
1663: x1 = xb[0 + k * bm];
1664: x2 = xb[1 + k * bm];
1665: x3 = xb[2 + k * bm];
1666: z[0 + k * cm] += v[0] * x1 + v[3] * x2 + v[6] * x3;
1667: z[1 + k * cm] += v[1] * x1 + v[4] * x2 + v[7] * x3;
1668: z[2 + k * cm] += v[2] * x1 + v[5] * x2 + v[8] * x3;
1669: if (*idx != i) {
1670: c[3 * (*idx) + 0 + k * cm] += v[0] * b[3 * i + k * bm] + v[3] * b[3 * i + 1 + k * bm] + v[6] * b[3 * i + 2 + k * bm];
1671: c[3 * (*idx) + 1 + k * cm] += v[1] * b[3 * i + k * bm] + v[4] * b[3 * i + 1 + k * bm] + v[7] * b[3 * i + 2 + k * bm];
1672: c[3 * (*idx) + 2 + k * cm] += v[2] * b[3 * i + k * bm] + v[5] * b[3 * i + 1 + k * bm] + v[8] * b[3 * i + 2 + k * bm];
1673: }
1674: v += 9;
1675: ++idx;
1676: }
1677: }
1678: z += 3;
1679: }
1680: PetscFunctionReturn(PETSC_SUCCESS);
1681: }
1683: static PetscErrorCode MatMatMult_SeqSBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1684: {
1685: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1686: PetscScalar *z = c;
1687: const PetscScalar *xb;
1688: PetscScalar x1, x2, x3, x4;
1689: const MatScalar *v = a->a, *vv;
1690: PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1692: PetscFunctionBegin;
1693: for (i = 0; i < mbs; i++) {
1694: n = ii[1] - ii[0];
1695: ii++;
1696: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1697: PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1698: jj = idx;
1699: vv = v;
1700: for (k = 0; k < cn; k++) {
1701: idx = jj;
1702: v = vv;
1703: for (j = 0; j < n; j++) {
1704: xb = b + 4 * (*idx);
1705: x1 = xb[0 + k * bm];
1706: x2 = xb[1 + k * bm];
1707: x3 = xb[2 + k * bm];
1708: x4 = xb[3 + k * bm];
1709: z[0 + k * cm] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
1710: z[1 + k * cm] += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
1711: z[2 + k * cm] += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
1712: z[3 + k * cm] += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
1713: if (*idx != i) {
1714: c[4 * (*idx) + 0 + k * cm] += v[0] * b[4 * i + k * bm] + v[4] * b[4 * i + 1 + k * bm] + v[8] * b[4 * i + 2 + k * bm] + v[12] * b[4 * i + 3 + k * bm];
1715: c[4 * (*idx) + 1 + k * cm] += v[1] * b[4 * i + k * bm] + v[5] * b[4 * i + 1 + k * bm] + v[9] * b[4 * i + 2 + k * bm] + v[13] * b[4 * i + 3 + k * bm];
1716: c[4 * (*idx) + 2 + k * cm] += v[2] * b[4 * i + k * bm] + v[6] * b[4 * i + 1 + k * bm] + v[10] * b[4 * i + 2 + k * bm] + v[14] * b[4 * i + 3 + k * bm];
1717: c[4 * (*idx) + 3 + k * cm] += v[3] * b[4 * i + k * bm] + v[7] * b[4 * i + 1 + k * bm] + v[11] * b[4 * i + 2 + k * bm] + v[15] * b[4 * i + 3 + k * bm];
1718: }
1719: v += 16;
1720: ++idx;
1721: }
1722: }
1723: z += 4;
1724: }
1725: PetscFunctionReturn(PETSC_SUCCESS);
1726: }
1728: static PetscErrorCode MatMatMult_SeqSBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1729: {
1730: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1731: PetscScalar *z = c;
1732: const PetscScalar *xb;
1733: PetscScalar x1, x2, x3, x4, x5;
1734: const MatScalar *v = a->a, *vv;
1735: PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1737: PetscFunctionBegin;
1738: for (i = 0; i < mbs; i++) {
1739: n = ii[1] - ii[0];
1740: ii++;
1741: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1742: PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1743: jj = idx;
1744: vv = v;
1745: for (k = 0; k < cn; k++) {
1746: idx = jj;
1747: v = vv;
1748: for (j = 0; j < n; j++) {
1749: xb = b + 5 * (*idx);
1750: x1 = xb[0 + k * bm];
1751: x2 = xb[1 + k * bm];
1752: x3 = xb[2 + k * bm];
1753: x4 = xb[3 + k * bm];
1754: x5 = xb[4 + k * cm];
1755: z[0 + k * cm] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
1756: z[1 + k * cm] += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
1757: z[2 + k * cm] += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
1758: z[3 + k * cm] += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
1759: z[4 + k * cm] += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
1760: if (*idx != i) {
1761: c[5 * (*idx) + 0 + k * cm] += v[0] * b[5 * i + k * bm] + v[5] * b[5 * i + 1 + k * bm] + v[10] * b[5 * i + 2 + k * bm] + v[15] * b[5 * i + 3 + k * bm] + v[20] * b[5 * i + 4 + k * bm];
1762: c[5 * (*idx) + 1 + k * cm] += v[1] * b[5 * i + k * bm] + v[6] * b[5 * i + 1 + k * bm] + v[11] * b[5 * i + 2 + k * bm] + v[16] * b[5 * i + 3 + k * bm] + v[21] * b[5 * i + 4 + k * bm];
1763: c[5 * (*idx) + 2 + k * cm] += v[2] * b[5 * i + k * bm] + v[7] * b[5 * i + 1 + k * bm] + v[12] * b[5 * i + 2 + k * bm] + v[17] * b[5 * i + 3 + k * bm] + v[22] * b[5 * i + 4 + k * bm];
1764: c[5 * (*idx) + 3 + k * cm] += v[3] * b[5 * i + k * bm] + v[8] * b[5 * i + 1 + k * bm] + v[13] * b[5 * i + 2 + k * bm] + v[18] * b[5 * i + 3 + k * bm] + v[23] * b[5 * i + 4 + k * bm];
1765: c[5 * (*idx) + 4 + k * cm] += v[4] * b[5 * i + k * bm] + v[9] * b[5 * i + 1 + k * bm] + v[14] * b[5 * i + 2 + k * bm] + v[19] * b[5 * i + 3 + k * bm] + v[24] * b[5 * i + 4 + k * bm];
1766: }
1767: v += 25;
1768: ++idx;
1769: }
1770: }
1771: z += 5;
1772: }
1773: PetscFunctionReturn(PETSC_SUCCESS);
1774: }
1776: PetscErrorCode MatMatMultNumeric_SeqSBAIJ_SeqDense(Mat A, Mat B, Mat C)
1777: {
1778: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1779: Mat_SeqDense *bd = (Mat_SeqDense *)B->data;
1780: Mat_SeqDense *cd = (Mat_SeqDense *)C->data;
1781: PetscInt cm = cd->lda, cn = B->cmap->n, bm = bd->lda;
1782: PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
1783: PetscBLASInt bbs, bcn, bbm, bcm;
1784: PetscScalar *z = NULL;
1785: PetscScalar *c, *b;
1786: const MatScalar *v;
1787: const PetscInt *idx, *ii;
1788: PetscScalar _DOne = 1.0;
1790: PetscFunctionBegin;
1791: if (!cm || !cn) PetscFunctionReturn(PETSC_SUCCESS);
1792: PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n);
1793: PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n);
1794: PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n);
1795: b = bd->v;
1796: PetscCall(MatZeroEntries(C));
1797: PetscCall(MatDenseGetArray(C, &c));
1798: switch (bs) {
1799: case 1:
1800: PetscCall(MatMatMult_SeqSBAIJ_1_Private(A, b, bm, c, cm, cn));
1801: break;
1802: case 2:
1803: PetscCall(MatMatMult_SeqSBAIJ_2_Private(A, b, bm, c, cm, cn));
1804: break;
1805: case 3:
1806: PetscCall(MatMatMult_SeqSBAIJ_3_Private(A, b, bm, c, cm, cn));
1807: break;
1808: case 4:
1809: PetscCall(MatMatMult_SeqSBAIJ_4_Private(A, b, bm, c, cm, cn));
1810: break;
1811: case 5:
1812: PetscCall(MatMatMult_SeqSBAIJ_5_Private(A, b, bm, c, cm, cn));
1813: break;
1814: default: /* block sizes larger than 5 by 5 are handled by BLAS */
1815: PetscCall(PetscBLASIntCast(bs, &bbs));
1816: PetscCall(PetscBLASIntCast(cn, &bcn));
1817: PetscCall(PetscBLASIntCast(bm, &bbm));
1818: PetscCall(PetscBLASIntCast(cm, &bcm));
1819: idx = a->j;
1820: v = a->a;
1821: mbs = a->mbs;
1822: ii = a->i;
1823: z = c;
1824: for (i = 0; i < mbs; i++) {
1825: n = ii[1] - ii[0];
1826: ii++;
1827: for (j = 0; j < n; j++) {
1828: if (*idx != i) PetscCallBLAS("BLASgemm", BLASgemm_("T", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * i, &bbm, &_DOne, c + bs * (*idx), &bcm));
1829: PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DOne, z, &bcm));
1830: v += bs2;
1831: }
1832: z += bs;
1833: }
1834: }
1835: PetscCall(MatDenseRestoreArray(C, &c));
1836: PetscCall(PetscLogFlops((2.0 * (a->nz * 2.0 - a->nonzerorowcnt) * bs2 - a->nonzerorowcnt) * cn));
1837: PetscFunctionReturn(PETSC_SUCCESS);
1838: }