Actual source code: sbaij2.c
1: #include <../src/mat/impls/baij/seq/baij.h>
2: #include <../src/mat/impls/dense/seq/dense.h>
3: #include <../src/mat/impls/sbaij/seq/sbaij.h>
4: #include <petsc/private/kernels/blockinvert.h>
5: #include <petscbt.h>
6: #include <petscblaslapack.h>
8: PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov)
9: {
10: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
11: PetscInt brow, i, j, k, l, mbs, n, *nidx, isz, bcol, bcol_max, start, end, *ai, *aj, bs;
12: const PetscInt *idx;
13: PetscBT table_out, table_in;
15: PetscFunctionBegin;
16: PetscCheck(ov >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
17: mbs = a->mbs;
18: ai = a->i;
19: aj = a->j;
20: bs = A->rmap->bs;
21: PetscCall(PetscBTCreate(mbs, &table_out));
22: PetscCall(PetscMalloc1(mbs + 1, &nidx));
23: PetscCall(PetscBTCreate(mbs, &table_in));
25: for (i = 0; i < is_max; i++) { /* for each is */
26: isz = 0;
27: PetscCall(PetscBTMemzero(mbs, table_out));
29: /* Extract the indices, assume there can be duplicate entries */
30: PetscCall(ISGetIndices(is[i], &idx));
31: PetscCall(ISGetLocalSize(is[i], &n));
33: /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */
34: bcol_max = 0;
35: for (j = 0; j < n; ++j) {
36: brow = idx[j] / bs; /* convert the indices into block indices */
37: PetscCheck(brow < mbs, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "index greater than mat-dim");
38: if (!PetscBTLookupSet(table_out, brow)) {
39: nidx[isz++] = brow;
40: if (bcol_max < brow) bcol_max = brow;
41: }
42: }
43: PetscCall(ISRestoreIndices(is[i], &idx));
44: PetscCall(ISDestroy(&is[i]));
46: k = 0;
47: for (j = 0; j < ov; j++) { /* for each overlap */
48: /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
49: PetscCall(PetscBTMemzero(mbs, table_in));
50: for (l = k; l < isz; l++) PetscCall(PetscBTSet(table_in, nidx[l]));
52: n = isz; /* length of the updated is[i] */
53: for (brow = 0; brow < mbs; brow++) {
54: start = ai[brow];
55: end = ai[brow + 1];
56: if (PetscBTLookup(table_in, brow)) { /* brow is on nidx - row search: collect all bcol in this brow */
57: for (l = start; l < end; l++) {
58: bcol = aj[l];
59: if (!PetscBTLookupSet(table_out, bcol)) {
60: nidx[isz++] = bcol;
61: if (bcol_max < bcol) bcol_max = bcol;
62: }
63: }
64: k++;
65: if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
66: } else { /* brow is not on nidx - col search: add brow onto nidx if there is a bcol in nidx */
67: for (l = start; l < end; l++) {
68: bcol = aj[l];
69: if (bcol > bcol_max) break;
70: if (PetscBTLookup(table_in, bcol)) {
71: if (!PetscBTLookupSet(table_out, brow)) nidx[isz++] = brow;
72: break; /* for l = start; l<end ; l++) */
73: }
74: }
75: }
76: }
77: } /* for each overlap */
78: PetscCall(ISCreateBlock(PETSC_COMM_SELF, bs, isz, nidx, PETSC_COPY_VALUES, is + i));
79: } /* for each is */
80: PetscCall(PetscBTDestroy(&table_out));
81: PetscCall(PetscFree(nidx));
82: PetscCall(PetscBTDestroy(&table_in));
83: PetscFunctionReturn(PETSC_SUCCESS);
84: }
86: /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc.
87: Zero some ops' to avoid invalid use */
88: PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq)
89: {
90: PetscFunctionBegin;
91: PetscCall(MatSetOption(Bseq, MAT_SYMMETRIC, PETSC_FALSE));
92: Bseq->ops->mult = NULL;
93: Bseq->ops->multadd = NULL;
94: Bseq->ops->multtranspose = NULL;
95: Bseq->ops->multtransposeadd = NULL;
96: Bseq->ops->lufactor = NULL;
97: Bseq->ops->choleskyfactor = NULL;
98: Bseq->ops->lufactorsymbolic = NULL;
99: Bseq->ops->choleskyfactorsymbolic = NULL;
100: Bseq->ops->getinertia = NULL;
101: PetscFunctionReturn(PETSC_SUCCESS);
102: }
104: /* same as MatCreateSubMatrices_SeqBAIJ(), except cast Mat_SeqSBAIJ */
105: static PetscErrorCode MatCreateSubMatrix_SeqSBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B, PetscBool sym)
106: {
107: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *c = NULL;
108: Mat_SeqBAIJ *d = NULL;
109: PetscInt *smap, i, k, kstart, kend, oldcols = a->nbs, *lens;
110: PetscInt row, mat_i, *mat_j, tcol, *mat_ilen;
111: const PetscInt *irow, *icol;
112: PetscInt nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2;
113: PetscInt *aj = a->j, *ai = a->i;
114: MatScalar *mat_a;
115: Mat C;
116: PetscBool flag;
118: PetscFunctionBegin;
119: PetscCall(ISGetIndices(isrow, &irow));
120: PetscCall(ISGetIndices(iscol, &icol));
121: PetscCall(ISGetLocalSize(isrow, &nrows));
122: PetscCall(ISGetLocalSize(iscol, &ncols));
124: PetscCall(PetscCalloc1(1 + oldcols, &smap));
125: ssmap = smap;
126: PetscCall(PetscMalloc1(1 + nrows, &lens));
127: for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1;
128: /* determine lens of each row */
129: for (i = 0; i < nrows; i++) {
130: kstart = ai[irow[i]];
131: kend = kstart + a->ilen[irow[i]];
132: lens[i] = 0;
133: for (k = kstart; k < kend; k++) {
134: if (ssmap[aj[k]]) lens[i]++;
135: }
136: }
137: /* Create and fill new matrix */
138: if (scall == MAT_REUSE_MATRIX) {
139: if (sym) {
140: c = (Mat_SeqSBAIJ *)((*B)->data);
142: PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size");
143: PetscCall(PetscArraycmp(c->ilen, lens, c->mbs, &flag));
144: PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong number of nonzeros");
145: PetscCall(PetscArrayzero(c->ilen, c->mbs));
146: } else {
147: d = (Mat_SeqBAIJ *)((*B)->data);
149: PetscCheck(d->mbs == nrows && d->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size");
150: PetscCall(PetscArraycmp(d->ilen, lens, d->mbs, &flag));
151: PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong number of nonzeros");
152: PetscCall(PetscArrayzero(d->ilen, d->mbs));
153: }
154: C = *B;
155: } else {
156: PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &C));
157: PetscCall(MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE));
158: if (sym) {
159: PetscCall(MatSetType(C, ((PetscObject)A)->type_name));
160: PetscCall(MatSeqSBAIJSetPreallocation(C, bs, 0, lens));
161: } else {
162: PetscCall(MatSetType(C, MATSEQBAIJ));
163: PetscCall(MatSeqBAIJSetPreallocation(C, bs, 0, lens));
164: }
165: }
166: if (sym) c = (Mat_SeqSBAIJ *)C->data;
167: else d = (Mat_SeqBAIJ *)C->data;
168: for (i = 0; i < nrows; i++) {
169: row = irow[i];
170: kstart = ai[row];
171: kend = kstart + a->ilen[row];
172: if (sym) {
173: mat_i = c->i[i];
174: mat_j = PetscSafePointerPlusOffset(c->j, mat_i);
175: mat_a = PetscSafePointerPlusOffset(c->a, mat_i * bs2);
176: mat_ilen = c->ilen + i;
177: } else {
178: mat_i = d->i[i];
179: mat_j = PetscSafePointerPlusOffset(d->j, mat_i);
180: mat_a = PetscSafePointerPlusOffset(d->a, mat_i * bs2);
181: mat_ilen = d->ilen + i;
182: }
183: for (k = kstart; k < kend; k++) {
184: if ((tcol = ssmap[a->j[k]])) {
185: *mat_j++ = tcol - 1;
186: PetscCall(PetscArraycpy(mat_a, a->a + k * bs2, bs2));
187: mat_a += bs2;
188: (*mat_ilen)++;
189: }
190: }
191: }
192: /* sort */
193: {
194: MatScalar *work;
196: PetscCall(PetscMalloc1(bs2, &work));
197: for (i = 0; i < nrows; i++) {
198: PetscInt ilen;
199: if (sym) {
200: mat_i = c->i[i];
201: mat_j = PetscSafePointerPlusOffset(c->j, mat_i);
202: mat_a = PetscSafePointerPlusOffset(c->a, mat_i * bs2);
203: ilen = c->ilen[i];
204: } else {
205: mat_i = d->i[i];
206: mat_j = PetscSafePointerPlusOffset(d->j, mat_i);
207: mat_a = PetscSafePointerPlusOffset(d->a, mat_i * bs2);
208: ilen = d->ilen[i];
209: }
210: PetscCall(PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work));
211: }
212: PetscCall(PetscFree(work));
213: }
215: /* Free work space */
216: PetscCall(ISRestoreIndices(iscol, &icol));
217: PetscCall(PetscFree(smap));
218: PetscCall(PetscFree(lens));
219: PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
220: PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
222: PetscCall(ISRestoreIndices(isrow, &irow));
223: *B = C;
224: PetscFunctionReturn(PETSC_SUCCESS);
225: }
227: PetscErrorCode MatCreateSubMatrix_SeqSBAIJ(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
228: {
229: Mat C[2];
230: IS is1, is2, intersect = NULL;
231: PetscInt n1, n2, ni;
232: PetscBool sym = PETSC_TRUE;
234: PetscFunctionBegin;
235: PetscCall(ISCompressIndicesGeneral(A->rmap->N, A->rmap->n, A->rmap->bs, 1, &isrow, &is1));
236: if (isrow == iscol) {
237: is2 = is1;
238: PetscCall(PetscObjectReference((PetscObject)is2));
239: } else {
240: PetscCall(ISCompressIndicesGeneral(A->cmap->N, A->cmap->n, A->cmap->bs, 1, &iscol, &is2));
241: PetscCall(ISIntersect(is1, is2, &intersect));
242: PetscCall(ISGetLocalSize(intersect, &ni));
243: PetscCall(ISDestroy(&intersect));
244: if (ni == 0) sym = PETSC_FALSE;
245: else if (PetscDefined(USE_DEBUG)) {
246: PetscCall(ISGetLocalSize(is1, &n1));
247: PetscCall(ISGetLocalSize(is2, &n2));
248: PetscCheck(ni == n1 && ni == n2, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot create such a submatrix");
249: }
250: }
251: if (sym) PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A, is1, is2, scall, B, sym));
252: else {
253: PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A, is1, is2, MAT_INITIAL_MATRIX, C, sym));
254: PetscCall(MatCreateSubMatrix_SeqSBAIJ_Private(A, is2, is1, MAT_INITIAL_MATRIX, C + 1, sym));
255: PetscCall(MatTranspose(C[1], MAT_INPLACE_MATRIX, C + 1));
256: PetscCall(MatAXPY(C[0], 1.0, C[1], DIFFERENT_NONZERO_PATTERN));
257: PetscCheck(scall != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
258: if (scall == MAT_REUSE_MATRIX) PetscCall(MatCopy(C[0], *B, SAME_NONZERO_PATTERN));
259: else if (A->rmap->bs == 1) PetscCall(MatConvert(C[0], MATAIJ, MAT_INITIAL_MATRIX, B));
260: else PetscCall(MatCopy(C[0], *B, SAME_NONZERO_PATTERN));
261: PetscCall(MatDestroy(C));
262: PetscCall(MatDestroy(C + 1));
263: }
264: PetscCall(ISDestroy(&is1));
265: PetscCall(ISDestroy(&is2));
267: if (sym && isrow != iscol) {
268: PetscBool isequal;
269: PetscCall(ISEqual(isrow, iscol, &isequal));
270: if (!isequal) PetscCall(MatSeqSBAIJZeroOps_Private(*B));
271: }
272: PetscFunctionReturn(PETSC_SUCCESS);
273: }
275: PetscErrorCode MatCreateSubMatrices_SeqSBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[])
276: {
277: PetscInt i;
279: PetscFunctionBegin;
280: if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(n, B));
282: for (i = 0; i < n; i++) PetscCall(MatCreateSubMatrix_SeqSBAIJ(A, irow[i], icol[i], scall, &(*B)[i]));
283: PetscFunctionReturn(PETSC_SUCCESS);
284: }
286: /* Should check that shapes of vectors and matrices match */
287: PetscErrorCode MatMult_SeqSBAIJ_2(Mat A, Vec xx, Vec zz)
288: {
289: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
290: PetscScalar *z, x1, x2, zero = 0.0;
291: const PetscScalar *x, *xb;
292: const MatScalar *v;
293: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
294: const PetscInt *aj = a->j, *ai = a->i, *ib;
295: PetscInt nonzerorow = 0;
297: PetscFunctionBegin;
298: PetscCall(VecSet(zz, zero));
299: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
300: PetscCall(VecGetArrayRead(xx, &x));
301: PetscCall(VecGetArray(zz, &z));
303: v = a->a;
304: xb = x;
306: for (i = 0; i < mbs; i++) {
307: n = ai[1] - ai[0]; /* length of i_th block row of A */
308: x1 = xb[0];
309: x2 = xb[1];
310: ib = aj + *ai;
311: jmin = 0;
312: nonzerorow += (n > 0);
313: if (*ib == i) { /* (diag of A)*x */
314: z[2 * i] += v[0] * x1 + v[2] * x2;
315: z[2 * i + 1] += v[2] * x1 + v[3] * x2;
316: v += 4;
317: jmin++;
318: }
319: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
320: PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
321: for (j = jmin; j < n; j++) {
322: /* (strict lower triangular part of A)*x */
323: cval = ib[j] * 2;
324: z[cval] += v[0] * x1 + v[1] * x2;
325: z[cval + 1] += v[2] * x1 + v[3] * x2;
326: /* (strict upper triangular part of A)*x */
327: z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1];
328: z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1];
329: v += 4;
330: }
331: xb += 2;
332: ai++;
333: }
335: PetscCall(VecRestoreArrayRead(xx, &x));
336: PetscCall(VecRestoreArray(zz, &z));
337: PetscCall(PetscLogFlops(8.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
338: PetscFunctionReturn(PETSC_SUCCESS);
339: }
341: PetscErrorCode MatMult_SeqSBAIJ_3(Mat A, Vec xx, Vec zz)
342: {
343: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
344: PetscScalar *z, x1, x2, x3, zero = 0.0;
345: const PetscScalar *x, *xb;
346: const MatScalar *v;
347: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
348: const PetscInt *aj = a->j, *ai = a->i, *ib;
349: PetscInt nonzerorow = 0;
351: PetscFunctionBegin;
352: PetscCall(VecSet(zz, zero));
353: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
354: PetscCall(VecGetArrayRead(xx, &x));
355: PetscCall(VecGetArray(zz, &z));
357: v = a->a;
358: xb = x;
360: for (i = 0; i < mbs; i++) {
361: n = ai[1] - ai[0]; /* length of i_th block row of A */
362: x1 = xb[0];
363: x2 = xb[1];
364: x3 = xb[2];
365: ib = aj + *ai;
366: jmin = 0;
367: nonzerorow += (n > 0);
368: if (*ib == i) { /* (diag of A)*x */
369: z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3;
370: z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3;
371: z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
372: v += 9;
373: jmin++;
374: }
375: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
376: PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
377: for (j = jmin; j < n; j++) {
378: /* (strict lower triangular part of A)*x */
379: cval = ib[j] * 3;
380: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3;
381: z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3;
382: z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
383: /* (strict upper triangular part of A)*x */
384: z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2];
385: z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2];
386: z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2];
387: v += 9;
388: }
389: xb += 3;
390: ai++;
391: }
393: PetscCall(VecRestoreArrayRead(xx, &x));
394: PetscCall(VecRestoreArray(zz, &z));
395: PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
396: PetscFunctionReturn(PETSC_SUCCESS);
397: }
399: PetscErrorCode MatMult_SeqSBAIJ_4(Mat A, Vec xx, Vec zz)
400: {
401: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
402: PetscScalar *z, x1, x2, x3, x4, zero = 0.0;
403: const PetscScalar *x, *xb;
404: const MatScalar *v;
405: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
406: const PetscInt *aj = a->j, *ai = a->i, *ib;
407: PetscInt nonzerorow = 0;
409: PetscFunctionBegin;
410: PetscCall(VecSet(zz, zero));
411: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
412: PetscCall(VecGetArrayRead(xx, &x));
413: PetscCall(VecGetArray(zz, &z));
415: v = a->a;
416: xb = x;
418: for (i = 0; i < mbs; i++) {
419: n = ai[1] - ai[0]; /* length of i_th block row of A */
420: x1 = xb[0];
421: x2 = xb[1];
422: x3 = xb[2];
423: x4 = xb[3];
424: ib = aj + *ai;
425: jmin = 0;
426: nonzerorow += (n > 0);
427: if (*ib == i) { /* (diag of A)*x */
428: z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
429: z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
430: z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4;
431: z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
432: v += 16;
433: jmin++;
434: }
435: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
436: PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
437: for (j = jmin; j < n; j++) {
438: /* (strict lower triangular part of A)*x */
439: cval = ib[j] * 4;
440: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
441: z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
442: z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
443: z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
444: /* (strict upper triangular part of A)*x */
445: z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3];
446: z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3];
447: z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3];
448: z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3];
449: v += 16;
450: }
451: xb += 4;
452: ai++;
453: }
455: PetscCall(VecRestoreArrayRead(xx, &x));
456: PetscCall(VecRestoreArray(zz, &z));
457: PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
458: PetscFunctionReturn(PETSC_SUCCESS);
459: }
461: PetscErrorCode MatMult_SeqSBAIJ_5(Mat A, Vec xx, Vec zz)
462: {
463: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
464: PetscScalar *z, x1, x2, x3, x4, x5, zero = 0.0;
465: const PetscScalar *x, *xb;
466: const MatScalar *v;
467: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
468: const PetscInt *aj = a->j, *ai = a->i, *ib;
469: PetscInt nonzerorow = 0;
471: PetscFunctionBegin;
472: PetscCall(VecSet(zz, zero));
473: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
474: PetscCall(VecGetArrayRead(xx, &x));
475: PetscCall(VecGetArray(zz, &z));
477: v = a->a;
478: xb = x;
480: for (i = 0; i < mbs; i++) {
481: n = ai[1] - ai[0]; /* length of i_th block row of A */
482: x1 = xb[0];
483: x2 = xb[1];
484: x3 = xb[2];
485: x4 = xb[3];
486: x5 = xb[4];
487: ib = aj + *ai;
488: jmin = 0;
489: nonzerorow += (n > 0);
490: if (*ib == i) { /* (diag of A)*x */
491: z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
492: z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
493: z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
494: z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5;
495: z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
496: v += 25;
497: jmin++;
498: }
499: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
500: PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
501: for (j = jmin; j < n; j++) {
502: /* (strict lower triangular part of A)*x */
503: cval = ib[j] * 5;
504: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
505: z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
506: z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
507: z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
508: z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
509: /* (strict upper triangular part of A)*x */
510: z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4];
511: z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4];
512: z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4];
513: z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4];
514: z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4];
515: v += 25;
516: }
517: xb += 5;
518: ai++;
519: }
521: PetscCall(VecRestoreArrayRead(xx, &x));
522: PetscCall(VecRestoreArray(zz, &z));
523: PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
524: PetscFunctionReturn(PETSC_SUCCESS);
525: }
527: PetscErrorCode MatMult_SeqSBAIJ_6(Mat A, Vec xx, Vec zz)
528: {
529: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
530: PetscScalar *z, x1, x2, x3, x4, x5, x6, zero = 0.0;
531: const PetscScalar *x, *xb;
532: const MatScalar *v;
533: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
534: const PetscInt *aj = a->j, *ai = a->i, *ib;
535: PetscInt nonzerorow = 0;
537: PetscFunctionBegin;
538: PetscCall(VecSet(zz, zero));
539: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
540: PetscCall(VecGetArrayRead(xx, &x));
541: PetscCall(VecGetArray(zz, &z));
543: v = a->a;
544: xb = x;
546: for (i = 0; i < mbs; i++) {
547: n = ai[1] - ai[0]; /* length of i_th block row of A */
548: x1 = xb[0];
549: x2 = xb[1];
550: x3 = xb[2];
551: x4 = xb[3];
552: x5 = xb[4];
553: x6 = xb[5];
554: ib = aj + *ai;
555: jmin = 0;
556: nonzerorow += (n > 0);
557: if (*ib == i) { /* (diag of A)*x */
558: z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
559: z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
560: z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
561: z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
562: z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6;
563: z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
564: v += 36;
565: jmin++;
566: }
567: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
568: PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
569: for (j = jmin; j < n; j++) {
570: /* (strict lower triangular part of A)*x */
571: cval = ib[j] * 6;
572: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6;
573: z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6;
574: z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6;
575: z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6;
576: z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6;
577: z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
578: /* (strict upper triangular part of A)*x */
579: z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5];
580: z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5];
581: z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5];
582: z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5];
583: z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5];
584: z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5];
585: v += 36;
586: }
587: xb += 6;
588: ai++;
589: }
591: PetscCall(VecRestoreArrayRead(xx, &x));
592: PetscCall(VecRestoreArray(zz, &z));
593: PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
594: PetscFunctionReturn(PETSC_SUCCESS);
595: }
597: PetscErrorCode MatMult_SeqSBAIJ_7(Mat A, Vec xx, Vec zz)
598: {
599: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
600: PetscScalar *z, x1, x2, x3, x4, x5, x6, x7, zero = 0.0;
601: const PetscScalar *x, *xb;
602: const MatScalar *v;
603: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
604: const PetscInt *aj = a->j, *ai = a->i, *ib;
605: PetscInt nonzerorow = 0;
607: PetscFunctionBegin;
608: PetscCall(VecSet(zz, zero));
609: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
610: PetscCall(VecGetArrayRead(xx, &x));
611: PetscCall(VecGetArray(zz, &z));
613: v = a->a;
614: xb = x;
616: for (i = 0; i < mbs; i++) {
617: n = ai[1] - ai[0]; /* length of i_th block row of A */
618: x1 = xb[0];
619: x2 = xb[1];
620: x3 = xb[2];
621: x4 = xb[3];
622: x5 = xb[4];
623: x6 = xb[5];
624: x7 = xb[6];
625: ib = aj + *ai;
626: jmin = 0;
627: nonzerorow += (n > 0);
628: if (*ib == i) { /* (diag of A)*x */
629: z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
630: z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
631: z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
632: z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
633: z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
634: z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7;
635: z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
636: v += 49;
637: jmin++;
638: }
639: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
640: PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
641: for (j = jmin; j < n; j++) {
642: /* (strict lower triangular part of A)*x */
643: cval = ib[j] * 7;
644: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7;
645: z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7;
646: z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7;
647: z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7;
648: z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7;
649: z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7;
650: z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
651: /* (strict upper triangular part of A)*x */
652: z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6];
653: z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6];
654: z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6];
655: z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6];
656: z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6];
657: z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6];
658: z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6];
659: v += 49;
660: }
661: xb += 7;
662: ai++;
663: }
664: PetscCall(VecRestoreArrayRead(xx, &x));
665: PetscCall(VecRestoreArray(zz, &z));
666: PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow));
667: PetscFunctionReturn(PETSC_SUCCESS);
668: }
670: /*
671: This will not work with MatScalar == float because it calls the BLAS
672: */
673: PetscErrorCode MatMult_SeqSBAIJ_N(Mat A, Vec xx, Vec zz)
674: {
675: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
676: PetscScalar *z, *z_ptr, *zb, *work, *workt, zero = 0.0;
677: const PetscScalar *x, *x_ptr, *xb;
678: const MatScalar *v;
679: PetscInt mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k;
680: const PetscInt *idx, *aj, *ii;
681: PetscInt nonzerorow = 0;
683: PetscFunctionBegin;
684: PetscCall(VecSet(zz, zero));
685: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
686: PetscCall(VecGetArrayRead(xx, &x));
687: PetscCall(VecGetArray(zz, &z));
689: x_ptr = x;
690: z_ptr = z;
692: aj = a->j;
693: v = a->a;
694: ii = a->i;
696: if (!a->mult_work) PetscCall(PetscMalloc1(A->rmap->N + 1, &a->mult_work));
697: work = a->mult_work;
699: for (i = 0; i < mbs; i++) {
700: n = ii[1] - ii[0];
701: ncols = n * bs;
702: workt = work;
703: idx = aj + ii[0];
704: nonzerorow += (n > 0);
706: /* upper triangular part */
707: for (j = 0; j < n; j++) {
708: xb = x_ptr + bs * (*idx++);
709: for (k = 0; k < bs; k++) workt[k] = xb[k];
710: workt += bs;
711: }
712: /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
713: PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z);
715: /* strict lower triangular part */
716: idx = aj + ii[0];
717: if (n && *idx == i) {
718: ncols -= bs;
719: v += bs2;
720: idx++;
721: n--;
722: }
724: if (ncols > 0) {
725: workt = work;
726: PetscCall(PetscArrayzero(workt, ncols));
727: PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt);
728: for (j = 0; j < n; j++) {
729: zb = z_ptr + bs * (*idx++);
730: for (k = 0; k < bs; k++) zb[k] += workt[k];
731: workt += bs;
732: }
733: }
734: x += bs;
735: v += n * bs2;
736: z += bs;
737: ii++;
738: }
740: PetscCall(VecRestoreArrayRead(xx, &x));
741: PetscCall(VecRestoreArray(zz, &z));
742: PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow) * bs2 - nonzerorow));
743: PetscFunctionReturn(PETSC_SUCCESS);
744: }
746: PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz)
747: {
748: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
749: PetscScalar *z, x1;
750: const PetscScalar *x, *xb;
751: const MatScalar *v;
752: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
753: const PetscInt *aj = a->j, *ai = a->i, *ib;
754: PetscInt nonzerorow = 0;
755: #if defined(PETSC_USE_COMPLEX)
756: const int aconj = A->hermitian == PETSC_BOOL3_TRUE;
757: #else
758: const int aconj = 0;
759: #endif
761: PetscFunctionBegin;
762: PetscCall(VecCopy(yy, zz));
763: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
764: PetscCall(VecGetArrayRead(xx, &x));
765: PetscCall(VecGetArray(zz, &z));
766: v = a->a;
767: xb = x;
769: for (i = 0; i < mbs; i++) {
770: n = ai[1] - ai[0]; /* length of i_th row of A */
771: x1 = xb[0];
772: ib = aj + *ai;
773: jmin = 0;
774: nonzerorow += (n > 0);
775: if (n && *ib == i) { /* (diag of A)*x */
776: z[i] += *v++ * x[*ib++];
777: jmin++;
778: }
779: if (aconj) {
780: for (j = jmin; j < n; j++) {
781: cval = *ib;
782: z[cval] += PetscConj(*v) * x1; /* (strict lower triangular part of A)*x */
783: z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */
784: }
785: } else {
786: for (j = jmin; j < n; j++) {
787: cval = *ib;
788: z[cval] += *v * x1; /* (strict lower triangular part of A)*x */
789: z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */
790: }
791: }
792: xb++;
793: ai++;
794: }
796: PetscCall(VecRestoreArrayRead(xx, &x));
797: PetscCall(VecRestoreArray(zz, &z));
799: PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow)));
800: PetscFunctionReturn(PETSC_SUCCESS);
801: }
803: PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz)
804: {
805: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
806: PetscScalar *z, x1, x2;
807: const PetscScalar *x, *xb;
808: const MatScalar *v;
809: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
810: const PetscInt *aj = a->j, *ai = a->i, *ib;
811: PetscInt nonzerorow = 0;
813: PetscFunctionBegin;
814: PetscCall(VecCopy(yy, zz));
815: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
816: PetscCall(VecGetArrayRead(xx, &x));
817: PetscCall(VecGetArray(zz, &z));
819: v = a->a;
820: xb = x;
822: for (i = 0; i < mbs; i++) {
823: n = ai[1] - ai[0]; /* length of i_th block row of A */
824: x1 = xb[0];
825: x2 = xb[1];
826: ib = aj + *ai;
827: jmin = 0;
828: nonzerorow += (n > 0);
829: if (n && *ib == i) { /* (diag of A)*x */
830: z[2 * i] += v[0] * x1 + v[2] * x2;
831: z[2 * i + 1] += v[2] * x1 + v[3] * x2;
832: v += 4;
833: jmin++;
834: }
835: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
836: PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
837: for (j = jmin; j < n; j++) {
838: /* (strict lower triangular part of A)*x */
839: cval = ib[j] * 2;
840: z[cval] += v[0] * x1 + v[1] * x2;
841: z[cval + 1] += v[2] * x1 + v[3] * x2;
842: /* (strict upper triangular part of A)*x */
843: z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1];
844: z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1];
845: v += 4;
846: }
847: xb += 2;
848: ai++;
849: }
850: PetscCall(VecRestoreArrayRead(xx, &x));
851: PetscCall(VecRestoreArray(zz, &z));
853: PetscCall(PetscLogFlops(4.0 * (a->nz * 2.0 - nonzerorow)));
854: PetscFunctionReturn(PETSC_SUCCESS);
855: }
857: PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz)
858: {
859: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
860: PetscScalar *z, x1, x2, x3;
861: const PetscScalar *x, *xb;
862: const MatScalar *v;
863: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
864: const PetscInt *aj = a->j, *ai = a->i, *ib;
865: PetscInt nonzerorow = 0;
867: PetscFunctionBegin;
868: PetscCall(VecCopy(yy, zz));
869: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
870: PetscCall(VecGetArrayRead(xx, &x));
871: PetscCall(VecGetArray(zz, &z));
873: v = a->a;
874: xb = x;
876: for (i = 0; i < mbs; i++) {
877: n = ai[1] - ai[0]; /* length of i_th block row of A */
878: x1 = xb[0];
879: x2 = xb[1];
880: x3 = xb[2];
881: ib = aj + *ai;
882: jmin = 0;
883: nonzerorow += (n > 0);
884: if (n && *ib == i) { /* (diag of A)*x */
885: z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3;
886: z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3;
887: z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
888: v += 9;
889: jmin++;
890: }
891: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
892: PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
893: for (j = jmin; j < n; j++) {
894: /* (strict lower triangular part of A)*x */
895: cval = ib[j] * 3;
896: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3;
897: z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3;
898: z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
899: /* (strict upper triangular part of A)*x */
900: z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2];
901: z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2];
902: z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2];
903: v += 9;
904: }
905: xb += 3;
906: ai++;
907: }
909: PetscCall(VecRestoreArrayRead(xx, &x));
910: PetscCall(VecRestoreArray(zz, &z));
912: PetscCall(PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow)));
913: PetscFunctionReturn(PETSC_SUCCESS);
914: }
916: PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz)
917: {
918: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
919: PetscScalar *z, x1, x2, x3, x4;
920: const PetscScalar *x, *xb;
921: const MatScalar *v;
922: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
923: const PetscInt *aj = a->j, *ai = a->i, *ib;
924: PetscInt nonzerorow = 0;
926: PetscFunctionBegin;
927: PetscCall(VecCopy(yy, zz));
928: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
929: PetscCall(VecGetArrayRead(xx, &x));
930: PetscCall(VecGetArray(zz, &z));
932: v = a->a;
933: xb = x;
935: for (i = 0; i < mbs; i++) {
936: n = ai[1] - ai[0]; /* length of i_th block row of A */
937: x1 = xb[0];
938: x2 = xb[1];
939: x3 = xb[2];
940: x4 = xb[3];
941: ib = aj + *ai;
942: jmin = 0;
943: nonzerorow += (n > 0);
944: if (n && *ib == i) { /* (diag of A)*x */
945: z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
946: z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
947: z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4;
948: z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
949: v += 16;
950: jmin++;
951: }
952: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
953: PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
954: for (j = jmin; j < n; j++) {
955: /* (strict lower triangular part of A)*x */
956: cval = ib[j] * 4;
957: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
958: z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
959: z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
960: z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
961: /* (strict upper triangular part of A)*x */
962: z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3];
963: z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3];
964: z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3];
965: z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3];
966: v += 16;
967: }
968: xb += 4;
969: ai++;
970: }
972: PetscCall(VecRestoreArrayRead(xx, &x));
973: PetscCall(VecRestoreArray(zz, &z));
975: PetscCall(PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow)));
976: PetscFunctionReturn(PETSC_SUCCESS);
977: }
979: PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz)
980: {
981: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
982: PetscScalar *z, x1, x2, x3, x4, x5;
983: const PetscScalar *x, *xb;
984: const MatScalar *v;
985: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
986: const PetscInt *aj = a->j, *ai = a->i, *ib;
987: PetscInt nonzerorow = 0;
989: PetscFunctionBegin;
990: PetscCall(VecCopy(yy, zz));
991: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
992: PetscCall(VecGetArrayRead(xx, &x));
993: PetscCall(VecGetArray(zz, &z));
995: v = a->a;
996: xb = x;
998: for (i = 0; i < mbs; i++) {
999: n = ai[1] - ai[0]; /* length of i_th block row of A */
1000: x1 = xb[0];
1001: x2 = xb[1];
1002: x3 = xb[2];
1003: x4 = xb[3];
1004: x5 = xb[4];
1005: ib = aj + *ai;
1006: jmin = 0;
1007: nonzerorow += (n > 0);
1008: if (n && *ib == i) { /* (diag of A)*x */
1009: z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
1010: z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
1011: z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
1012: z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5;
1013: z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
1014: v += 25;
1015: jmin++;
1016: }
1017: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1018: PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1019: for (j = jmin; j < n; j++) {
1020: /* (strict lower triangular part of A)*x */
1021: cval = ib[j] * 5;
1022: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
1023: z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
1024: z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
1025: z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
1026: z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
1027: /* (strict upper triangular part of A)*x */
1028: z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4];
1029: z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4];
1030: z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4];
1031: z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4];
1032: z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4];
1033: v += 25;
1034: }
1035: xb += 5;
1036: ai++;
1037: }
1039: PetscCall(VecRestoreArrayRead(xx, &x));
1040: PetscCall(VecRestoreArray(zz, &z));
1042: PetscCall(PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow)));
1043: PetscFunctionReturn(PETSC_SUCCESS);
1044: }
1046: PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz)
1047: {
1048: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1049: PetscScalar *z, x1, x2, x3, x4, x5, x6;
1050: const PetscScalar *x, *xb;
1051: const MatScalar *v;
1052: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
1053: const PetscInt *aj = a->j, *ai = a->i, *ib;
1054: PetscInt nonzerorow = 0;
1056: PetscFunctionBegin;
1057: PetscCall(VecCopy(yy, zz));
1058: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
1059: PetscCall(VecGetArrayRead(xx, &x));
1060: PetscCall(VecGetArray(zz, &z));
1062: v = a->a;
1063: xb = x;
1065: for (i = 0; i < mbs; i++) {
1066: n = ai[1] - ai[0]; /* length of i_th block row of A */
1067: x1 = xb[0];
1068: x2 = xb[1];
1069: x3 = xb[2];
1070: x4 = xb[3];
1071: x5 = xb[4];
1072: x6 = xb[5];
1073: ib = aj + *ai;
1074: jmin = 0;
1075: nonzerorow += (n > 0);
1076: if (n && *ib == i) { /* (diag of A)*x */
1077: z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
1078: z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
1079: z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
1080: z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
1081: z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6;
1082: z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
1083: v += 36;
1084: jmin++;
1085: }
1086: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1087: PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1088: for (j = jmin; j < n; j++) {
1089: /* (strict lower triangular part of A)*x */
1090: cval = ib[j] * 6;
1091: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6;
1092: z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6;
1093: z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6;
1094: z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6;
1095: z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6;
1096: z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
1097: /* (strict upper triangular part of A)*x */
1098: z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5];
1099: z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5];
1100: z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5];
1101: z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5];
1102: z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5];
1103: z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5];
1104: v += 36;
1105: }
1106: xb += 6;
1107: ai++;
1108: }
1110: PetscCall(VecRestoreArrayRead(xx, &x));
1111: PetscCall(VecRestoreArray(zz, &z));
1113: PetscCall(PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow)));
1114: PetscFunctionReturn(PETSC_SUCCESS);
1115: }
1117: PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz)
1118: {
1119: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1120: PetscScalar *z, x1, x2, x3, x4, x5, x6, x7;
1121: const PetscScalar *x, *xb;
1122: const MatScalar *v;
1123: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
1124: const PetscInt *aj = a->j, *ai = a->i, *ib;
1125: PetscInt nonzerorow = 0;
1127: PetscFunctionBegin;
1128: PetscCall(VecCopy(yy, zz));
1129: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
1130: PetscCall(VecGetArrayRead(xx, &x));
1131: PetscCall(VecGetArray(zz, &z));
1133: v = a->a;
1134: xb = x;
1136: for (i = 0; i < mbs; i++) {
1137: n = ai[1] - ai[0]; /* length of i_th block row of A */
1138: x1 = xb[0];
1139: x2 = xb[1];
1140: x3 = xb[2];
1141: x4 = xb[3];
1142: x5 = xb[4];
1143: x6 = xb[5];
1144: x7 = xb[6];
1145: ib = aj + *ai;
1146: jmin = 0;
1147: nonzerorow += (n > 0);
1148: if (n && *ib == i) { /* (diag of A)*x */
1149: z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
1150: z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
1151: z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
1152: z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
1153: z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
1154: z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7;
1155: z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
1156: v += 49;
1157: jmin++;
1158: }
1159: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1160: PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1161: for (j = jmin; j < n; j++) {
1162: /* (strict lower triangular part of A)*x */
1163: cval = ib[j] * 7;
1164: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7;
1165: z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7;
1166: z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7;
1167: z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7;
1168: z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7;
1169: z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7;
1170: z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
1171: /* (strict upper triangular part of A)*x */
1172: z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6];
1173: z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6];
1174: z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6];
1175: z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6];
1176: z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6];
1177: z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6];
1178: z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6];
1179: v += 49;
1180: }
1181: xb += 7;
1182: ai++;
1183: }
1185: PetscCall(VecRestoreArrayRead(xx, &x));
1186: PetscCall(VecRestoreArray(zz, &z));
1188: PetscCall(PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow)));
1189: PetscFunctionReturn(PETSC_SUCCESS);
1190: }
1192: PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz)
1193: {
1194: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1195: PetscScalar *z, *z_ptr = NULL, *zb, *work, *workt;
1196: const PetscScalar *x, *x_ptr, *xb;
1197: const MatScalar *v;
1198: PetscInt mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k;
1199: const PetscInt *idx, *aj, *ii;
1200: PetscInt nonzerorow = 0;
1202: PetscFunctionBegin;
1203: PetscCall(VecCopy(yy, zz));
1204: if (!a->nz) PetscFunctionReturn(PETSC_SUCCESS);
1205: PetscCall(VecGetArrayRead(xx, &x));
1206: x_ptr = x;
1207: PetscCall(VecGetArray(zz, &z));
1208: z_ptr = z;
1210: aj = a->j;
1211: v = a->a;
1212: ii = a->i;
1214: if (!a->mult_work) PetscCall(PetscMalloc1(A->rmap->n + 1, &a->mult_work));
1215: work = a->mult_work;
1217: for (i = 0; i < mbs; i++) {
1218: n = ii[1] - ii[0];
1219: ncols = n * bs;
1220: workt = work;
1221: idx = aj + ii[0];
1222: nonzerorow += (n > 0);
1224: /* upper triangular part */
1225: for (j = 0; j < n; j++) {
1226: xb = x_ptr + bs * (*idx++);
1227: for (k = 0; k < bs; k++) workt[k] = xb[k];
1228: workt += bs;
1229: }
1230: /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1231: PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z);
1233: /* strict lower triangular part */
1234: idx = aj + ii[0];
1235: if (n && *idx == i) {
1236: ncols -= bs;
1237: v += bs2;
1238: idx++;
1239: n--;
1240: }
1241: if (ncols > 0) {
1242: workt = work;
1243: PetscCall(PetscArrayzero(workt, ncols));
1244: PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt);
1245: for (j = 0; j < n; j++) {
1246: zb = z_ptr + bs * (*idx++);
1247: for (k = 0; k < bs; k++) zb[k] += workt[k];
1248: workt += bs;
1249: }
1250: }
1252: x += bs;
1253: v += n * bs2;
1254: z += bs;
1255: ii++;
1256: }
1258: PetscCall(VecRestoreArrayRead(xx, &x));
1259: PetscCall(VecRestoreArray(zz, &z));
1261: PetscCall(PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow)));
1262: PetscFunctionReturn(PETSC_SUCCESS);
1263: }
1265: PetscErrorCode MatScale_SeqSBAIJ(Mat inA, PetscScalar alpha)
1266: {
1267: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)inA->data;
1268: PetscScalar oalpha = alpha;
1269: PetscBLASInt one = 1, totalnz;
1271: PetscFunctionBegin;
1272: PetscCall(PetscBLASIntCast(a->bs2 * a->nz, &totalnz));
1273: PetscCallBLAS("BLASscal", BLASscal_(&totalnz, &oalpha, a->a, &one));
1274: PetscCall(PetscLogFlops(totalnz));
1275: PetscFunctionReturn(PETSC_SUCCESS);
1276: }
1278: PetscErrorCode MatNorm_SeqSBAIJ(Mat A, NormType type, PetscReal *norm)
1279: {
1280: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1281: const MatScalar *v = a->a;
1282: PetscReal sum_diag = 0.0, sum_off = 0.0, *sum;
1283: PetscInt i, j, k, bs = A->rmap->bs, bs2 = a->bs2, k1, mbs = a->mbs, jmin, jmax, nexti, ik, *jl, *il;
1284: const PetscInt *aj = a->j, *col;
1286: PetscFunctionBegin;
1287: if (!a->nz) {
1288: *norm = 0.0;
1289: PetscFunctionReturn(PETSC_SUCCESS);
1290: }
1291: if (type == NORM_FROBENIUS) {
1292: for (k = 0; k < mbs; k++) {
1293: jmin = a->i[k];
1294: jmax = a->i[k + 1];
1295: col = aj + jmin;
1296: if (jmax - jmin > 0 && *col == k) { /* diagonal block */
1297: for (i = 0; i < bs2; i++) {
1298: sum_diag += PetscRealPart(PetscConj(*v) * (*v));
1299: v++;
1300: }
1301: jmin++;
1302: }
1303: for (j = jmin; j < jmax; j++) { /* off-diagonal blocks */
1304: for (i = 0; i < bs2; i++) {
1305: sum_off += PetscRealPart(PetscConj(*v) * (*v));
1306: v++;
1307: }
1308: }
1309: }
1310: *norm = PetscSqrtReal(sum_diag + 2 * sum_off);
1311: PetscCall(PetscLogFlops(2.0 * bs2 * a->nz));
1312: } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */
1313: PetscCall(PetscMalloc3(bs, &sum, mbs, &il, mbs, &jl));
1314: for (i = 0; i < mbs; i++) jl[i] = mbs;
1315: il[0] = 0;
1317: *norm = 0.0;
1318: for (k = 0; k < mbs; k++) { /* k_th block row */
1319: for (j = 0; j < bs; j++) sum[j] = 0.0;
1320: /*-- col sum --*/
1321: i = jl[k]; /* first |A(i,k)| to be added */
1322: /* jl[k]=i: first nonzero element in row i for submatrix A(1:k,k:n) (active window)
1323: at step k */
1324: while (i < mbs) {
1325: nexti = jl[i]; /* next block row to be added */
1326: ik = il[i]; /* block index of A(i,k) in the array a */
1327: for (j = 0; j < bs; j++) {
1328: v = a->a + ik * bs2 + j * bs;
1329: for (k1 = 0; k1 < bs; k1++) {
1330: sum[j] += PetscAbsScalar(*v);
1331: v++;
1332: }
1333: }
1334: /* update il, jl */
1335: jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1336: jmax = a->i[i + 1];
1337: if (jmin < jmax) {
1338: il[i] = jmin;
1339: j = a->j[jmin];
1340: jl[i] = jl[j];
1341: jl[j] = i;
1342: }
1343: i = nexti;
1344: }
1345: /*-- row sum --*/
1346: jmin = a->i[k];
1347: jmax = a->i[k + 1];
1348: for (i = jmin; i < jmax; i++) {
1349: for (j = 0; j < bs; j++) {
1350: v = a->a + i * bs2 + j;
1351: for (k1 = 0; k1 < bs; k1++) {
1352: sum[j] += PetscAbsScalar(*v);
1353: v += bs;
1354: }
1355: }
1356: }
1357: /* add k_th block row to il, jl */
1358: col = aj + jmin;
1359: if (jmax - jmin > 0 && *col == k) jmin++;
1360: if (jmin < jmax) {
1361: il[k] = jmin;
1362: j = a->j[jmin];
1363: jl[k] = jl[j];
1364: jl[j] = k;
1365: }
1366: for (j = 0; j < bs; j++) {
1367: if (sum[j] > *norm) *norm = sum[j];
1368: }
1369: }
1370: PetscCall(PetscFree3(sum, il, jl));
1371: PetscCall(PetscLogFlops(PetscMax(mbs * a->nz - 1, 0)));
1372: } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet");
1373: PetscFunctionReturn(PETSC_SUCCESS);
1374: }
1376: PetscErrorCode MatEqual_SeqSBAIJ(Mat A, Mat B, PetscBool *flg)
1377: {
1378: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *b = (Mat_SeqSBAIJ *)B->data;
1380: PetscFunctionBegin;
1381: /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */
1382: if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs) || (a->nz != b->nz)) {
1383: *flg = PETSC_FALSE;
1384: PetscFunctionReturn(PETSC_SUCCESS);
1385: }
1387: /* if the a->i are the same */
1388: PetscCall(PetscArraycmp(a->i, b->i, a->mbs + 1, flg));
1389: if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);
1391: /* if a->j are the same */
1392: PetscCall(PetscArraycmp(a->j, b->j, a->nz, flg));
1393: if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);
1395: /* if a->a are the same */
1396: PetscCall(PetscArraycmp(a->a, b->a, a->nz * A->rmap->bs * A->rmap->bs, flg));
1397: PetscFunctionReturn(PETSC_SUCCESS);
1398: }
1400: PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A, Vec v)
1401: {
1402: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1403: PetscInt n;
1404: const PetscInt bs = A->rmap->bs, ambs = a->mbs, bs2 = a->bs2;
1405: PetscScalar *x;
1406: const MatScalar *aa = a->a, *aa_j;
1407: const PetscInt *ai = a->i, *adiag;
1408: PetscBool diagDense;
1410: PetscFunctionBegin;
1411: PetscCheck(!A->factortype || bs <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix with bs>1");
1412: PetscCall(MatGetDiagonalMarkers_SeqSBAIJ(A, &adiag, &diagDense));
1413: if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) {
1414: PetscCall(VecGetArrayWrite(v, &x));
1415: for (PetscInt i = 0; i < ambs; i++) x[i] = 1.0 / aa[adiag[i]];
1416: PetscCall(VecRestoreArrayWrite(v, &x));
1417: PetscFunctionReturn(PETSC_SUCCESS);
1418: }
1420: PetscCall(VecGetLocalSize(v, &n));
1421: PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
1422: PetscCall(VecGetArrayWrite(v, &x));
1424: if (diagDense) {
1425: for (PetscInt i = 0, row = 0; i < ambs; i++) {
1426: aa_j = aa + adiag[i] * bs2;
1427: for (PetscInt k = 0; k < bs2; k += (bs + 1)) x[row++] = aa_j[k];
1428: }
1429: } else {
1430: for (PetscInt i = 0, row = 0; i < ambs; i++) {
1431: const PetscInt j = adiag[i];
1433: if (j != ai[i + 1]) {
1434: aa_j = aa + j * bs2;
1435: for (PetscInt k = 0; k < bs2; k += (bs + 1)) x[row++] = aa_j[k];
1436: } else {
1437: for (PetscInt k = 0; k < bs; k++) x[row++] = 0.0;
1438: }
1439: }
1440: }
1441: PetscCall(VecRestoreArrayWrite(v, &x));
1442: PetscFunctionReturn(PETSC_SUCCESS);
1443: }
1445: PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A, Vec ll, Vec rr)
1446: {
1447: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1448: PetscScalar x;
1449: const PetscScalar *l, *li, *ri;
1450: MatScalar *aa, *v;
1451: PetscInt i, j, k, lm, M, m, mbs, tmp, bs, bs2;
1452: const PetscInt *ai, *aj;
1454: PetscFunctionBegin;
1455: if (!ll) PetscFunctionReturn(PETSC_SUCCESS);
1456: ai = a->i;
1457: aj = a->j;
1458: aa = a->a;
1459: m = A->rmap->N;
1460: bs = A->rmap->bs;
1461: mbs = a->mbs;
1462: bs2 = a->bs2;
1464: PetscCall(VecGetArrayRead(ll, &l));
1465: PetscCall(VecGetLocalSize(ll, &lm));
1466: PetscCheck(lm == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
1467: for (i = 0; i < mbs; i++) { /* for each block row */
1468: M = ai[i + 1] - ai[i];
1469: li = l + i * bs;
1470: v = aa + bs2 * ai[i];
1471: for (j = 0; j < M; j++) { /* for each block */
1472: ri = l + bs * aj[ai[i] + j];
1473: for (k = 0; k < bs; k++) {
1474: x = ri[k];
1475: for (tmp = 0; tmp < bs; tmp++) (*v++) *= li[tmp] * x;
1476: }
1477: }
1478: }
1479: PetscCall(VecRestoreArrayRead(ll, &l));
1480: PetscCall(PetscLogFlops(2.0 * a->nz));
1481: PetscFunctionReturn(PETSC_SUCCESS);
1482: }
1484: PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A, MatInfoType flag, MatInfo *info)
1485: {
1486: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1488: PetscFunctionBegin;
1489: info->block_size = a->bs2;
1490: info->nz_allocated = a->bs2 * a->maxnz; /*num. of nonzeros in upper triangular part */
1491: info->nz_used = a->bs2 * a->nz; /*num. of nonzeros in upper triangular part */
1492: info->nz_unneeded = info->nz_allocated - info->nz_used;
1493: info->assemblies = A->num_ass;
1494: info->mallocs = A->info.mallocs;
1495: info->memory = 0; /* REVIEW ME */
1496: if (A->factortype) {
1497: info->fill_ratio_given = A->info.fill_ratio_given;
1498: info->fill_ratio_needed = A->info.fill_ratio_needed;
1499: info->factor_mallocs = A->info.factor_mallocs;
1500: } else {
1501: info->fill_ratio_given = 0;
1502: info->fill_ratio_needed = 0;
1503: info->factor_mallocs = 0;
1504: }
1505: PetscFunctionReturn(PETSC_SUCCESS);
1506: }
1508: PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
1509: {
1510: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1512: PetscFunctionBegin;
1513: PetscCall(PetscArrayzero(a->a, a->bs2 * a->i[a->mbs]));
1514: PetscFunctionReturn(PETSC_SUCCESS);
1515: }
1517: PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A, Vec v, PetscInt idx[])
1518: {
1519: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1520: PetscInt i, j, n, row, col, bs, mbs;
1521: const PetscInt *ai, *aj;
1522: PetscReal atmp;
1523: const MatScalar *aa;
1524: PetscScalar *x;
1525: PetscInt ncols, brow, bcol, krow, kcol;
1527: PetscFunctionBegin;
1528: PetscCheck(!idx, PETSC_COMM_SELF, PETSC_ERR_SUP, "Send email to petsc-maint@mcs.anl.gov");
1529: PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
1530: bs = A->rmap->bs;
1531: aa = a->a;
1532: ai = a->i;
1533: aj = a->j;
1534: mbs = a->mbs;
1536: PetscCall(VecSet(v, 0.0));
1537: PetscCall(VecGetArray(v, &x));
1538: PetscCall(VecGetLocalSize(v, &n));
1539: PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
1540: for (i = 0; i < mbs; i++) {
1541: ncols = ai[1] - ai[0];
1542: ai++;
1543: brow = bs * i;
1544: for (j = 0; j < ncols; j++) {
1545: bcol = bs * (*aj);
1546: for (kcol = 0; kcol < bs; kcol++) {
1547: col = bcol + kcol; /* col index */
1548: for (krow = 0; krow < bs; krow++) {
1549: atmp = PetscAbsScalar(*aa);
1550: aa++;
1551: row = brow + krow; /* row index */
1552: if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1553: if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1554: }
1555: }
1556: aj++;
1557: }
1558: }
1559: PetscCall(VecRestoreArray(v, &x));
1560: PetscFunctionReturn(PETSC_SUCCESS);
1561: }
1563: PetscErrorCode MatMatMultSymbolic_SeqSBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C)
1564: {
1565: PetscFunctionBegin;
1566: PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C));
1567: C->ops->matmultnumeric = MatMatMultNumeric_SeqSBAIJ_SeqDense;
1568: PetscFunctionReturn(PETSC_SUCCESS);
1569: }
1571: static PetscErrorCode MatMatMult_SeqSBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1572: {
1573: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1574: PetscScalar *z = c;
1575: const PetscScalar *xb;
1576: PetscScalar x1;
1577: const MatScalar *v = a->a, *vv;
1578: PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1579: #if defined(PETSC_USE_COMPLEX)
1580: const int aconj = A->hermitian == PETSC_BOOL3_TRUE;
1581: #else
1582: const int aconj = 0;
1583: #endif
1585: PetscFunctionBegin;
1586: for (i = 0; i < mbs; i++) {
1587: n = ii[1] - ii[0];
1588: ii++;
1589: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1590: PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1591: jj = idx;
1592: vv = v;
1593: for (k = 0; k < cn; k++) {
1594: idx = jj;
1595: v = vv;
1596: for (j = 0; j < n; j++) {
1597: xb = b + (*idx);
1598: x1 = xb[0 + k * bm];
1599: z[0 + k * cm] += v[0] * x1;
1600: if (*idx != i) c[(*idx) + k * cm] += (aconj ? PetscConj(v[0]) : v[0]) * b[i + k * bm];
1601: v += 1;
1602: ++idx;
1603: }
1604: }
1605: z += 1;
1606: }
1607: PetscFunctionReturn(PETSC_SUCCESS);
1608: }
1610: static PetscErrorCode MatMatMult_SeqSBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1611: {
1612: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1613: PetscScalar *z = c;
1614: const PetscScalar *xb;
1615: PetscScalar x1, x2;
1616: const MatScalar *v = a->a, *vv;
1617: PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1619: PetscFunctionBegin;
1620: for (i = 0; i < mbs; i++) {
1621: n = ii[1] - ii[0];
1622: ii++;
1623: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1624: PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1625: jj = idx;
1626: vv = v;
1627: for (k = 0; k < cn; k++) {
1628: idx = jj;
1629: v = vv;
1630: for (j = 0; j < n; j++) {
1631: xb = b + 2 * (*idx);
1632: x1 = xb[0 + k * bm];
1633: x2 = xb[1 + k * bm];
1634: z[0 + k * cm] += v[0] * x1 + v[2] * x2;
1635: z[1 + k * cm] += v[1] * x1 + v[3] * x2;
1636: if (*idx != i) {
1637: c[2 * (*idx) + 0 + k * cm] += v[0] * b[2 * i + k * bm] + v[1] * b[2 * i + 1 + k * bm];
1638: c[2 * (*idx) + 1 + k * cm] += v[2] * b[2 * i + k * bm] + v[3] * b[2 * i + 1 + k * bm];
1639: }
1640: v += 4;
1641: ++idx;
1642: }
1643: }
1644: z += 2;
1645: }
1646: PetscFunctionReturn(PETSC_SUCCESS);
1647: }
1649: static PetscErrorCode MatMatMult_SeqSBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1650: {
1651: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1652: PetscScalar *z = c;
1653: const PetscScalar *xb;
1654: PetscScalar x1, x2, x3;
1655: const MatScalar *v = a->a, *vv;
1656: PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1658: PetscFunctionBegin;
1659: for (i = 0; i < mbs; i++) {
1660: n = ii[1] - ii[0];
1661: ii++;
1662: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1663: PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1664: jj = idx;
1665: vv = v;
1666: for (k = 0; k < cn; k++) {
1667: idx = jj;
1668: v = vv;
1669: for (j = 0; j < n; j++) {
1670: xb = b + 3 * (*idx);
1671: x1 = xb[0 + k * bm];
1672: x2 = xb[1 + k * bm];
1673: x3 = xb[2 + k * bm];
1674: z[0 + k * cm] += v[0] * x1 + v[3] * x2 + v[6] * x3;
1675: z[1 + k * cm] += v[1] * x1 + v[4] * x2 + v[7] * x3;
1676: z[2 + k * cm] += v[2] * x1 + v[5] * x2 + v[8] * x3;
1677: if (*idx != i) {
1678: c[3 * (*idx) + 0 + k * cm] += v[0] * b[3 * i + k * bm] + v[3] * b[3 * i + 1 + k * bm] + v[6] * b[3 * i + 2 + k * bm];
1679: c[3 * (*idx) + 1 + k * cm] += v[1] * b[3 * i + k * bm] + v[4] * b[3 * i + 1 + k * bm] + v[7] * b[3 * i + 2 + k * bm];
1680: c[3 * (*idx) + 2 + k * cm] += v[2] * b[3 * i + k * bm] + v[5] * b[3 * i + 1 + k * bm] + v[8] * b[3 * i + 2 + k * bm];
1681: }
1682: v += 9;
1683: ++idx;
1684: }
1685: }
1686: z += 3;
1687: }
1688: PetscFunctionReturn(PETSC_SUCCESS);
1689: }
1691: static PetscErrorCode MatMatMult_SeqSBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1692: {
1693: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1694: PetscScalar *z = c;
1695: const PetscScalar *xb;
1696: PetscScalar x1, x2, x3, x4;
1697: const MatScalar *v = a->a, *vv;
1698: PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1700: PetscFunctionBegin;
1701: for (i = 0; i < mbs; i++) {
1702: n = ii[1] - ii[0];
1703: ii++;
1704: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1705: PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1706: jj = idx;
1707: vv = v;
1708: for (k = 0; k < cn; k++) {
1709: idx = jj;
1710: v = vv;
1711: for (j = 0; j < n; j++) {
1712: xb = b + 4 * (*idx);
1713: x1 = xb[0 + k * bm];
1714: x2 = xb[1 + k * bm];
1715: x3 = xb[2 + k * bm];
1716: x4 = xb[3 + k * bm];
1717: z[0 + k * cm] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
1718: z[1 + k * cm] += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
1719: z[2 + k * cm] += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
1720: z[3 + k * cm] += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
1721: if (*idx != i) {
1722: c[4 * (*idx) + 0 + k * cm] += v[0] * b[4 * i + k * bm] + v[4] * b[4 * i + 1 + k * bm] + v[8] * b[4 * i + 2 + k * bm] + v[12] * b[4 * i + 3 + k * bm];
1723: c[4 * (*idx) + 1 + k * cm] += v[1] * b[4 * i + k * bm] + v[5] * b[4 * i + 1 + k * bm] + v[9] * b[4 * i + 2 + k * bm] + v[13] * b[4 * i + 3 + k * bm];
1724: c[4 * (*idx) + 2 + k * cm] += v[2] * b[4 * i + k * bm] + v[6] * b[4 * i + 1 + k * bm] + v[10] * b[4 * i + 2 + k * bm] + v[14] * b[4 * i + 3 + k * bm];
1725: c[4 * (*idx) + 3 + k * cm] += v[3] * b[4 * i + k * bm] + v[7] * b[4 * i + 1 + k * bm] + v[11] * b[4 * i + 2 + k * bm] + v[15] * b[4 * i + 3 + k * bm];
1726: }
1727: v += 16;
1728: ++idx;
1729: }
1730: }
1731: z += 4;
1732: }
1733: PetscFunctionReturn(PETSC_SUCCESS);
1734: }
1736: static PetscErrorCode MatMatMult_SeqSBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1737: {
1738: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1739: PetscScalar *z = c;
1740: const PetscScalar *xb;
1741: PetscScalar x1, x2, x3, x4, x5;
1742: const MatScalar *v = a->a, *vv;
1743: PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1745: PetscFunctionBegin;
1746: for (i = 0; i < mbs; i++) {
1747: n = ii[1] - ii[0];
1748: ii++;
1749: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1750: PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1751: jj = idx;
1752: vv = v;
1753: for (k = 0; k < cn; k++) {
1754: idx = jj;
1755: v = vv;
1756: for (j = 0; j < n; j++) {
1757: xb = b + 5 * (*idx);
1758: x1 = xb[0 + k * bm];
1759: x2 = xb[1 + k * bm];
1760: x3 = xb[2 + k * bm];
1761: x4 = xb[3 + k * bm];
1762: x5 = xb[4 + k * cm];
1763: z[0 + k * cm] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
1764: z[1 + k * cm] += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
1765: z[2 + k * cm] += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
1766: z[3 + k * cm] += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
1767: z[4 + k * cm] += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
1768: if (*idx != i) {
1769: c[5 * (*idx) + 0 + k * cm] += v[0] * b[5 * i + k * bm] + v[5] * b[5 * i + 1 + k * bm] + v[10] * b[5 * i + 2 + k * bm] + v[15] * b[5 * i + 3 + k * bm] + v[20] * b[5 * i + 4 + k * bm];
1770: c[5 * (*idx) + 1 + k * cm] += v[1] * b[5 * i + k * bm] + v[6] * b[5 * i + 1 + k * bm] + v[11] * b[5 * i + 2 + k * bm] + v[16] * b[5 * i + 3 + k * bm] + v[21] * b[5 * i + 4 + k * bm];
1771: c[5 * (*idx) + 2 + k * cm] += v[2] * b[5 * i + k * bm] + v[7] * b[5 * i + 1 + k * bm] + v[12] * b[5 * i + 2 + k * bm] + v[17] * b[5 * i + 3 + k * bm] + v[22] * b[5 * i + 4 + k * bm];
1772: c[5 * (*idx) + 3 + k * cm] += v[3] * b[5 * i + k * bm] + v[8] * b[5 * i + 1 + k * bm] + v[13] * b[5 * i + 2 + k * bm] + v[18] * b[5 * i + 3 + k * bm] + v[23] * b[5 * i + 4 + k * bm];
1773: c[5 * (*idx) + 4 + k * cm] += v[4] * b[5 * i + k * bm] + v[9] * b[5 * i + 1 + k * bm] + v[14] * b[5 * i + 2 + k * bm] + v[19] * b[5 * i + 3 + k * bm] + v[24] * b[5 * i + 4 + k * bm];
1774: }
1775: v += 25;
1776: ++idx;
1777: }
1778: }
1779: z += 5;
1780: }
1781: PetscFunctionReturn(PETSC_SUCCESS);
1782: }
1784: PetscErrorCode MatMatMultNumeric_SeqSBAIJ_SeqDense(Mat A, Mat B, Mat C)
1785: {
1786: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1787: Mat_SeqDense *bd = (Mat_SeqDense *)B->data;
1788: Mat_SeqDense *cd = (Mat_SeqDense *)C->data;
1789: PetscInt cm = cd->lda, cn = B->cmap->n, bm = bd->lda;
1790: PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
1791: PetscBLASInt bbs, bcn, bbm, bcm;
1792: PetscScalar *z = NULL;
1793: PetscScalar *c, *b;
1794: const MatScalar *v;
1795: const PetscInt *idx, *ii;
1796: PetscScalar _DOne = 1.0;
1798: PetscFunctionBegin;
1799: if (!cm || !cn) PetscFunctionReturn(PETSC_SUCCESS);
1800: PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n);
1801: PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n);
1802: PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n);
1803: b = bd->v;
1804: PetscCall(MatZeroEntries(C));
1805: PetscCall(MatDenseGetArray(C, &c));
1806: switch (bs) {
1807: case 1:
1808: PetscCall(MatMatMult_SeqSBAIJ_1_Private(A, b, bm, c, cm, cn));
1809: break;
1810: case 2:
1811: PetscCall(MatMatMult_SeqSBAIJ_2_Private(A, b, bm, c, cm, cn));
1812: break;
1813: case 3:
1814: PetscCall(MatMatMult_SeqSBAIJ_3_Private(A, b, bm, c, cm, cn));
1815: break;
1816: case 4:
1817: PetscCall(MatMatMult_SeqSBAIJ_4_Private(A, b, bm, c, cm, cn));
1818: break;
1819: case 5:
1820: PetscCall(MatMatMult_SeqSBAIJ_5_Private(A, b, bm, c, cm, cn));
1821: break;
1822: default: /* block sizes larger than 5 by 5 are handled by BLAS */
1823: PetscCall(PetscBLASIntCast(bs, &bbs));
1824: PetscCall(PetscBLASIntCast(cn, &bcn));
1825: PetscCall(PetscBLASIntCast(bm, &bbm));
1826: PetscCall(PetscBLASIntCast(cm, &bcm));
1827: idx = a->j;
1828: v = a->a;
1829: mbs = a->mbs;
1830: ii = a->i;
1831: z = c;
1832: for (i = 0; i < mbs; i++) {
1833: n = ii[1] - ii[0];
1834: ii++;
1835: for (j = 0; j < n; j++) {
1836: if (*idx != i) PetscCallBLAS("BLASgemm", BLASgemm_("T", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * i, &bbm, &_DOne, c + bs * (*idx), &bcm));
1837: PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DOne, z, &bcm));
1838: v += bs2;
1839: }
1840: z += bs;
1841: }
1842: }
1843: PetscCall(MatDenseRestoreArray(C, &c));
1844: PetscCall(PetscLogFlops((2.0 * (a->nz * 2.0 - a->nonzerorowcnt) * bs2 - a->nonzerorowcnt) * cn));
1845: PetscFunctionReturn(PETSC_SUCCESS);
1846: }