Actual source code: baij2.c

  1: #include <../src/mat/impls/baij/seq/baij.h>
  2: #include <../src/mat/impls/dense/seq/dense.h>
  3: #include <petsc/private/kernels/blockinvert.h>
  4: #include <petscbt.h>
  5: #include <petscblaslapack.h>

  7: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
  8:   #include <immintrin.h>
  9: #elif defined(PETSC_HAVE_XMMINTRIN_H)
 10:   #include <xmmintrin.h>
 11: #endif

 13: PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov)
 14: {
 15:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
 16:   PetscInt        row, i, j, k, l, m, n, *nidx, isz, val, ival;
 17:   const PetscInt *idx;
 18:   PetscInt        start, end, *ai, *aj, bs;
 19:   PetscBT         table;

 21:   PetscFunctionBegin;
 22:   m  = a->mbs;
 23:   ai = a->i;
 24:   aj = a->j;
 25:   bs = A->rmap->bs;

 27:   PetscCheck(ov >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");

 29:   PetscCall(PetscBTCreate(m, &table));
 30:   PetscCall(PetscMalloc1(m + 1, &nidx));

 32:   for (i = 0; i < is_max; i++) {
 33:     /* Initialise the two local arrays */
 34:     isz = 0;
 35:     PetscCall(PetscBTMemzero(m, table));

 37:     /* Extract the indices, assume there can be duplicate entries */
 38:     PetscCall(ISGetIndices(is[i], &idx));
 39:     PetscCall(ISGetLocalSize(is[i], &n));

 41:     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
 42:     for (j = 0; j < n; ++j) {
 43:       ival = idx[j] / bs; /* convert the indices into block indices */
 44:       PetscCheck(ival < m, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "index greater than mat-dim");
 45:       if (!PetscBTLookupSet(table, ival)) nidx[isz++] = ival;
 46:     }
 47:     PetscCall(ISRestoreIndices(is[i], &idx));
 48:     PetscCall(ISDestroy(&is[i]));

 50:     k = 0;
 51:     for (j = 0; j < ov; j++) { /* for each overlap*/
 52:       n = isz;
 53:       for (; k < n; k++) { /* do only those rows in nidx[k], which are not done yet */
 54:         row   = nidx[k];
 55:         start = ai[row];
 56:         end   = ai[row + 1];
 57:         for (l = start; l < end; l++) {
 58:           val = aj[l];
 59:           if (!PetscBTLookupSet(table, val)) nidx[isz++] = val;
 60:         }
 61:       }
 62:     }
 63:     PetscCall(ISCreateBlock(PETSC_COMM_SELF, bs, isz, nidx, PETSC_COPY_VALUES, is + i));
 64:   }
 65:   PetscCall(PetscBTDestroy(&table));
 66:   PetscCall(PetscFree(nidx));
 67:   PetscFunctionReturn(PETSC_SUCCESS);
 68: }

 70: static PetscErrorCode MatCreateSubMatrix_SeqBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
 71: {
 72:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *c;
 73:   PetscInt       *smap, i, k, kstart, kend, oldcols = a->nbs, *lens;
 74:   PetscInt        row, mat_i, *mat_j, tcol, *mat_ilen;
 75:   const PetscInt *irow, *icol;
 76:   PetscInt        nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2;
 77:   PetscInt       *aj = a->j, *ai = a->i;
 78:   MatScalar      *mat_a;
 79:   Mat             C;
 80:   PetscBool       flag;

 82:   PetscFunctionBegin;
 83:   PetscCall(ISGetIndices(isrow, &irow));
 84:   PetscCall(ISGetIndices(iscol, &icol));
 85:   PetscCall(ISGetLocalSize(isrow, &nrows));
 86:   PetscCall(ISGetLocalSize(iscol, &ncols));

 88:   PetscCall(PetscCalloc1(1 + oldcols, &smap));
 89:   ssmap = smap;
 90:   PetscCall(PetscMalloc1(1 + nrows, &lens));
 91:   for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1;
 92:   /* determine lens of each row */
 93:   for (i = 0; i < nrows; i++) {
 94:     kstart  = ai[irow[i]];
 95:     kend    = kstart + a->ilen[irow[i]];
 96:     lens[i] = 0;
 97:     for (k = kstart; k < kend; k++) {
 98:       if (ssmap[aj[k]]) lens[i]++;
 99:     }
100:   }
101:   /* Create and fill new matrix */
102:   if (scall == MAT_REUSE_MATRIX) {
103:     c = (Mat_SeqBAIJ *)((*B)->data);

105:     PetscCheck(c->mbs == nrows && c->nbs == ncols && (*B)->rmap->bs == bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Submatrix wrong size");
106:     PetscCall(PetscArraycmp(c->ilen, lens, c->mbs, &flag));
107:     PetscCheck(flag, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Cannot reuse matrix. wrong number of nonzeros");
108:     PetscCall(PetscArrayzero(c->ilen, c->mbs));
109:     C = *B;
110:   } else {
111:     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &C));
112:     PetscCall(MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE));
113:     PetscCall(MatSetType(C, ((PetscObject)A)->type_name));
114:     PetscCall(MatSeqBAIJSetPreallocation(C, bs, 0, lens));
115:   }
116:   c = (Mat_SeqBAIJ *)(C->data);
117:   for (i = 0; i < nrows; i++) {
118:     row      = irow[i];
119:     kstart   = ai[row];
120:     kend     = kstart + a->ilen[row];
121:     mat_i    = c->i[i];
122:     mat_j    = c->j ? c->j + mat_i : NULL;       /* mustn't add to NULL, that is UB */
123:     mat_a    = c->a ? c->a + mat_i * bs2 : NULL; /* mustn't add to NULL, that is UB */
124:     mat_ilen = c->ilen + i;
125:     for (k = kstart; k < kend; k++) {
126:       if ((tcol = ssmap[a->j[k]])) {
127:         *mat_j++ = tcol - 1;
128:         PetscCall(PetscArraycpy(mat_a, a->a + k * bs2, bs2));
129:         mat_a += bs2;
130:         (*mat_ilen)++;
131:       }
132:     }
133:   }
134:   /* sort */
135:   if (c->j && c->a) {
136:     MatScalar *work;
137:     PetscCall(PetscMalloc1(bs2, &work));
138:     for (i = 0; i < nrows; i++) {
139:       PetscInt ilen;
140:       mat_i = c->i[i];
141:       mat_j = c->j + mat_i;
142:       mat_a = c->a + mat_i * bs2;
143:       ilen  = c->ilen[i];
144:       PetscCall(PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work));
145:     }
146:     PetscCall(PetscFree(work));
147:   }

149:   /* Free work space */
150:   PetscCall(ISRestoreIndices(iscol, &icol));
151:   PetscCall(PetscFree(smap));
152:   PetscCall(PetscFree(lens));
153:   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
154:   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));

156:   PetscCall(ISRestoreIndices(isrow, &irow));
157:   *B = C;
158:   PetscFunctionReturn(PETSC_SUCCESS);
159: }

161: PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
162: {
163:   IS is1, is2;

165:   PetscFunctionBegin;
166:   PetscCall(ISCompressIndicesGeneral(A->rmap->N, A->rmap->n, A->rmap->bs, 1, &isrow, &is1));
167:   if (isrow == iscol) {
168:     is2 = is1;
169:     PetscCall(PetscObjectReference((PetscObject)is2));
170:   } else PetscCall(ISCompressIndicesGeneral(A->cmap->N, A->cmap->n, A->cmap->bs, 1, &iscol, &is2));
171:   PetscCall(MatCreateSubMatrix_SeqBAIJ_Private(A, is1, is2, scall, B));
172:   PetscCall(ISDestroy(&is1));
173:   PetscCall(ISDestroy(&is2));
174:   PetscFunctionReturn(PETSC_SUCCESS);
175: }

177: PetscErrorCode MatDestroySubMatrix_SeqBAIJ(Mat C)
178: {
179:   Mat_SeqBAIJ *c       = (Mat_SeqBAIJ *)C->data;
180:   Mat_SubSppt *submatj = c->submatis1;

182:   PetscFunctionBegin;
183:   PetscCall((*submatj->destroy)(C));
184:   PetscCall(MatDestroySubMatrix_Private(submatj));
185:   PetscFunctionReturn(PETSC_SUCCESS);
186: }

188: /* Note this has code duplication with MatDestroySubMatrices_SeqAIJ() */
189: PetscErrorCode MatDestroySubMatrices_SeqBAIJ(PetscInt n, Mat *mat[])
190: {
191:   PetscInt     i;
192:   Mat          C;
193:   Mat_SeqBAIJ *c;
194:   Mat_SubSppt *submatj;

196:   PetscFunctionBegin;
197:   for (i = 0; i < n; i++) {
198:     C       = (*mat)[i];
199:     c       = (Mat_SeqBAIJ *)C->data;
200:     submatj = c->submatis1;
201:     if (submatj) {
202:       if (--((PetscObject)C)->refct <= 0) {
203:         PetscCall(PetscFree(C->factorprefix));
204:         PetscCall((*submatj->destroy)(C));
205:         PetscCall(MatDestroySubMatrix_Private(submatj));
206:         PetscCall(PetscFree(C->defaultvectype));
207:         PetscCall(PetscFree(C->defaultrandtype));
208:         PetscCall(PetscLayoutDestroy(&C->rmap));
209:         PetscCall(PetscLayoutDestroy(&C->cmap));
210:         PetscCall(PetscHeaderDestroy(&C));
211:       }
212:     } else {
213:       PetscCall(MatDestroy(&C));
214:     }
215:   }

217:   /* Destroy Dummy submatrices created for reuse */
218:   PetscCall(MatDestroySubMatrices_Dummy(n, mat));

220:   PetscCall(PetscFree(*mat));
221:   PetscFunctionReturn(PETSC_SUCCESS);
222: }

224: PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[])
225: {
226:   PetscInt i;

228:   PetscFunctionBegin;
229:   if (scall == MAT_INITIAL_MATRIX) PetscCall(PetscCalloc1(n + 1, B));

231:   for (i = 0; i < n; i++) PetscCall(MatCreateSubMatrix_SeqBAIJ(A, irow[i], icol[i], scall, &(*B)[i]));
232:   PetscFunctionReturn(PETSC_SUCCESS);
233: }

235: /* Should check that shapes of vectors and matrices match */
236: PetscErrorCode MatMult_SeqBAIJ_1(Mat A, Vec xx, Vec zz)
237: {
238:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
239:   PetscScalar       *z, sum;
240:   const PetscScalar *x;
241:   const MatScalar   *v;
242:   PetscInt           mbs, i, n;
243:   const PetscInt    *idx, *ii, *ridx = NULL;
244:   PetscBool          usecprow = a->compressedrow.use;

246:   PetscFunctionBegin;
247:   PetscCall(VecGetArrayRead(xx, &x));
248:   PetscCall(VecGetArrayWrite(zz, &z));

250:   if (usecprow) {
251:     mbs  = a->compressedrow.nrows;
252:     ii   = a->compressedrow.i;
253:     ridx = a->compressedrow.rindex;
254:     PetscCall(PetscArrayzero(z, a->mbs));
255:   } else {
256:     mbs = a->mbs;
257:     ii  = a->i;
258:   }

260:   for (i = 0; i < mbs; i++) {
261:     n   = ii[1] - ii[0];
262:     v   = a->a + ii[0];
263:     idx = a->j + ii[0];
264:     ii++;
265:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
266:     PetscPrefetchBlock(v + 1 * n, 1 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
267:     sum = 0.0;
268:     PetscSparseDensePlusDot(sum, x, v, idx, n);
269:     if (usecprow) {
270:       z[ridx[i]] = sum;
271:     } else {
272:       z[i] = sum;
273:     }
274:   }
275:   PetscCall(VecRestoreArrayRead(xx, &x));
276:   PetscCall(VecRestoreArrayWrite(zz, &z));
277:   PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt));
278:   PetscFunctionReturn(PETSC_SUCCESS);
279: }

281: PetscErrorCode MatMult_SeqBAIJ_2(Mat A, Vec xx, Vec zz)
282: {
283:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
284:   PetscScalar       *z = NULL, sum1, sum2, *zarray;
285:   const PetscScalar *x, *xb;
286:   PetscScalar        x1, x2;
287:   const MatScalar   *v;
288:   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
289:   PetscBool          usecprow = a->compressedrow.use;

291:   PetscFunctionBegin;
292:   PetscCall(VecGetArrayRead(xx, &x));
293:   PetscCall(VecGetArrayWrite(zz, &zarray));

295:   idx = a->j;
296:   v   = a->a;
297:   if (usecprow) {
298:     mbs  = a->compressedrow.nrows;
299:     ii   = a->compressedrow.i;
300:     ridx = a->compressedrow.rindex;
301:     PetscCall(PetscArrayzero(zarray, 2 * a->mbs));
302:   } else {
303:     mbs = a->mbs;
304:     ii  = a->i;
305:     z   = zarray;
306:   }

308:   for (i = 0; i < mbs; i++) {
309:     n = ii[1] - ii[0];
310:     ii++;
311:     sum1 = 0.0;
312:     sum2 = 0.0;
313:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
314:     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
315:     for (j = 0; j < n; j++) {
316:       xb = x + 2 * (*idx++);
317:       x1 = xb[0];
318:       x2 = xb[1];
319:       sum1 += v[0] * x1 + v[2] * x2;
320:       sum2 += v[1] * x1 + v[3] * x2;
321:       v += 4;
322:     }
323:     if (usecprow) z = zarray + 2 * ridx[i];
324:     z[0] = sum1;
325:     z[1] = sum2;
326:     if (!usecprow) z += 2;
327:   }
328:   PetscCall(VecRestoreArrayRead(xx, &x));
329:   PetscCall(VecRestoreArrayWrite(zz, &zarray));
330:   PetscCall(PetscLogFlops(8.0 * a->nz - 2.0 * a->nonzerorowcnt));
331:   PetscFunctionReturn(PETSC_SUCCESS);
332: }

334: PetscErrorCode MatMult_SeqBAIJ_3(Mat A, Vec xx, Vec zz)
335: {
336:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
337:   PetscScalar       *z = NULL, sum1, sum2, sum3, x1, x2, x3, *zarray;
338:   const PetscScalar *x, *xb;
339:   const MatScalar   *v;
340:   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
341:   PetscBool          usecprow = a->compressedrow.use;

343: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
344:   #pragma disjoint(*v, *z, *xb)
345: #endif

347:   PetscFunctionBegin;
348:   PetscCall(VecGetArrayRead(xx, &x));
349:   PetscCall(VecGetArrayWrite(zz, &zarray));

351:   idx = a->j;
352:   v   = a->a;
353:   if (usecprow) {
354:     mbs  = a->compressedrow.nrows;
355:     ii   = a->compressedrow.i;
356:     ridx = a->compressedrow.rindex;
357:     PetscCall(PetscArrayzero(zarray, 3 * a->mbs));
358:   } else {
359:     mbs = a->mbs;
360:     ii  = a->i;
361:     z   = zarray;
362:   }

364:   for (i = 0; i < mbs; i++) {
365:     n = ii[1] - ii[0];
366:     ii++;
367:     sum1 = 0.0;
368:     sum2 = 0.0;
369:     sum3 = 0.0;
370:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
371:     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
372:     for (j = 0; j < n; j++) {
373:       xb = x + 3 * (*idx++);
374:       x1 = xb[0];
375:       x2 = xb[1];
376:       x3 = xb[2];

378:       sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3;
379:       sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3;
380:       sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3;
381:       v += 9;
382:     }
383:     if (usecprow) z = zarray + 3 * ridx[i];
384:     z[0] = sum1;
385:     z[1] = sum2;
386:     z[2] = sum3;
387:     if (!usecprow) z += 3;
388:   }
389:   PetscCall(VecRestoreArrayRead(xx, &x));
390:   PetscCall(VecRestoreArrayWrite(zz, &zarray));
391:   PetscCall(PetscLogFlops(18.0 * a->nz - 3.0 * a->nonzerorowcnt));
392:   PetscFunctionReturn(PETSC_SUCCESS);
393: }

395: PetscErrorCode MatMult_SeqBAIJ_4(Mat A, Vec xx, Vec zz)
396: {
397:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
398:   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, x1, x2, x3, x4, *zarray;
399:   const PetscScalar *x, *xb;
400:   const MatScalar   *v;
401:   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
402:   PetscBool          usecprow = a->compressedrow.use;

404:   PetscFunctionBegin;
405:   PetscCall(VecGetArrayRead(xx, &x));
406:   PetscCall(VecGetArrayWrite(zz, &zarray));

408:   idx = a->j;
409:   v   = a->a;
410:   if (usecprow) {
411:     mbs  = a->compressedrow.nrows;
412:     ii   = a->compressedrow.i;
413:     ridx = a->compressedrow.rindex;
414:     PetscCall(PetscArrayzero(zarray, 4 * a->mbs));
415:   } else {
416:     mbs = a->mbs;
417:     ii  = a->i;
418:     z   = zarray;
419:   }

421:   for (i = 0; i < mbs; i++) {
422:     n = ii[1] - ii[0];
423:     ii++;
424:     sum1 = 0.0;
425:     sum2 = 0.0;
426:     sum3 = 0.0;
427:     sum4 = 0.0;

429:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
430:     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
431:     for (j = 0; j < n; j++) {
432:       xb = x + 4 * (*idx++);
433:       x1 = xb[0];
434:       x2 = xb[1];
435:       x3 = xb[2];
436:       x4 = xb[3];
437:       sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
438:       sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
439:       sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
440:       sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
441:       v += 16;
442:     }
443:     if (usecprow) z = zarray + 4 * ridx[i];
444:     z[0] = sum1;
445:     z[1] = sum2;
446:     z[2] = sum3;
447:     z[3] = sum4;
448:     if (!usecprow) z += 4;
449:   }
450:   PetscCall(VecRestoreArrayRead(xx, &x));
451:   PetscCall(VecRestoreArrayWrite(zz, &zarray));
452:   PetscCall(PetscLogFlops(32.0 * a->nz - 4.0 * a->nonzerorowcnt));
453:   PetscFunctionReturn(PETSC_SUCCESS);
454: }

456: PetscErrorCode MatMult_SeqBAIJ_5(Mat A, Vec xx, Vec zz)
457: {
458:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
459:   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, x1, x2, x3, x4, x5, *zarray;
460:   const PetscScalar *xb, *x;
461:   const MatScalar   *v;
462:   const PetscInt    *idx, *ii, *ridx = NULL;
463:   PetscInt           mbs, i, j, n;
464:   PetscBool          usecprow = a->compressedrow.use;

466:   PetscFunctionBegin;
467:   PetscCall(VecGetArrayRead(xx, &x));
468:   PetscCall(VecGetArrayWrite(zz, &zarray));

470:   idx = a->j;
471:   v   = a->a;
472:   if (usecprow) {
473:     mbs  = a->compressedrow.nrows;
474:     ii   = a->compressedrow.i;
475:     ridx = a->compressedrow.rindex;
476:     PetscCall(PetscArrayzero(zarray, 5 * a->mbs));
477:   } else {
478:     mbs = a->mbs;
479:     ii  = a->i;
480:     z   = zarray;
481:   }

483:   for (i = 0; i < mbs; i++) {
484:     n = ii[1] - ii[0];
485:     ii++;
486:     sum1 = 0.0;
487:     sum2 = 0.0;
488:     sum3 = 0.0;
489:     sum4 = 0.0;
490:     sum5 = 0.0;
491:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
492:     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
493:     for (j = 0; j < n; j++) {
494:       xb = x + 5 * (*idx++);
495:       x1 = xb[0];
496:       x2 = xb[1];
497:       x3 = xb[2];
498:       x4 = xb[3];
499:       x5 = xb[4];
500:       sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
501:       sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
502:       sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
503:       sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
504:       sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
505:       v += 25;
506:     }
507:     if (usecprow) z = zarray + 5 * ridx[i];
508:     z[0] = sum1;
509:     z[1] = sum2;
510:     z[2] = sum3;
511:     z[3] = sum4;
512:     z[4] = sum5;
513:     if (!usecprow) z += 5;
514:   }
515:   PetscCall(VecRestoreArrayRead(xx, &x));
516:   PetscCall(VecRestoreArrayWrite(zz, &zarray));
517:   PetscCall(PetscLogFlops(50.0 * a->nz - 5.0 * a->nonzerorowcnt));
518:   PetscFunctionReturn(PETSC_SUCCESS);
519: }

521: PetscErrorCode MatMult_SeqBAIJ_6(Mat A, Vec xx, Vec zz)
522: {
523:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
524:   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6;
525:   const PetscScalar *x, *xb;
526:   PetscScalar        x1, x2, x3, x4, x5, x6, *zarray;
527:   const MatScalar   *v;
528:   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
529:   PetscBool          usecprow = a->compressedrow.use;

531:   PetscFunctionBegin;
532:   PetscCall(VecGetArrayRead(xx, &x));
533:   PetscCall(VecGetArrayWrite(zz, &zarray));

535:   idx = a->j;
536:   v   = a->a;
537:   if (usecprow) {
538:     mbs  = a->compressedrow.nrows;
539:     ii   = a->compressedrow.i;
540:     ridx = a->compressedrow.rindex;
541:     PetscCall(PetscArrayzero(zarray, 6 * a->mbs));
542:   } else {
543:     mbs = a->mbs;
544:     ii  = a->i;
545:     z   = zarray;
546:   }

548:   for (i = 0; i < mbs; i++) {
549:     n = ii[1] - ii[0];
550:     ii++;
551:     sum1 = 0.0;
552:     sum2 = 0.0;
553:     sum3 = 0.0;
554:     sum4 = 0.0;
555:     sum5 = 0.0;
556:     sum6 = 0.0;

558:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
559:     PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
560:     for (j = 0; j < n; j++) {
561:       xb = x + 6 * (*idx++);
562:       x1 = xb[0];
563:       x2 = xb[1];
564:       x3 = xb[2];
565:       x4 = xb[3];
566:       x5 = xb[4];
567:       x6 = xb[5];
568:       sum1 += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
569:       sum2 += v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
570:       sum3 += v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
571:       sum4 += v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
572:       sum5 += v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
573:       sum6 += v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
574:       v += 36;
575:     }
576:     if (usecprow) z = zarray + 6 * ridx[i];
577:     z[0] = sum1;
578:     z[1] = sum2;
579:     z[2] = sum3;
580:     z[3] = sum4;
581:     z[4] = sum5;
582:     z[5] = sum6;
583:     if (!usecprow) z += 6;
584:   }

586:   PetscCall(VecRestoreArrayRead(xx, &x));
587:   PetscCall(VecRestoreArrayWrite(zz, &zarray));
588:   PetscCall(PetscLogFlops(72.0 * a->nz - 6.0 * a->nonzerorowcnt));
589:   PetscFunctionReturn(PETSC_SUCCESS);
590: }

592: PetscErrorCode MatMult_SeqBAIJ_7(Mat A, Vec xx, Vec zz)
593: {
594:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
595:   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
596:   const PetscScalar *x, *xb;
597:   PetscScalar        x1, x2, x3, x4, x5, x6, x7, *zarray;
598:   const MatScalar   *v;
599:   PetscInt           mbs, i, *idx, *ii, j, n, *ridx = NULL;
600:   PetscBool          usecprow = a->compressedrow.use;

602:   PetscFunctionBegin;
603:   PetscCall(VecGetArrayRead(xx, &x));
604:   PetscCall(VecGetArrayWrite(zz, &zarray));

606:   idx = a->j;
607:   v   = a->a;
608:   if (usecprow) {
609:     mbs  = a->compressedrow.nrows;
610:     ii   = a->compressedrow.i;
611:     ridx = a->compressedrow.rindex;
612:     PetscCall(PetscArrayzero(zarray, 7 * a->mbs));
613:   } else {
614:     mbs = a->mbs;
615:     ii  = a->i;
616:     z   = zarray;
617:   }

619:   for (i = 0; i < mbs; i++) {
620:     n = ii[1] - ii[0];
621:     ii++;
622:     sum1 = 0.0;
623:     sum2 = 0.0;
624:     sum3 = 0.0;
625:     sum4 = 0.0;
626:     sum5 = 0.0;
627:     sum6 = 0.0;
628:     sum7 = 0.0;

630:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
631:     PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
632:     for (j = 0; j < n; j++) {
633:       xb = x + 7 * (*idx++);
634:       x1 = xb[0];
635:       x2 = xb[1];
636:       x3 = xb[2];
637:       x4 = xb[3];
638:       x5 = xb[4];
639:       x6 = xb[5];
640:       x7 = xb[6];
641:       sum1 += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
642:       sum2 += v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
643:       sum3 += v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
644:       sum4 += v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
645:       sum5 += v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
646:       sum6 += v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
647:       sum7 += v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
648:       v += 49;
649:     }
650:     if (usecprow) z = zarray + 7 * ridx[i];
651:     z[0] = sum1;
652:     z[1] = sum2;
653:     z[2] = sum3;
654:     z[3] = sum4;
655:     z[4] = sum5;
656:     z[5] = sum6;
657:     z[6] = sum7;
658:     if (!usecprow) z += 7;
659:   }

661:   PetscCall(VecRestoreArrayRead(xx, &x));
662:   PetscCall(VecRestoreArrayWrite(zz, &zarray));
663:   PetscCall(PetscLogFlops(98.0 * a->nz - 7.0 * a->nonzerorowcnt));
664:   PetscFunctionReturn(PETSC_SUCCESS);
665: }

667: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
668: PetscErrorCode MatMult_SeqBAIJ_9_AVX2(Mat A, Vec xx, Vec zz)
669: {
670:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
671:   PetscScalar       *z = NULL, *work, *workt, *zarray;
672:   const PetscScalar *x, *xb;
673:   const MatScalar   *v;
674:   PetscInt           mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
675:   const PetscInt    *idx, *ii, *ridx = NULL;
676:   PetscInt           k;
677:   PetscBool          usecprow = a->compressedrow.use;

679:   __m256d a0, a1, a2, a3, a4, a5;
680:   __m256d w0, w1, w2, w3;
681:   __m256d z0, z1, z2;
682:   __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL << 63);

684:   PetscFunctionBegin;
685:   PetscCall(VecGetArrayRead(xx, &x));
686:   PetscCall(VecGetArrayWrite(zz, &zarray));

688:   idx = a->j;
689:   v   = a->a;
690:   if (usecprow) {
691:     mbs  = a->compressedrow.nrows;
692:     ii   = a->compressedrow.i;
693:     ridx = a->compressedrow.rindex;
694:     PetscCall(PetscArrayzero(zarray, bs * a->mbs));
695:   } else {
696:     mbs = a->mbs;
697:     ii  = a->i;
698:     z   = zarray;
699:   }

701:   if (!a->mult_work) {
702:     k = PetscMax(A->rmap->n, A->cmap->n);
703:     PetscCall(PetscMalloc1(k + 1, &a->mult_work));
704:   }

706:   work = a->mult_work;
707:   for (i = 0; i < mbs; i++) {
708:     n = ii[1] - ii[0];
709:     ii++;
710:     workt = work;
711:     for (j = 0; j < n; j++) {
712:       xb = x + bs * (*idx++);
713:       for (k = 0; k < bs; k++) workt[k] = xb[k];
714:       workt += bs;
715:     }
716:     if (usecprow) z = zarray + bs * ridx[i];

718:     z0 = _mm256_setzero_pd();
719:     z1 = _mm256_setzero_pd();
720:     z2 = _mm256_setzero_pd();

722:     for (j = 0; j < n; j++) {
723:       /* first column of a */
724:       w0 = _mm256_set1_pd(work[j * 9]);
725:       a0 = _mm256_loadu_pd(&v[j * 81]);
726:       z0 = _mm256_fmadd_pd(a0, w0, z0);
727:       a1 = _mm256_loadu_pd(&v[j * 81 + 4]);
728:       z1 = _mm256_fmadd_pd(a1, w0, z1);
729:       a2 = _mm256_loadu_pd(&v[j * 81 + 8]);
730:       z2 = _mm256_fmadd_pd(a2, w0, z2);

732:       /* second column of a */
733:       w1 = _mm256_set1_pd(work[j * 9 + 1]);
734:       a0 = _mm256_loadu_pd(&v[j * 81 + 9]);
735:       z0 = _mm256_fmadd_pd(a0, w1, z0);
736:       a1 = _mm256_loadu_pd(&v[j * 81 + 13]);
737:       z1 = _mm256_fmadd_pd(a1, w1, z1);
738:       a2 = _mm256_loadu_pd(&v[j * 81 + 17]);
739:       z2 = _mm256_fmadd_pd(a2, w1, z2);

741:       /* third column of a */
742:       w2 = _mm256_set1_pd(work[j * 9 + 2]);
743:       a3 = _mm256_loadu_pd(&v[j * 81 + 18]);
744:       z0 = _mm256_fmadd_pd(a3, w2, z0);
745:       a4 = _mm256_loadu_pd(&v[j * 81 + 22]);
746:       z1 = _mm256_fmadd_pd(a4, w2, z1);
747:       a5 = _mm256_loadu_pd(&v[j * 81 + 26]);
748:       z2 = _mm256_fmadd_pd(a5, w2, z2);

750:       /* fourth column of a */
751:       w3 = _mm256_set1_pd(work[j * 9 + 3]);
752:       a0 = _mm256_loadu_pd(&v[j * 81 + 27]);
753:       z0 = _mm256_fmadd_pd(a0, w3, z0);
754:       a1 = _mm256_loadu_pd(&v[j * 81 + 31]);
755:       z1 = _mm256_fmadd_pd(a1, w3, z1);
756:       a2 = _mm256_loadu_pd(&v[j * 81 + 35]);
757:       z2 = _mm256_fmadd_pd(a2, w3, z2);

759:       /* fifth column of a */
760:       w0 = _mm256_set1_pd(work[j * 9 + 4]);
761:       a3 = _mm256_loadu_pd(&v[j * 81 + 36]);
762:       z0 = _mm256_fmadd_pd(a3, w0, z0);
763:       a4 = _mm256_loadu_pd(&v[j * 81 + 40]);
764:       z1 = _mm256_fmadd_pd(a4, w0, z1);
765:       a5 = _mm256_loadu_pd(&v[j * 81 + 44]);
766:       z2 = _mm256_fmadd_pd(a5, w0, z2);

768:       /* sixth column of a */
769:       w1 = _mm256_set1_pd(work[j * 9 + 5]);
770:       a0 = _mm256_loadu_pd(&v[j * 81 + 45]);
771:       z0 = _mm256_fmadd_pd(a0, w1, z0);
772:       a1 = _mm256_loadu_pd(&v[j * 81 + 49]);
773:       z1 = _mm256_fmadd_pd(a1, w1, z1);
774:       a2 = _mm256_loadu_pd(&v[j * 81 + 53]);
775:       z2 = _mm256_fmadd_pd(a2, w1, z2);

777:       /* seventh column of a */
778:       w2 = _mm256_set1_pd(work[j * 9 + 6]);
779:       a0 = _mm256_loadu_pd(&v[j * 81 + 54]);
780:       z0 = _mm256_fmadd_pd(a0, w2, z0);
781:       a1 = _mm256_loadu_pd(&v[j * 81 + 58]);
782:       z1 = _mm256_fmadd_pd(a1, w2, z1);
783:       a2 = _mm256_loadu_pd(&v[j * 81 + 62]);
784:       z2 = _mm256_fmadd_pd(a2, w2, z2);

786:       /* eighth column of a */
787:       w3 = _mm256_set1_pd(work[j * 9 + 7]);
788:       a3 = _mm256_loadu_pd(&v[j * 81 + 63]);
789:       z0 = _mm256_fmadd_pd(a3, w3, z0);
790:       a4 = _mm256_loadu_pd(&v[j * 81 + 67]);
791:       z1 = _mm256_fmadd_pd(a4, w3, z1);
792:       a5 = _mm256_loadu_pd(&v[j * 81 + 71]);
793:       z2 = _mm256_fmadd_pd(a5, w3, z2);

795:       /* ninth column of a */
796:       w0 = _mm256_set1_pd(work[j * 9 + 8]);
797:       a0 = _mm256_loadu_pd(&v[j * 81 + 72]);
798:       z0 = _mm256_fmadd_pd(a0, w0, z0);
799:       a1 = _mm256_loadu_pd(&v[j * 81 + 76]);
800:       z1 = _mm256_fmadd_pd(a1, w0, z1);
801:       a2 = _mm256_maskload_pd(&v[j * 81 + 80], mask1);
802:       z2 = _mm256_fmadd_pd(a2, w0, z2);
803:     }

805:     _mm256_storeu_pd(&z[0], z0);
806:     _mm256_storeu_pd(&z[4], z1);
807:     _mm256_maskstore_pd(&z[8], mask1, z2);

809:     v += n * bs2;
810:     if (!usecprow) z += bs;
811:   }
812:   PetscCall(VecRestoreArrayRead(xx, &x));
813:   PetscCall(VecRestoreArrayWrite(zz, &zarray));
814:   PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt));
815:   PetscFunctionReturn(PETSC_SUCCESS);
816: }
817: #endif

819: PetscErrorCode MatMult_SeqBAIJ_11(Mat A, Vec xx, Vec zz)
820: {
821:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
822:   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11;
823:   const PetscScalar *x, *xb;
824:   PetscScalar       *zarray, xv;
825:   const MatScalar   *v;
826:   const PetscInt    *ii, *ij = a->j, *idx;
827:   PetscInt           mbs, i, j, k, n, *ridx = NULL;
828:   PetscBool          usecprow = a->compressedrow.use;

830:   PetscFunctionBegin;
831:   PetscCall(VecGetArrayRead(xx, &x));
832:   PetscCall(VecGetArrayWrite(zz, &zarray));

834:   v = a->a;
835:   if (usecprow) {
836:     mbs  = a->compressedrow.nrows;
837:     ii   = a->compressedrow.i;
838:     ridx = a->compressedrow.rindex;
839:     PetscCall(PetscArrayzero(zarray, 11 * a->mbs));
840:   } else {
841:     mbs = a->mbs;
842:     ii  = a->i;
843:     z   = zarray;
844:   }

846:   for (i = 0; i < mbs; i++) {
847:     n     = ii[i + 1] - ii[i];
848:     idx   = ij + ii[i];
849:     sum1  = 0.0;
850:     sum2  = 0.0;
851:     sum3  = 0.0;
852:     sum4  = 0.0;
853:     sum5  = 0.0;
854:     sum6  = 0.0;
855:     sum7  = 0.0;
856:     sum8  = 0.0;
857:     sum9  = 0.0;
858:     sum10 = 0.0;
859:     sum11 = 0.0;

861:     for (j = 0; j < n; j++) {
862:       xb = x + 11 * (idx[j]);

864:       for (k = 0; k < 11; k++) {
865:         xv = xb[k];
866:         sum1 += v[0] * xv;
867:         sum2 += v[1] * xv;
868:         sum3 += v[2] * xv;
869:         sum4 += v[3] * xv;
870:         sum5 += v[4] * xv;
871:         sum6 += v[5] * xv;
872:         sum7 += v[6] * xv;
873:         sum8 += v[7] * xv;
874:         sum9 += v[8] * xv;
875:         sum10 += v[9] * xv;
876:         sum11 += v[10] * xv;
877:         v += 11;
878:       }
879:     }
880:     if (usecprow) z = zarray + 11 * ridx[i];
881:     z[0]  = sum1;
882:     z[1]  = sum2;
883:     z[2]  = sum3;
884:     z[3]  = sum4;
885:     z[4]  = sum5;
886:     z[5]  = sum6;
887:     z[6]  = sum7;
888:     z[7]  = sum8;
889:     z[8]  = sum9;
890:     z[9]  = sum10;
891:     z[10] = sum11;

893:     if (!usecprow) z += 11;
894:   }

896:   PetscCall(VecRestoreArrayRead(xx, &x));
897:   PetscCall(VecRestoreArrayWrite(zz, &zarray));
898:   PetscCall(PetscLogFlops(242.0 * a->nz - 11.0 * a->nonzerorowcnt));
899:   PetscFunctionReturn(PETSC_SUCCESS);
900: }

902: /* MatMult_SeqBAIJ_12 version 1: Columns in the block are accessed one at a time */
903: PetscErrorCode MatMult_SeqBAIJ_12_ver1(Mat A, Vec xx, Vec zz)
904: {
905:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
906:   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
907:   const PetscScalar *x, *xb;
908:   PetscScalar       *zarray, xv;
909:   const MatScalar   *v;
910:   const PetscInt    *ii, *ij = a->j, *idx;
911:   PetscInt           mbs, i, j, k, n, *ridx = NULL;
912:   PetscBool          usecprow = a->compressedrow.use;

914:   PetscFunctionBegin;
915:   PetscCall(VecGetArrayRead(xx, &x));
916:   PetscCall(VecGetArrayWrite(zz, &zarray));

918:   v = a->a;
919:   if (usecprow) {
920:     mbs  = a->compressedrow.nrows;
921:     ii   = a->compressedrow.i;
922:     ridx = a->compressedrow.rindex;
923:     PetscCall(PetscArrayzero(zarray, 12 * a->mbs));
924:   } else {
925:     mbs = a->mbs;
926:     ii  = a->i;
927:     z   = zarray;
928:   }

930:   for (i = 0; i < mbs; i++) {
931:     n     = ii[i + 1] - ii[i];
932:     idx   = ij + ii[i];
933:     sum1  = 0.0;
934:     sum2  = 0.0;
935:     sum3  = 0.0;
936:     sum4  = 0.0;
937:     sum5  = 0.0;
938:     sum6  = 0.0;
939:     sum7  = 0.0;
940:     sum8  = 0.0;
941:     sum9  = 0.0;
942:     sum10 = 0.0;
943:     sum11 = 0.0;
944:     sum12 = 0.0;

946:     for (j = 0; j < n; j++) {
947:       xb = x + 12 * (idx[j]);

949:       for (k = 0; k < 12; k++) {
950:         xv = xb[k];
951:         sum1 += v[0] * xv;
952:         sum2 += v[1] * xv;
953:         sum3 += v[2] * xv;
954:         sum4 += v[3] * xv;
955:         sum5 += v[4] * xv;
956:         sum6 += v[5] * xv;
957:         sum7 += v[6] * xv;
958:         sum8 += v[7] * xv;
959:         sum9 += v[8] * xv;
960:         sum10 += v[9] * xv;
961:         sum11 += v[10] * xv;
962:         sum12 += v[11] * xv;
963:         v += 12;
964:       }
965:     }
966:     if (usecprow) z = zarray + 12 * ridx[i];
967:     z[0]  = sum1;
968:     z[1]  = sum2;
969:     z[2]  = sum3;
970:     z[3]  = sum4;
971:     z[4]  = sum5;
972:     z[5]  = sum6;
973:     z[6]  = sum7;
974:     z[7]  = sum8;
975:     z[8]  = sum9;
976:     z[9]  = sum10;
977:     z[10] = sum11;
978:     z[11] = sum12;
979:     if (!usecprow) z += 12;
980:   }
981:   PetscCall(VecRestoreArrayRead(xx, &x));
982:   PetscCall(VecRestoreArrayWrite(zz, &zarray));
983:   PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt));
984:   PetscFunctionReturn(PETSC_SUCCESS);
985: }

987: PetscErrorCode MatMultAdd_SeqBAIJ_12_ver1(Mat A, Vec xx, Vec yy, Vec zz)
988: {
989:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
990:   PetscScalar       *z = NULL, *y = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
991:   const PetscScalar *x, *xb;
992:   PetscScalar       *zarray, *yarray, xv;
993:   const MatScalar   *v;
994:   const PetscInt    *ii, *ij = a->j, *idx;
995:   PetscInt           mbs = a->mbs, i, j, k, n, *ridx = NULL;
996:   PetscBool          usecprow = a->compressedrow.use;

998:   PetscFunctionBegin;
999:   PetscCall(VecGetArrayRead(xx, &x));
1000:   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));

1002:   v = a->a;
1003:   if (usecprow) {
1004:     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 12 * mbs));
1005:     mbs  = a->compressedrow.nrows;
1006:     ii   = a->compressedrow.i;
1007:     ridx = a->compressedrow.rindex;
1008:   } else {
1009:     ii = a->i;
1010:     y  = yarray;
1011:     z  = zarray;
1012:   }

1014:   for (i = 0; i < mbs; i++) {
1015:     n   = ii[i + 1] - ii[i];
1016:     idx = ij + ii[i];

1018:     if (usecprow) {
1019:       y = yarray + 12 * ridx[i];
1020:       z = zarray + 12 * ridx[i];
1021:     }
1022:     sum1  = y[0];
1023:     sum2  = y[1];
1024:     sum3  = y[2];
1025:     sum4  = y[3];
1026:     sum5  = y[4];
1027:     sum6  = y[5];
1028:     sum7  = y[6];
1029:     sum8  = y[7];
1030:     sum9  = y[8];
1031:     sum10 = y[9];
1032:     sum11 = y[10];
1033:     sum12 = y[11];

1035:     for (j = 0; j < n; j++) {
1036:       xb = x + 12 * (idx[j]);

1038:       for (k = 0; k < 12; k++) {
1039:         xv = xb[k];
1040:         sum1 += v[0] * xv;
1041:         sum2 += v[1] * xv;
1042:         sum3 += v[2] * xv;
1043:         sum4 += v[3] * xv;
1044:         sum5 += v[4] * xv;
1045:         sum6 += v[5] * xv;
1046:         sum7 += v[6] * xv;
1047:         sum8 += v[7] * xv;
1048:         sum9 += v[8] * xv;
1049:         sum10 += v[9] * xv;
1050:         sum11 += v[10] * xv;
1051:         sum12 += v[11] * xv;
1052:         v += 12;
1053:       }
1054:     }

1056:     z[0]  = sum1;
1057:     z[1]  = sum2;
1058:     z[2]  = sum3;
1059:     z[3]  = sum4;
1060:     z[4]  = sum5;
1061:     z[5]  = sum6;
1062:     z[6]  = sum7;
1063:     z[7]  = sum8;
1064:     z[8]  = sum9;
1065:     z[9]  = sum10;
1066:     z[10] = sum11;
1067:     z[11] = sum12;
1068:     if (!usecprow) {
1069:       y += 12;
1070:       z += 12;
1071:     }
1072:   }
1073:   PetscCall(VecRestoreArrayRead(xx, &x));
1074:   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
1075:   PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt));
1076:   PetscFunctionReturn(PETSC_SUCCESS);
1077: }

1079: /* MatMult_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */
1080: PetscErrorCode MatMult_SeqBAIJ_12_ver2(Mat A, Vec xx, Vec zz)
1081: {
1082:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1083:   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
1084:   const PetscScalar *x, *xb;
1085:   PetscScalar        x1, x2, x3, x4, *zarray;
1086:   const MatScalar   *v;
1087:   const PetscInt    *ii, *ij = a->j, *idx, *ridx = NULL;
1088:   PetscInt           mbs, i, j, n;
1089:   PetscBool          usecprow = a->compressedrow.use;

1091:   PetscFunctionBegin;
1092:   PetscCall(VecGetArrayRead(xx, &x));
1093:   PetscCall(VecGetArrayWrite(zz, &zarray));

1095:   v = a->a;
1096:   if (usecprow) {
1097:     mbs  = a->compressedrow.nrows;
1098:     ii   = a->compressedrow.i;
1099:     ridx = a->compressedrow.rindex;
1100:     PetscCall(PetscArrayzero(zarray, 12 * a->mbs));
1101:   } else {
1102:     mbs = a->mbs;
1103:     ii  = a->i;
1104:     z   = zarray;
1105:   }

1107:   for (i = 0; i < mbs; i++) {
1108:     n   = ii[i + 1] - ii[i];
1109:     idx = ij + ii[i];

1111:     sum1 = sum2 = sum3 = sum4 = sum5 = sum6 = sum7 = sum8 = sum9 = sum10 = sum11 = sum12 = 0;
1112:     for (j = 0; j < n; j++) {
1113:       xb = x + 12 * (idx[j]);
1114:       x1 = xb[0];
1115:       x2 = xb[1];
1116:       x3 = xb[2];
1117:       x4 = xb[3];

1119:       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1120:       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1121:       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1122:       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1123:       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1124:       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1125:       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1126:       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1127:       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1128:       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1129:       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1130:       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1131:       v += 48;

1133:       x1 = xb[4];
1134:       x2 = xb[5];
1135:       x3 = xb[6];
1136:       x4 = xb[7];

1138:       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1139:       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1140:       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1141:       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1142:       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1143:       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1144:       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1145:       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1146:       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1147:       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1148:       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1149:       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1150:       v += 48;

1152:       x1 = xb[8];
1153:       x2 = xb[9];
1154:       x3 = xb[10];
1155:       x4 = xb[11];
1156:       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1157:       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1158:       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1159:       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1160:       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1161:       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1162:       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1163:       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1164:       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1165:       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1166:       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1167:       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1168:       v += 48;
1169:     }
1170:     if (usecprow) z = zarray + 12 * ridx[i];
1171:     z[0]  = sum1;
1172:     z[1]  = sum2;
1173:     z[2]  = sum3;
1174:     z[3]  = sum4;
1175:     z[4]  = sum5;
1176:     z[5]  = sum6;
1177:     z[6]  = sum7;
1178:     z[7]  = sum8;
1179:     z[8]  = sum9;
1180:     z[9]  = sum10;
1181:     z[10] = sum11;
1182:     z[11] = sum12;
1183:     if (!usecprow) z += 12;
1184:   }
1185:   PetscCall(VecRestoreArrayRead(xx, &x));
1186:   PetscCall(VecRestoreArrayWrite(zz, &zarray));
1187:   PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt));
1188:   PetscFunctionReturn(PETSC_SUCCESS);
1189: }

1191: /* MatMultAdd_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */
1192: PetscErrorCode MatMultAdd_SeqBAIJ_12_ver2(Mat A, Vec xx, Vec yy, Vec zz)
1193: {
1194:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1195:   PetscScalar       *z = NULL, *y = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12;
1196:   const PetscScalar *x, *xb;
1197:   PetscScalar        x1, x2, x3, x4, *zarray, *yarray;
1198:   const MatScalar   *v;
1199:   const PetscInt    *ii, *ij = a->j, *idx, *ridx = NULL;
1200:   PetscInt           mbs      = a->mbs, i, j, n;
1201:   PetscBool          usecprow = a->compressedrow.use;

1203:   PetscFunctionBegin;
1204:   PetscCall(VecGetArrayRead(xx, &x));
1205:   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));

1207:   v = a->a;
1208:   if (usecprow) {
1209:     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 12 * mbs));
1210:     mbs  = a->compressedrow.nrows;
1211:     ii   = a->compressedrow.i;
1212:     ridx = a->compressedrow.rindex;
1213:   } else {
1214:     ii = a->i;
1215:     y  = yarray;
1216:     z  = zarray;
1217:   }

1219:   for (i = 0; i < mbs; i++) {
1220:     n   = ii[i + 1] - ii[i];
1221:     idx = ij + ii[i];

1223:     if (usecprow) {
1224:       y = yarray + 12 * ridx[i];
1225:       z = zarray + 12 * ridx[i];
1226:     }
1227:     sum1  = y[0];
1228:     sum2  = y[1];
1229:     sum3  = y[2];
1230:     sum4  = y[3];
1231:     sum5  = y[4];
1232:     sum6  = y[5];
1233:     sum7  = y[6];
1234:     sum8  = y[7];
1235:     sum9  = y[8];
1236:     sum10 = y[9];
1237:     sum11 = y[10];
1238:     sum12 = y[11];

1240:     for (j = 0; j < n; j++) {
1241:       xb = x + 12 * (idx[j]);
1242:       x1 = xb[0];
1243:       x2 = xb[1];
1244:       x3 = xb[2];
1245:       x4 = xb[3];

1247:       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1248:       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1249:       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1250:       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1251:       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1252:       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1253:       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1254:       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1255:       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1256:       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1257:       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1258:       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1259:       v += 48;

1261:       x1 = xb[4];
1262:       x2 = xb[5];
1263:       x3 = xb[6];
1264:       x4 = xb[7];

1266:       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1267:       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1268:       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1269:       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1270:       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1271:       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1272:       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1273:       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1274:       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1275:       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1276:       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1277:       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1278:       v += 48;

1280:       x1 = xb[8];
1281:       x2 = xb[9];
1282:       x3 = xb[10];
1283:       x4 = xb[11];
1284:       sum1 += v[0] * x1 + v[12] * x2 + v[24] * x3 + v[36] * x4;
1285:       sum2 += v[1] * x1 + v[13] * x2 + v[25] * x3 + v[37] * x4;
1286:       sum3 += v[2] * x1 + v[14] * x2 + v[26] * x3 + v[38] * x4;
1287:       sum4 += v[3] * x1 + v[15] * x2 + v[27] * x3 + v[39] * x4;
1288:       sum5 += v[4] * x1 + v[16] * x2 + v[28] * x3 + v[40] * x4;
1289:       sum6 += v[5] * x1 + v[17] * x2 + v[29] * x3 + v[41] * x4;
1290:       sum7 += v[6] * x1 + v[18] * x2 + v[30] * x3 + v[42] * x4;
1291:       sum8 += v[7] * x1 + v[19] * x2 + v[31] * x3 + v[43] * x4;
1292:       sum9 += v[8] * x1 + v[20] * x2 + v[32] * x3 + v[44] * x4;
1293:       sum10 += v[9] * x1 + v[21] * x2 + v[33] * x3 + v[45] * x4;
1294:       sum11 += v[10] * x1 + v[22] * x2 + v[34] * x3 + v[46] * x4;
1295:       sum12 += v[11] * x1 + v[23] * x2 + v[35] * x3 + v[47] * x4;
1296:       v += 48;
1297:     }
1298:     z[0]  = sum1;
1299:     z[1]  = sum2;
1300:     z[2]  = sum3;
1301:     z[3]  = sum4;
1302:     z[4]  = sum5;
1303:     z[5]  = sum6;
1304:     z[6]  = sum7;
1305:     z[7]  = sum8;
1306:     z[8]  = sum9;
1307:     z[9]  = sum10;
1308:     z[10] = sum11;
1309:     z[11] = sum12;
1310:     if (!usecprow) {
1311:       y += 12;
1312:       z += 12;
1313:     }
1314:   }
1315:   PetscCall(VecRestoreArrayRead(xx, &x));
1316:   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
1317:   PetscCall(PetscLogFlops(288.0 * a->nz - 12.0 * a->nonzerorowcnt));
1318:   PetscFunctionReturn(PETSC_SUCCESS);
1319: }

1321: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
1322: PetscErrorCode MatMult_SeqBAIJ_12_AVX2(Mat A, Vec xx, Vec zz)
1323: {
1324:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1325:   PetscScalar       *z = NULL, *zarray;
1326:   const PetscScalar *x, *work;
1327:   const MatScalar   *v = a->a;
1328:   PetscInt           mbs, i, j, n;
1329:   const PetscInt    *idx = a->j, *ii, *ridx = NULL;
1330:   PetscBool          usecprow = a->compressedrow.use;
1331:   const PetscInt     bs = 12, bs2 = 144;

1333:   __m256d a0, a1, a2, a3, a4, a5;
1334:   __m256d w0, w1, w2, w3;
1335:   __m256d z0, z1, z2;

1337:   PetscFunctionBegin;
1338:   PetscCall(VecGetArrayRead(xx, &x));
1339:   PetscCall(VecGetArrayWrite(zz, &zarray));

1341:   if (usecprow) {
1342:     mbs  = a->compressedrow.nrows;
1343:     ii   = a->compressedrow.i;
1344:     ridx = a->compressedrow.rindex;
1345:     PetscCall(PetscArrayzero(zarray, bs * a->mbs));
1346:   } else {
1347:     mbs = a->mbs;
1348:     ii  = a->i;
1349:     z   = zarray;
1350:   }

1352:   for (i = 0; i < mbs; i++) {
1353:     z0 = _mm256_setzero_pd();
1354:     z1 = _mm256_setzero_pd();
1355:     z2 = _mm256_setzero_pd();

1357:     n = ii[1] - ii[0];
1358:     ii++;
1359:     for (j = 0; j < n; j++) {
1360:       work = x + bs * (*idx++);

1362:       /* first column of a */
1363:       w0 = _mm256_set1_pd(work[0]);
1364:       a0 = _mm256_loadu_pd(v + 0);
1365:       z0 = _mm256_fmadd_pd(a0, w0, z0);
1366:       a1 = _mm256_loadu_pd(v + 4);
1367:       z1 = _mm256_fmadd_pd(a1, w0, z1);
1368:       a2 = _mm256_loadu_pd(v + 8);
1369:       z2 = _mm256_fmadd_pd(a2, w0, z2);

1371:       /* second column of a */
1372:       w1 = _mm256_set1_pd(work[1]);
1373:       a3 = _mm256_loadu_pd(v + 12);
1374:       z0 = _mm256_fmadd_pd(a3, w1, z0);
1375:       a4 = _mm256_loadu_pd(v + 16);
1376:       z1 = _mm256_fmadd_pd(a4, w1, z1);
1377:       a5 = _mm256_loadu_pd(v + 20);
1378:       z2 = _mm256_fmadd_pd(a5, w1, z2);

1380:       /* third column of a */
1381:       w2 = _mm256_set1_pd(work[2]);
1382:       a0 = _mm256_loadu_pd(v + 24);
1383:       z0 = _mm256_fmadd_pd(a0, w2, z0);
1384:       a1 = _mm256_loadu_pd(v + 28);
1385:       z1 = _mm256_fmadd_pd(a1, w2, z1);
1386:       a2 = _mm256_loadu_pd(v + 32);
1387:       z2 = _mm256_fmadd_pd(a2, w2, z2);

1389:       /* fourth column of a */
1390:       w3 = _mm256_set1_pd(work[3]);
1391:       a3 = _mm256_loadu_pd(v + 36);
1392:       z0 = _mm256_fmadd_pd(a3, w3, z0);
1393:       a4 = _mm256_loadu_pd(v + 40);
1394:       z1 = _mm256_fmadd_pd(a4, w3, z1);
1395:       a5 = _mm256_loadu_pd(v + 44);
1396:       z2 = _mm256_fmadd_pd(a5, w3, z2);

1398:       /* fifth column of a */
1399:       w0 = _mm256_set1_pd(work[4]);
1400:       a0 = _mm256_loadu_pd(v + 48);
1401:       z0 = _mm256_fmadd_pd(a0, w0, z0);
1402:       a1 = _mm256_loadu_pd(v + 52);
1403:       z1 = _mm256_fmadd_pd(a1, w0, z1);
1404:       a2 = _mm256_loadu_pd(v + 56);
1405:       z2 = _mm256_fmadd_pd(a2, w0, z2);

1407:       /* sixth column of a */
1408:       w1 = _mm256_set1_pd(work[5]);
1409:       a3 = _mm256_loadu_pd(v + 60);
1410:       z0 = _mm256_fmadd_pd(a3, w1, z0);
1411:       a4 = _mm256_loadu_pd(v + 64);
1412:       z1 = _mm256_fmadd_pd(a4, w1, z1);
1413:       a5 = _mm256_loadu_pd(v + 68);
1414:       z2 = _mm256_fmadd_pd(a5, w1, z2);

1416:       /* seventh column of a */
1417:       w2 = _mm256_set1_pd(work[6]);
1418:       a0 = _mm256_loadu_pd(v + 72);
1419:       z0 = _mm256_fmadd_pd(a0, w2, z0);
1420:       a1 = _mm256_loadu_pd(v + 76);
1421:       z1 = _mm256_fmadd_pd(a1, w2, z1);
1422:       a2 = _mm256_loadu_pd(v + 80);
1423:       z2 = _mm256_fmadd_pd(a2, w2, z2);

1425:       /* eighth column of a */
1426:       w3 = _mm256_set1_pd(work[7]);
1427:       a3 = _mm256_loadu_pd(v + 84);
1428:       z0 = _mm256_fmadd_pd(a3, w3, z0);
1429:       a4 = _mm256_loadu_pd(v + 88);
1430:       z1 = _mm256_fmadd_pd(a4, w3, z1);
1431:       a5 = _mm256_loadu_pd(v + 92);
1432:       z2 = _mm256_fmadd_pd(a5, w3, z2);

1434:       /* ninth column of a */
1435:       w0 = _mm256_set1_pd(work[8]);
1436:       a0 = _mm256_loadu_pd(v + 96);
1437:       z0 = _mm256_fmadd_pd(a0, w0, z0);
1438:       a1 = _mm256_loadu_pd(v + 100);
1439:       z1 = _mm256_fmadd_pd(a1, w0, z1);
1440:       a2 = _mm256_loadu_pd(v + 104);
1441:       z2 = _mm256_fmadd_pd(a2, w0, z2);

1443:       /* tenth column of a */
1444:       w1 = _mm256_set1_pd(work[9]);
1445:       a3 = _mm256_loadu_pd(v + 108);
1446:       z0 = _mm256_fmadd_pd(a3, w1, z0);
1447:       a4 = _mm256_loadu_pd(v + 112);
1448:       z1 = _mm256_fmadd_pd(a4, w1, z1);
1449:       a5 = _mm256_loadu_pd(v + 116);
1450:       z2 = _mm256_fmadd_pd(a5, w1, z2);

1452:       /* eleventh column of a */
1453:       w2 = _mm256_set1_pd(work[10]);
1454:       a0 = _mm256_loadu_pd(v + 120);
1455:       z0 = _mm256_fmadd_pd(a0, w2, z0);
1456:       a1 = _mm256_loadu_pd(v + 124);
1457:       z1 = _mm256_fmadd_pd(a1, w2, z1);
1458:       a2 = _mm256_loadu_pd(v + 128);
1459:       z2 = _mm256_fmadd_pd(a2, w2, z2);

1461:       /* twelveth column of a */
1462:       w3 = _mm256_set1_pd(work[11]);
1463:       a3 = _mm256_loadu_pd(v + 132);
1464:       z0 = _mm256_fmadd_pd(a3, w3, z0);
1465:       a4 = _mm256_loadu_pd(v + 136);
1466:       z1 = _mm256_fmadd_pd(a4, w3, z1);
1467:       a5 = _mm256_loadu_pd(v + 140);
1468:       z2 = _mm256_fmadd_pd(a5, w3, z2);

1470:       v += bs2;
1471:     }
1472:     if (usecprow) z = zarray + bs * ridx[i];
1473:     _mm256_storeu_pd(&z[0], z0);
1474:     _mm256_storeu_pd(&z[4], z1);
1475:     _mm256_storeu_pd(&z[8], z2);
1476:     if (!usecprow) z += bs;
1477:   }
1478:   PetscCall(VecRestoreArrayRead(xx, &x));
1479:   PetscCall(VecRestoreArrayWrite(zz, &zarray));
1480:   PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt));
1481:   PetscFunctionReturn(PETSC_SUCCESS);
1482: }
1483: #endif

1485: /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
1486: /* Default MatMult for block size 15 */
1487: PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A, Vec xx, Vec zz)
1488: {
1489:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1490:   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
1491:   const PetscScalar *x, *xb;
1492:   PetscScalar       *zarray, xv;
1493:   const MatScalar   *v;
1494:   const PetscInt    *ii, *ij = a->j, *idx;
1495:   PetscInt           mbs, i, j, k, n, *ridx = NULL;
1496:   PetscBool          usecprow = a->compressedrow.use;

1498:   PetscFunctionBegin;
1499:   PetscCall(VecGetArrayRead(xx, &x));
1500:   PetscCall(VecGetArrayWrite(zz, &zarray));

1502:   v = a->a;
1503:   if (usecprow) {
1504:     mbs  = a->compressedrow.nrows;
1505:     ii   = a->compressedrow.i;
1506:     ridx = a->compressedrow.rindex;
1507:     PetscCall(PetscArrayzero(zarray, 15 * a->mbs));
1508:   } else {
1509:     mbs = a->mbs;
1510:     ii  = a->i;
1511:     z   = zarray;
1512:   }

1514:   for (i = 0; i < mbs; i++) {
1515:     n     = ii[i + 1] - ii[i];
1516:     idx   = ij + ii[i];
1517:     sum1  = 0.0;
1518:     sum2  = 0.0;
1519:     sum3  = 0.0;
1520:     sum4  = 0.0;
1521:     sum5  = 0.0;
1522:     sum6  = 0.0;
1523:     sum7  = 0.0;
1524:     sum8  = 0.0;
1525:     sum9  = 0.0;
1526:     sum10 = 0.0;
1527:     sum11 = 0.0;
1528:     sum12 = 0.0;
1529:     sum13 = 0.0;
1530:     sum14 = 0.0;
1531:     sum15 = 0.0;

1533:     for (j = 0; j < n; j++) {
1534:       xb = x + 15 * (idx[j]);

1536:       for (k = 0; k < 15; k++) {
1537:         xv = xb[k];
1538:         sum1 += v[0] * xv;
1539:         sum2 += v[1] * xv;
1540:         sum3 += v[2] * xv;
1541:         sum4 += v[3] * xv;
1542:         sum5 += v[4] * xv;
1543:         sum6 += v[5] * xv;
1544:         sum7 += v[6] * xv;
1545:         sum8 += v[7] * xv;
1546:         sum9 += v[8] * xv;
1547:         sum10 += v[9] * xv;
1548:         sum11 += v[10] * xv;
1549:         sum12 += v[11] * xv;
1550:         sum13 += v[12] * xv;
1551:         sum14 += v[13] * xv;
1552:         sum15 += v[14] * xv;
1553:         v += 15;
1554:       }
1555:     }
1556:     if (usecprow) z = zarray + 15 * ridx[i];
1557:     z[0]  = sum1;
1558:     z[1]  = sum2;
1559:     z[2]  = sum3;
1560:     z[3]  = sum4;
1561:     z[4]  = sum5;
1562:     z[5]  = sum6;
1563:     z[6]  = sum7;
1564:     z[7]  = sum8;
1565:     z[8]  = sum9;
1566:     z[9]  = sum10;
1567:     z[10] = sum11;
1568:     z[11] = sum12;
1569:     z[12] = sum13;
1570:     z[13] = sum14;
1571:     z[14] = sum15;

1573:     if (!usecprow) z += 15;
1574:   }

1576:   PetscCall(VecRestoreArrayRead(xx, &x));
1577:   PetscCall(VecRestoreArrayWrite(zz, &zarray));
1578:   PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt));
1579:   PetscFunctionReturn(PETSC_SUCCESS);
1580: }

1582: /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
1583: PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A, Vec xx, Vec zz)
1584: {
1585:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1586:   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
1587:   const PetscScalar *x, *xb;
1588:   PetscScalar        x1, x2, x3, x4, *zarray;
1589:   const MatScalar   *v;
1590:   const PetscInt    *ii, *ij = a->j, *idx;
1591:   PetscInt           mbs, i, j, n, *ridx = NULL;
1592:   PetscBool          usecprow = a->compressedrow.use;

1594:   PetscFunctionBegin;
1595:   PetscCall(VecGetArrayRead(xx, &x));
1596:   PetscCall(VecGetArrayWrite(zz, &zarray));

1598:   v = a->a;
1599:   if (usecprow) {
1600:     mbs  = a->compressedrow.nrows;
1601:     ii   = a->compressedrow.i;
1602:     ridx = a->compressedrow.rindex;
1603:     PetscCall(PetscArrayzero(zarray, 15 * a->mbs));
1604:   } else {
1605:     mbs = a->mbs;
1606:     ii  = a->i;
1607:     z   = zarray;
1608:   }

1610:   for (i = 0; i < mbs; i++) {
1611:     n     = ii[i + 1] - ii[i];
1612:     idx   = ij + ii[i];
1613:     sum1  = 0.0;
1614:     sum2  = 0.0;
1615:     sum3  = 0.0;
1616:     sum4  = 0.0;
1617:     sum5  = 0.0;
1618:     sum6  = 0.0;
1619:     sum7  = 0.0;
1620:     sum8  = 0.0;
1621:     sum9  = 0.0;
1622:     sum10 = 0.0;
1623:     sum11 = 0.0;
1624:     sum12 = 0.0;
1625:     sum13 = 0.0;
1626:     sum14 = 0.0;
1627:     sum15 = 0.0;

1629:     for (j = 0; j < n; j++) {
1630:       xb = x + 15 * (idx[j]);
1631:       x1 = xb[0];
1632:       x2 = xb[1];
1633:       x3 = xb[2];
1634:       x4 = xb[3];

1636:       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4;
1637:       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4;
1638:       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4;
1639:       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4;
1640:       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4;
1641:       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4;
1642:       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4;
1643:       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4;
1644:       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4;
1645:       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4;
1646:       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4;
1647:       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4;
1648:       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4;
1649:       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4;
1650:       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4;

1652:       v += 60;

1654:       x1 = xb[4];
1655:       x2 = xb[5];
1656:       x3 = xb[6];
1657:       x4 = xb[7];

1659:       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4;
1660:       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4;
1661:       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4;
1662:       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4;
1663:       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4;
1664:       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4;
1665:       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4;
1666:       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4;
1667:       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4;
1668:       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4;
1669:       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4;
1670:       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4;
1671:       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4;
1672:       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4;
1673:       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4;
1674:       v += 60;

1676:       x1 = xb[8];
1677:       x2 = xb[9];
1678:       x3 = xb[10];
1679:       x4 = xb[11];
1680:       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4;
1681:       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4;
1682:       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4;
1683:       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4;
1684:       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4;
1685:       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4;
1686:       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4;
1687:       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4;
1688:       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4;
1689:       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4;
1690:       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4;
1691:       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4;
1692:       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4;
1693:       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4;
1694:       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4;
1695:       v += 60;

1697:       x1 = xb[12];
1698:       x2 = xb[13];
1699:       x3 = xb[14];
1700:       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3;
1701:       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3;
1702:       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3;
1703:       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3;
1704:       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3;
1705:       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3;
1706:       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3;
1707:       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3;
1708:       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3;
1709:       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3;
1710:       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3;
1711:       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3;
1712:       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3;
1713:       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3;
1714:       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3;
1715:       v += 45;
1716:     }
1717:     if (usecprow) z = zarray + 15 * ridx[i];
1718:     z[0]  = sum1;
1719:     z[1]  = sum2;
1720:     z[2]  = sum3;
1721:     z[3]  = sum4;
1722:     z[4]  = sum5;
1723:     z[5]  = sum6;
1724:     z[6]  = sum7;
1725:     z[7]  = sum8;
1726:     z[8]  = sum9;
1727:     z[9]  = sum10;
1728:     z[10] = sum11;
1729:     z[11] = sum12;
1730:     z[12] = sum13;
1731:     z[13] = sum14;
1732:     z[14] = sum15;

1734:     if (!usecprow) z += 15;
1735:   }

1737:   PetscCall(VecRestoreArrayRead(xx, &x));
1738:   PetscCall(VecRestoreArrayWrite(zz, &zarray));
1739:   PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt));
1740:   PetscFunctionReturn(PETSC_SUCCESS);
1741: }

1743: /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
1744: PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A, Vec xx, Vec zz)
1745: {
1746:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1747:   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
1748:   const PetscScalar *x, *xb;
1749:   PetscScalar        x1, x2, x3, x4, x5, x6, x7, x8, *zarray;
1750:   const MatScalar   *v;
1751:   const PetscInt    *ii, *ij = a->j, *idx;
1752:   PetscInt           mbs, i, j, n, *ridx = NULL;
1753:   PetscBool          usecprow = a->compressedrow.use;

1755:   PetscFunctionBegin;
1756:   PetscCall(VecGetArrayRead(xx, &x));
1757:   PetscCall(VecGetArrayWrite(zz, &zarray));

1759:   v = a->a;
1760:   if (usecprow) {
1761:     mbs  = a->compressedrow.nrows;
1762:     ii   = a->compressedrow.i;
1763:     ridx = a->compressedrow.rindex;
1764:     PetscCall(PetscArrayzero(zarray, 15 * a->mbs));
1765:   } else {
1766:     mbs = a->mbs;
1767:     ii  = a->i;
1768:     z   = zarray;
1769:   }

1771:   for (i = 0; i < mbs; i++) {
1772:     n     = ii[i + 1] - ii[i];
1773:     idx   = ij + ii[i];
1774:     sum1  = 0.0;
1775:     sum2  = 0.0;
1776:     sum3  = 0.0;
1777:     sum4  = 0.0;
1778:     sum5  = 0.0;
1779:     sum6  = 0.0;
1780:     sum7  = 0.0;
1781:     sum8  = 0.0;
1782:     sum9  = 0.0;
1783:     sum10 = 0.0;
1784:     sum11 = 0.0;
1785:     sum12 = 0.0;
1786:     sum13 = 0.0;
1787:     sum14 = 0.0;
1788:     sum15 = 0.0;

1790:     for (j = 0; j < n; j++) {
1791:       xb = x + 15 * (idx[j]);
1792:       x1 = xb[0];
1793:       x2 = xb[1];
1794:       x3 = xb[2];
1795:       x4 = xb[3];
1796:       x5 = xb[4];
1797:       x6 = xb[5];
1798:       x7 = xb[6];
1799:       x8 = xb[7];

1801:       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7 + v[105] * x8;
1802:       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7 + v[106] * x8;
1803:       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7 + v[107] * x8;
1804:       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7 + v[108] * x8;
1805:       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7 + v[109] * x8;
1806:       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7 + v[110] * x8;
1807:       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7 + v[111] * x8;
1808:       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7 + v[112] * x8;
1809:       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7 + v[113] * x8;
1810:       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7 + v[114] * x8;
1811:       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7 + v[115] * x8;
1812:       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7 + v[116] * x8;
1813:       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7 + v[117] * x8;
1814:       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7 + v[118] * x8;
1815:       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7 + v[119] * x8;
1816:       v += 120;

1818:       x1 = xb[8];
1819:       x2 = xb[9];
1820:       x3 = xb[10];
1821:       x4 = xb[11];
1822:       x5 = xb[12];
1823:       x6 = xb[13];
1824:       x7 = xb[14];

1826:       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7;
1827:       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7;
1828:       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7;
1829:       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7;
1830:       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7;
1831:       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7;
1832:       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7;
1833:       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7;
1834:       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7;
1835:       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7;
1836:       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7;
1837:       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7;
1838:       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7;
1839:       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7;
1840:       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7;
1841:       v += 105;
1842:     }
1843:     if (usecprow) z = zarray + 15 * ridx[i];
1844:     z[0]  = sum1;
1845:     z[1]  = sum2;
1846:     z[2]  = sum3;
1847:     z[3]  = sum4;
1848:     z[4]  = sum5;
1849:     z[5]  = sum6;
1850:     z[6]  = sum7;
1851:     z[7]  = sum8;
1852:     z[8]  = sum9;
1853:     z[9]  = sum10;
1854:     z[10] = sum11;
1855:     z[11] = sum12;
1856:     z[12] = sum13;
1857:     z[13] = sum14;
1858:     z[14] = sum15;

1860:     if (!usecprow) z += 15;
1861:   }

1863:   PetscCall(VecRestoreArrayRead(xx, &x));
1864:   PetscCall(VecRestoreArrayWrite(zz, &zarray));
1865:   PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt));
1866:   PetscFunctionReturn(PETSC_SUCCESS);
1867: }

1869: /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */
1870: PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A, Vec xx, Vec zz)
1871: {
1872:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1873:   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11, sum12, sum13, sum14, sum15;
1874:   const PetscScalar *x, *xb;
1875:   PetscScalar        x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, *zarray;
1876:   const MatScalar   *v;
1877:   const PetscInt    *ii, *ij = a->j, *idx;
1878:   PetscInt           mbs, i, j, n, *ridx = NULL;
1879:   PetscBool          usecprow = a->compressedrow.use;

1881:   PetscFunctionBegin;
1882:   PetscCall(VecGetArrayRead(xx, &x));
1883:   PetscCall(VecGetArrayWrite(zz, &zarray));

1885:   v = a->a;
1886:   if (usecprow) {
1887:     mbs  = a->compressedrow.nrows;
1888:     ii   = a->compressedrow.i;
1889:     ridx = a->compressedrow.rindex;
1890:     PetscCall(PetscArrayzero(zarray, 15 * a->mbs));
1891:   } else {
1892:     mbs = a->mbs;
1893:     ii  = a->i;
1894:     z   = zarray;
1895:   }

1897:   for (i = 0; i < mbs; i++) {
1898:     n     = ii[i + 1] - ii[i];
1899:     idx   = ij + ii[i];
1900:     sum1  = 0.0;
1901:     sum2  = 0.0;
1902:     sum3  = 0.0;
1903:     sum4  = 0.0;
1904:     sum5  = 0.0;
1905:     sum6  = 0.0;
1906:     sum7  = 0.0;
1907:     sum8  = 0.0;
1908:     sum9  = 0.0;
1909:     sum10 = 0.0;
1910:     sum11 = 0.0;
1911:     sum12 = 0.0;
1912:     sum13 = 0.0;
1913:     sum14 = 0.0;
1914:     sum15 = 0.0;

1916:     for (j = 0; j < n; j++) {
1917:       xb  = x + 15 * (idx[j]);
1918:       x1  = xb[0];
1919:       x2  = xb[1];
1920:       x3  = xb[2];
1921:       x4  = xb[3];
1922:       x5  = xb[4];
1923:       x6  = xb[5];
1924:       x7  = xb[6];
1925:       x8  = xb[7];
1926:       x9  = xb[8];
1927:       x10 = xb[9];
1928:       x11 = xb[10];
1929:       x12 = xb[11];
1930:       x13 = xb[12];
1931:       x14 = xb[13];
1932:       x15 = xb[14];

1934:       sum1 += v[0] * x1 + v[15] * x2 + v[30] * x3 + v[45] * x4 + v[60] * x5 + v[75] * x6 + v[90] * x7 + v[105] * x8 + v[120] * x9 + v[135] * x10 + v[150] * x11 + v[165] * x12 + v[180] * x13 + v[195] * x14 + v[210] * x15;
1935:       sum2 += v[1] * x1 + v[16] * x2 + v[31] * x3 + v[46] * x4 + v[61] * x5 + v[76] * x6 + v[91] * x7 + v[106] * x8 + v[121] * x9 + v[136] * x10 + v[151] * x11 + v[166] * x12 + v[181] * x13 + v[196] * x14 + v[211] * x15;
1936:       sum3 += v[2] * x1 + v[17] * x2 + v[32] * x3 + v[47] * x4 + v[62] * x5 + v[77] * x6 + v[92] * x7 + v[107] * x8 + v[122] * x9 + v[137] * x10 + v[152] * x11 + v[167] * x12 + v[182] * x13 + v[197] * x14 + v[212] * x15;
1937:       sum4 += v[3] * x1 + v[18] * x2 + v[33] * x3 + v[48] * x4 + v[63] * x5 + v[78] * x6 + v[93] * x7 + v[108] * x8 + v[123] * x9 + v[138] * x10 + v[153] * x11 + v[168] * x12 + v[183] * x13 + v[198] * x14 + v[213] * x15;
1938:       sum5 += v[4] * x1 + v[19] * x2 + v[34] * x3 + v[49] * x4 + v[64] * x5 + v[79] * x6 + v[94] * x7 + v[109] * x8 + v[124] * x9 + v[139] * x10 + v[154] * x11 + v[169] * x12 + v[184] * x13 + v[199] * x14 + v[214] * x15;
1939:       sum6 += v[5] * x1 + v[20] * x2 + v[35] * x3 + v[50] * x4 + v[65] * x5 + v[80] * x6 + v[95] * x7 + v[110] * x8 + v[125] * x9 + v[140] * x10 + v[155] * x11 + v[170] * x12 + v[185] * x13 + v[200] * x14 + v[215] * x15;
1940:       sum7 += v[6] * x1 + v[21] * x2 + v[36] * x3 + v[51] * x4 + v[66] * x5 + v[81] * x6 + v[96] * x7 + v[111] * x8 + v[126] * x9 + v[141] * x10 + v[156] * x11 + v[171] * x12 + v[186] * x13 + v[201] * x14 + v[216] * x15;
1941:       sum8 += v[7] * x1 + v[22] * x2 + v[37] * x3 + v[52] * x4 + v[67] * x5 + v[82] * x6 + v[97] * x7 + v[112] * x8 + v[127] * x9 + v[142] * x10 + v[157] * x11 + v[172] * x12 + v[187] * x13 + v[202] * x14 + v[217] * x15;
1942:       sum9 += v[8] * x1 + v[23] * x2 + v[38] * x3 + v[53] * x4 + v[68] * x5 + v[83] * x6 + v[98] * x7 + v[113] * x8 + v[128] * x9 + v[143] * x10 + v[158] * x11 + v[173] * x12 + v[188] * x13 + v[203] * x14 + v[218] * x15;
1943:       sum10 += v[9] * x1 + v[24] * x2 + v[39] * x3 + v[54] * x4 + v[69] * x5 + v[84] * x6 + v[99] * x7 + v[114] * x8 + v[129] * x9 + v[144] * x10 + v[159] * x11 + v[174] * x12 + v[189] * x13 + v[204] * x14 + v[219] * x15;
1944:       sum11 += v[10] * x1 + v[25] * x2 + v[40] * x3 + v[55] * x4 + v[70] * x5 + v[85] * x6 + v[100] * x7 + v[115] * x8 + v[130] * x9 + v[145] * x10 + v[160] * x11 + v[175] * x12 + v[190] * x13 + v[205] * x14 + v[220] * x15;
1945:       sum12 += v[11] * x1 + v[26] * x2 + v[41] * x3 + v[56] * x4 + v[71] * x5 + v[86] * x6 + v[101] * x7 + v[116] * x8 + v[131] * x9 + v[146] * x10 + v[161] * x11 + v[176] * x12 + v[191] * x13 + v[206] * x14 + v[221] * x15;
1946:       sum13 += v[12] * x1 + v[27] * x2 + v[42] * x3 + v[57] * x4 + v[72] * x5 + v[87] * x6 + v[102] * x7 + v[117] * x8 + v[132] * x9 + v[147] * x10 + v[162] * x11 + v[177] * x12 + v[192] * x13 + v[207] * x14 + v[222] * x15;
1947:       sum14 += v[13] * x1 + v[28] * x2 + v[43] * x3 + v[58] * x4 + v[73] * x5 + v[88] * x6 + v[103] * x7 + v[118] * x8 + v[133] * x9 + v[148] * x10 + v[163] * x11 + v[178] * x12 + v[193] * x13 + v[208] * x14 + v[223] * x15;
1948:       sum15 += v[14] * x1 + v[29] * x2 + v[44] * x3 + v[59] * x4 + v[74] * x5 + v[89] * x6 + v[104] * x7 + v[119] * x8 + v[134] * x9 + v[149] * x10 + v[164] * x11 + v[179] * x12 + v[194] * x13 + v[209] * x14 + v[224] * x15;
1949:       v += 225;
1950:     }
1951:     if (usecprow) z = zarray + 15 * ridx[i];
1952:     z[0]  = sum1;
1953:     z[1]  = sum2;
1954:     z[2]  = sum3;
1955:     z[3]  = sum4;
1956:     z[4]  = sum5;
1957:     z[5]  = sum6;
1958:     z[6]  = sum7;
1959:     z[7]  = sum8;
1960:     z[8]  = sum9;
1961:     z[9]  = sum10;
1962:     z[10] = sum11;
1963:     z[11] = sum12;
1964:     z[12] = sum13;
1965:     z[13] = sum14;
1966:     z[14] = sum15;

1968:     if (!usecprow) z += 15;
1969:   }

1971:   PetscCall(VecRestoreArrayRead(xx, &x));
1972:   PetscCall(VecRestoreArrayWrite(zz, &zarray));
1973:   PetscCall(PetscLogFlops(450.0 * a->nz - 15.0 * a->nonzerorowcnt));
1974:   PetscFunctionReturn(PETSC_SUCCESS);
1975: }

1977: /*
1978:     This will not work with MatScalar == float because it calls the BLAS
1979: */
1980: PetscErrorCode MatMult_SeqBAIJ_N(Mat A, Vec xx, Vec zz)
1981: {
1982:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1983:   PetscScalar       *z = NULL, *work, *workt, *zarray;
1984:   const PetscScalar *x, *xb;
1985:   const MatScalar   *v;
1986:   PetscInt           mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
1987:   const PetscInt    *idx, *ii, *ridx = NULL;
1988:   PetscInt           ncols, k;
1989:   PetscBool          usecprow = a->compressedrow.use;

1991:   PetscFunctionBegin;
1992:   PetscCall(VecGetArrayRead(xx, &x));
1993:   PetscCall(VecGetArrayWrite(zz, &zarray));

1995:   idx = a->j;
1996:   v   = a->a;
1997:   if (usecprow) {
1998:     mbs  = a->compressedrow.nrows;
1999:     ii   = a->compressedrow.i;
2000:     ridx = a->compressedrow.rindex;
2001:     PetscCall(PetscArrayzero(zarray, bs * a->mbs));
2002:   } else {
2003:     mbs = a->mbs;
2004:     ii  = a->i;
2005:     z   = zarray;
2006:   }

2008:   if (!a->mult_work) {
2009:     k = PetscMax(A->rmap->n, A->cmap->n);
2010:     PetscCall(PetscMalloc1(k + 1, &a->mult_work));
2011:   }
2012:   work = a->mult_work;
2013:   for (i = 0; i < mbs; i++) {
2014:     n = ii[1] - ii[0];
2015:     ii++;
2016:     ncols = n * bs;
2017:     workt = work;
2018:     for (j = 0; j < n; j++) {
2019:       xb = x + bs * (*idx++);
2020:       for (k = 0; k < bs; k++) workt[k] = xb[k];
2021:       workt += bs;
2022:     }
2023:     if (usecprow) z = zarray + bs * ridx[i];
2024:     PetscKernel_w_gets_Ar_times_v(bs, ncols, work, v, z);
2025:     v += n * bs2;
2026:     if (!usecprow) z += bs;
2027:   }
2028:   PetscCall(VecRestoreArrayRead(xx, &x));
2029:   PetscCall(VecRestoreArrayWrite(zz, &zarray));
2030:   PetscCall(PetscLogFlops(2.0 * a->nz * bs2 - bs * a->nonzerorowcnt));
2031:   PetscFunctionReturn(PETSC_SUCCESS);
2032: }

2034: PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz)
2035: {
2036:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2037:   const PetscScalar *x;
2038:   PetscScalar       *y, *z, sum;
2039:   const MatScalar   *v;
2040:   PetscInt           mbs = a->mbs, i, n, *ridx = NULL;
2041:   const PetscInt    *idx, *ii;
2042:   PetscBool          usecprow = a->compressedrow.use;

2044:   PetscFunctionBegin;
2045:   PetscCall(VecGetArrayRead(xx, &x));
2046:   PetscCall(VecGetArrayPair(yy, zz, &y, &z));

2048:   idx = a->j;
2049:   v   = a->a;
2050:   if (usecprow) {
2051:     if (zz != yy) PetscCall(PetscArraycpy(z, y, mbs));
2052:     mbs  = a->compressedrow.nrows;
2053:     ii   = a->compressedrow.i;
2054:     ridx = a->compressedrow.rindex;
2055:   } else {
2056:     ii = a->i;
2057:   }

2059:   for (i = 0; i < mbs; i++) {
2060:     n = ii[1] - ii[0];
2061:     ii++;
2062:     if (!usecprow) {
2063:       sum = y[i];
2064:     } else {
2065:       sum = y[ridx[i]];
2066:     }
2067:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2068:     PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
2069:     PetscSparseDensePlusDot(sum, x, v, idx, n);
2070:     v += n;
2071:     idx += n;
2072:     if (usecprow) {
2073:       z[ridx[i]] = sum;
2074:     } else {
2075:       z[i] = sum;
2076:     }
2077:   }
2078:   PetscCall(VecRestoreArrayRead(xx, &x));
2079:   PetscCall(VecRestoreArrayPair(yy, zz, &y, &z));
2080:   PetscCall(PetscLogFlops(2.0 * a->nz));
2081:   PetscFunctionReturn(PETSC_SUCCESS);
2082: }

2084: PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz)
2085: {
2086:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2087:   PetscScalar       *y = NULL, *z = NULL, sum1, sum2;
2088:   const PetscScalar *x, *xb;
2089:   PetscScalar        x1, x2, *yarray, *zarray;
2090:   const MatScalar   *v;
2091:   PetscInt           mbs = a->mbs, i, n, j;
2092:   const PetscInt    *idx, *ii, *ridx = NULL;
2093:   PetscBool          usecprow = a->compressedrow.use;

2095:   PetscFunctionBegin;
2096:   PetscCall(VecGetArrayRead(xx, &x));
2097:   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));

2099:   idx = a->j;
2100:   v   = a->a;
2101:   if (usecprow) {
2102:     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 2 * mbs));
2103:     mbs  = a->compressedrow.nrows;
2104:     ii   = a->compressedrow.i;
2105:     ridx = a->compressedrow.rindex;
2106:   } else {
2107:     ii = a->i;
2108:     y  = yarray;
2109:     z  = zarray;
2110:   }

2112:   for (i = 0; i < mbs; i++) {
2113:     n = ii[1] - ii[0];
2114:     ii++;
2115:     if (usecprow) {
2116:       z = zarray + 2 * ridx[i];
2117:       y = yarray + 2 * ridx[i];
2118:     }
2119:     sum1 = y[0];
2120:     sum2 = y[1];
2121:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
2122:     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2123:     for (j = 0; j < n; j++) {
2124:       xb = x + 2 * (*idx++);
2125:       x1 = xb[0];
2126:       x2 = xb[1];

2128:       sum1 += v[0] * x1 + v[2] * x2;
2129:       sum2 += v[1] * x1 + v[3] * x2;
2130:       v += 4;
2131:     }
2132:     z[0] = sum1;
2133:     z[1] = sum2;
2134:     if (!usecprow) {
2135:       z += 2;
2136:       y += 2;
2137:     }
2138:   }
2139:   PetscCall(VecRestoreArrayRead(xx, &x));
2140:   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2141:   PetscCall(PetscLogFlops(4.0 * a->nz));
2142:   PetscFunctionReturn(PETSC_SUCCESS);
2143: }

2145: PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz)
2146: {
2147:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2148:   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, x1, x2, x3, *yarray, *zarray;
2149:   const PetscScalar *x, *xb;
2150:   const MatScalar   *v;
2151:   PetscInt           mbs = a->mbs, i, j, n;
2152:   const PetscInt    *idx, *ii, *ridx = NULL;
2153:   PetscBool          usecprow = a->compressedrow.use;

2155:   PetscFunctionBegin;
2156:   PetscCall(VecGetArrayRead(xx, &x));
2157:   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));

2159:   idx = a->j;
2160:   v   = a->a;
2161:   if (usecprow) {
2162:     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 3 * mbs));
2163:     mbs  = a->compressedrow.nrows;
2164:     ii   = a->compressedrow.i;
2165:     ridx = a->compressedrow.rindex;
2166:   } else {
2167:     ii = a->i;
2168:     y  = yarray;
2169:     z  = zarray;
2170:   }

2172:   for (i = 0; i < mbs; i++) {
2173:     n = ii[1] - ii[0];
2174:     ii++;
2175:     if (usecprow) {
2176:       z = zarray + 3 * ridx[i];
2177:       y = yarray + 3 * ridx[i];
2178:     }
2179:     sum1 = y[0];
2180:     sum2 = y[1];
2181:     sum3 = y[2];
2182:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
2183:     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2184:     for (j = 0; j < n; j++) {
2185:       xb = x + 3 * (*idx++);
2186:       x1 = xb[0];
2187:       x2 = xb[1];
2188:       x3 = xb[2];
2189:       sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3;
2190:       sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3;
2191:       sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3;
2192:       v += 9;
2193:     }
2194:     z[0] = sum1;
2195:     z[1] = sum2;
2196:     z[2] = sum3;
2197:     if (!usecprow) {
2198:       z += 3;
2199:       y += 3;
2200:     }
2201:   }
2202:   PetscCall(VecRestoreArrayRead(xx, &x));
2203:   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2204:   PetscCall(PetscLogFlops(18.0 * a->nz));
2205:   PetscFunctionReturn(PETSC_SUCCESS);
2206: }

2208: PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz)
2209: {
2210:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2211:   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, x1, x2, x3, x4, *yarray, *zarray;
2212:   const PetscScalar *x, *xb;
2213:   const MatScalar   *v;
2214:   PetscInt           mbs = a->mbs, i, j, n;
2215:   const PetscInt    *idx, *ii, *ridx = NULL;
2216:   PetscBool          usecprow = a->compressedrow.use;

2218:   PetscFunctionBegin;
2219:   PetscCall(VecGetArrayRead(xx, &x));
2220:   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));

2222:   idx = a->j;
2223:   v   = a->a;
2224:   if (usecprow) {
2225:     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 4 * mbs));
2226:     mbs  = a->compressedrow.nrows;
2227:     ii   = a->compressedrow.i;
2228:     ridx = a->compressedrow.rindex;
2229:   } else {
2230:     ii = a->i;
2231:     y  = yarray;
2232:     z  = zarray;
2233:   }

2235:   for (i = 0; i < mbs; i++) {
2236:     n = ii[1] - ii[0];
2237:     ii++;
2238:     if (usecprow) {
2239:       z = zarray + 4 * ridx[i];
2240:       y = yarray + 4 * ridx[i];
2241:     }
2242:     sum1 = y[0];
2243:     sum2 = y[1];
2244:     sum3 = y[2];
2245:     sum4 = y[3];
2246:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
2247:     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2248:     for (j = 0; j < n; j++) {
2249:       xb = x + 4 * (*idx++);
2250:       x1 = xb[0];
2251:       x2 = xb[1];
2252:       x3 = xb[2];
2253:       x4 = xb[3];
2254:       sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
2255:       sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
2256:       sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
2257:       sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
2258:       v += 16;
2259:     }
2260:     z[0] = sum1;
2261:     z[1] = sum2;
2262:     z[2] = sum3;
2263:     z[3] = sum4;
2264:     if (!usecprow) {
2265:       z += 4;
2266:       y += 4;
2267:     }
2268:   }
2269:   PetscCall(VecRestoreArrayRead(xx, &x));
2270:   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2271:   PetscCall(PetscLogFlops(32.0 * a->nz));
2272:   PetscFunctionReturn(PETSC_SUCCESS);
2273: }

2275: PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz)
2276: {
2277:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2278:   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, x1, x2, x3, x4, x5;
2279:   const PetscScalar *x, *xb;
2280:   PetscScalar       *yarray, *zarray;
2281:   const MatScalar   *v;
2282:   PetscInt           mbs = a->mbs, i, j, n;
2283:   const PetscInt    *idx, *ii, *ridx = NULL;
2284:   PetscBool          usecprow = a->compressedrow.use;

2286:   PetscFunctionBegin;
2287:   PetscCall(VecGetArrayRead(xx, &x));
2288:   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));

2290:   idx = a->j;
2291:   v   = a->a;
2292:   if (usecprow) {
2293:     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 5 * mbs));
2294:     mbs  = a->compressedrow.nrows;
2295:     ii   = a->compressedrow.i;
2296:     ridx = a->compressedrow.rindex;
2297:   } else {
2298:     ii = a->i;
2299:     y  = yarray;
2300:     z  = zarray;
2301:   }

2303:   for (i = 0; i < mbs; i++) {
2304:     n = ii[1] - ii[0];
2305:     ii++;
2306:     if (usecprow) {
2307:       z = zarray + 5 * ridx[i];
2308:       y = yarray + 5 * ridx[i];
2309:     }
2310:     sum1 = y[0];
2311:     sum2 = y[1];
2312:     sum3 = y[2];
2313:     sum4 = y[3];
2314:     sum5 = y[4];
2315:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
2316:     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2317:     for (j = 0; j < n; j++) {
2318:       xb = x + 5 * (*idx++);
2319:       x1 = xb[0];
2320:       x2 = xb[1];
2321:       x3 = xb[2];
2322:       x4 = xb[3];
2323:       x5 = xb[4];
2324:       sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
2325:       sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
2326:       sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
2327:       sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
2328:       sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
2329:       v += 25;
2330:     }
2331:     z[0] = sum1;
2332:     z[1] = sum2;
2333:     z[2] = sum3;
2334:     z[3] = sum4;
2335:     z[4] = sum5;
2336:     if (!usecprow) {
2337:       z += 5;
2338:       y += 5;
2339:     }
2340:   }
2341:   PetscCall(VecRestoreArrayRead(xx, &x));
2342:   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2343:   PetscCall(PetscLogFlops(50.0 * a->nz));
2344:   PetscFunctionReturn(PETSC_SUCCESS);
2345: }

2347: PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz)
2348: {
2349:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2350:   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6;
2351:   const PetscScalar *x, *xb;
2352:   PetscScalar        x1, x2, x3, x4, x5, x6, *yarray, *zarray;
2353:   const MatScalar   *v;
2354:   PetscInt           mbs = a->mbs, i, j, n;
2355:   const PetscInt    *idx, *ii, *ridx = NULL;
2356:   PetscBool          usecprow = a->compressedrow.use;

2358:   PetscFunctionBegin;
2359:   PetscCall(VecGetArrayRead(xx, &x));
2360:   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));

2362:   idx = a->j;
2363:   v   = a->a;
2364:   if (usecprow) {
2365:     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 6 * mbs));
2366:     mbs  = a->compressedrow.nrows;
2367:     ii   = a->compressedrow.i;
2368:     ridx = a->compressedrow.rindex;
2369:   } else {
2370:     ii = a->i;
2371:     y  = yarray;
2372:     z  = zarray;
2373:   }

2375:   for (i = 0; i < mbs; i++) {
2376:     n = ii[1] - ii[0];
2377:     ii++;
2378:     if (usecprow) {
2379:       z = zarray + 6 * ridx[i];
2380:       y = yarray + 6 * ridx[i];
2381:     }
2382:     sum1 = y[0];
2383:     sum2 = y[1];
2384:     sum3 = y[2];
2385:     sum4 = y[3];
2386:     sum5 = y[4];
2387:     sum6 = y[5];
2388:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
2389:     PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2390:     for (j = 0; j < n; j++) {
2391:       xb = x + 6 * (*idx++);
2392:       x1 = xb[0];
2393:       x2 = xb[1];
2394:       x3 = xb[2];
2395:       x4 = xb[3];
2396:       x5 = xb[4];
2397:       x6 = xb[5];
2398:       sum1 += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
2399:       sum2 += v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
2400:       sum3 += v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
2401:       sum4 += v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
2402:       sum5 += v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
2403:       sum6 += v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
2404:       v += 36;
2405:     }
2406:     z[0] = sum1;
2407:     z[1] = sum2;
2408:     z[2] = sum3;
2409:     z[3] = sum4;
2410:     z[4] = sum5;
2411:     z[5] = sum6;
2412:     if (!usecprow) {
2413:       z += 6;
2414:       y += 6;
2415:     }
2416:   }
2417:   PetscCall(VecRestoreArrayRead(xx, &x));
2418:   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2419:   PetscCall(PetscLogFlops(72.0 * a->nz));
2420:   PetscFunctionReturn(PETSC_SUCCESS);
2421: }

2423: PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz)
2424: {
2425:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2426:   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
2427:   const PetscScalar *x, *xb;
2428:   PetscScalar        x1, x2, x3, x4, x5, x6, x7, *yarray, *zarray;
2429:   const MatScalar   *v;
2430:   PetscInt           mbs = a->mbs, i, j, n;
2431:   const PetscInt    *idx, *ii, *ridx = NULL;
2432:   PetscBool          usecprow = a->compressedrow.use;

2434:   PetscFunctionBegin;
2435:   PetscCall(VecGetArrayRead(xx, &x));
2436:   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));

2438:   idx = a->j;
2439:   v   = a->a;
2440:   if (usecprow) {
2441:     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 7 * mbs));
2442:     mbs  = a->compressedrow.nrows;
2443:     ii   = a->compressedrow.i;
2444:     ridx = a->compressedrow.rindex;
2445:   } else {
2446:     ii = a->i;
2447:     y  = yarray;
2448:     z  = zarray;
2449:   }

2451:   for (i = 0; i < mbs; i++) {
2452:     n = ii[1] - ii[0];
2453:     ii++;
2454:     if (usecprow) {
2455:       z = zarray + 7 * ridx[i];
2456:       y = yarray + 7 * ridx[i];
2457:     }
2458:     sum1 = y[0];
2459:     sum2 = y[1];
2460:     sum3 = y[2];
2461:     sum4 = y[3];
2462:     sum5 = y[4];
2463:     sum6 = y[5];
2464:     sum7 = y[6];
2465:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
2466:     PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2467:     for (j = 0; j < n; j++) {
2468:       xb = x + 7 * (*idx++);
2469:       x1 = xb[0];
2470:       x2 = xb[1];
2471:       x3 = xb[2];
2472:       x4 = xb[3];
2473:       x5 = xb[4];
2474:       x6 = xb[5];
2475:       x7 = xb[6];
2476:       sum1 += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
2477:       sum2 += v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
2478:       sum3 += v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
2479:       sum4 += v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
2480:       sum5 += v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
2481:       sum6 += v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
2482:       sum7 += v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
2483:       v += 49;
2484:     }
2485:     z[0] = sum1;
2486:     z[1] = sum2;
2487:     z[2] = sum3;
2488:     z[3] = sum4;
2489:     z[4] = sum5;
2490:     z[5] = sum6;
2491:     z[6] = sum7;
2492:     if (!usecprow) {
2493:       z += 7;
2494:       y += 7;
2495:     }
2496:   }
2497:   PetscCall(VecRestoreArrayRead(xx, &x));
2498:   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2499:   PetscCall(PetscLogFlops(98.0 * a->nz));
2500:   PetscFunctionReturn(PETSC_SUCCESS);
2501: }

2503: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
2504: PetscErrorCode MatMultAdd_SeqBAIJ_9_AVX2(Mat A, Vec xx, Vec yy, Vec zz)
2505: {
2506:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2507:   PetscScalar       *z = NULL, *work, *workt, *zarray;
2508:   const PetscScalar *x, *xb;
2509:   const MatScalar   *v;
2510:   PetscInt           mbs, i, j, n;
2511:   PetscInt           k;
2512:   PetscBool          usecprow = a->compressedrow.use;
2513:   const PetscInt    *idx, *ii, *ridx = NULL, bs = 9, bs2 = 81;

2515:   __m256d a0, a1, a2, a3, a4, a5;
2516:   __m256d w0, w1, w2, w3;
2517:   __m256d z0, z1, z2;
2518:   __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL << 63);

2520:   PetscFunctionBegin;
2521:   PetscCall(VecCopy(yy, zz));
2522:   PetscCall(VecGetArrayRead(xx, &x));
2523:   PetscCall(VecGetArray(zz, &zarray));

2525:   idx = a->j;
2526:   v   = a->a;
2527:   if (usecprow) {
2528:     mbs  = a->compressedrow.nrows;
2529:     ii   = a->compressedrow.i;
2530:     ridx = a->compressedrow.rindex;
2531:   } else {
2532:     mbs = a->mbs;
2533:     ii  = a->i;
2534:     z   = zarray;
2535:   }

2537:   if (!a->mult_work) {
2538:     k = PetscMax(A->rmap->n, A->cmap->n);
2539:     PetscCall(PetscMalloc1(k + 1, &a->mult_work));
2540:   }

2542:   work = a->mult_work;
2543:   for (i = 0; i < mbs; i++) {
2544:     n = ii[1] - ii[0];
2545:     ii++;
2546:     workt = work;
2547:     for (j = 0; j < n; j++) {
2548:       xb = x + bs * (*idx++);
2549:       for (k = 0; k < bs; k++) workt[k] = xb[k];
2550:       workt += bs;
2551:     }
2552:     if (usecprow) z = zarray + bs * ridx[i];

2554:     z0 = _mm256_loadu_pd(&z[0]);
2555:     z1 = _mm256_loadu_pd(&z[4]);
2556:     z2 = _mm256_set1_pd(z[8]);

2558:     for (j = 0; j < n; j++) {
2559:       /* first column of a */
2560:       w0 = _mm256_set1_pd(work[j * 9]);
2561:       a0 = _mm256_loadu_pd(&v[j * 81]);
2562:       z0 = _mm256_fmadd_pd(a0, w0, z0);
2563:       a1 = _mm256_loadu_pd(&v[j * 81 + 4]);
2564:       z1 = _mm256_fmadd_pd(a1, w0, z1);
2565:       a2 = _mm256_loadu_pd(&v[j * 81 + 8]);
2566:       z2 = _mm256_fmadd_pd(a2, w0, z2);

2568:       /* second column of a */
2569:       w1 = _mm256_set1_pd(work[j * 9 + 1]);
2570:       a0 = _mm256_loadu_pd(&v[j * 81 + 9]);
2571:       z0 = _mm256_fmadd_pd(a0, w1, z0);
2572:       a1 = _mm256_loadu_pd(&v[j * 81 + 13]);
2573:       z1 = _mm256_fmadd_pd(a1, w1, z1);
2574:       a2 = _mm256_loadu_pd(&v[j * 81 + 17]);
2575:       z2 = _mm256_fmadd_pd(a2, w1, z2);

2577:       /* third column of a */
2578:       w2 = _mm256_set1_pd(work[j * 9 + 2]);
2579:       a3 = _mm256_loadu_pd(&v[j * 81 + 18]);
2580:       z0 = _mm256_fmadd_pd(a3, w2, z0);
2581:       a4 = _mm256_loadu_pd(&v[j * 81 + 22]);
2582:       z1 = _mm256_fmadd_pd(a4, w2, z1);
2583:       a5 = _mm256_loadu_pd(&v[j * 81 + 26]);
2584:       z2 = _mm256_fmadd_pd(a5, w2, z2);

2586:       /* fourth column of a */
2587:       w3 = _mm256_set1_pd(work[j * 9 + 3]);
2588:       a0 = _mm256_loadu_pd(&v[j * 81 + 27]);
2589:       z0 = _mm256_fmadd_pd(a0, w3, z0);
2590:       a1 = _mm256_loadu_pd(&v[j * 81 + 31]);
2591:       z1 = _mm256_fmadd_pd(a1, w3, z1);
2592:       a2 = _mm256_loadu_pd(&v[j * 81 + 35]);
2593:       z2 = _mm256_fmadd_pd(a2, w3, z2);

2595:       /* fifth column of a */
2596:       w0 = _mm256_set1_pd(work[j * 9 + 4]);
2597:       a3 = _mm256_loadu_pd(&v[j * 81 + 36]);
2598:       z0 = _mm256_fmadd_pd(a3, w0, z0);
2599:       a4 = _mm256_loadu_pd(&v[j * 81 + 40]);
2600:       z1 = _mm256_fmadd_pd(a4, w0, z1);
2601:       a5 = _mm256_loadu_pd(&v[j * 81 + 44]);
2602:       z2 = _mm256_fmadd_pd(a5, w0, z2);

2604:       /* sixth column of a */
2605:       w1 = _mm256_set1_pd(work[j * 9 + 5]);
2606:       a0 = _mm256_loadu_pd(&v[j * 81 + 45]);
2607:       z0 = _mm256_fmadd_pd(a0, w1, z0);
2608:       a1 = _mm256_loadu_pd(&v[j * 81 + 49]);
2609:       z1 = _mm256_fmadd_pd(a1, w1, z1);
2610:       a2 = _mm256_loadu_pd(&v[j * 81 + 53]);
2611:       z2 = _mm256_fmadd_pd(a2, w1, z2);

2613:       /* seventh column of a */
2614:       w2 = _mm256_set1_pd(work[j * 9 + 6]);
2615:       a0 = _mm256_loadu_pd(&v[j * 81 + 54]);
2616:       z0 = _mm256_fmadd_pd(a0, w2, z0);
2617:       a1 = _mm256_loadu_pd(&v[j * 81 + 58]);
2618:       z1 = _mm256_fmadd_pd(a1, w2, z1);
2619:       a2 = _mm256_loadu_pd(&v[j * 81 + 62]);
2620:       z2 = _mm256_fmadd_pd(a2, w2, z2);

2622:       /* eighth column of a */
2623:       w3 = _mm256_set1_pd(work[j * 9 + 7]);
2624:       a3 = _mm256_loadu_pd(&v[j * 81 + 63]);
2625:       z0 = _mm256_fmadd_pd(a3, w3, z0);
2626:       a4 = _mm256_loadu_pd(&v[j * 81 + 67]);
2627:       z1 = _mm256_fmadd_pd(a4, w3, z1);
2628:       a5 = _mm256_loadu_pd(&v[j * 81 + 71]);
2629:       z2 = _mm256_fmadd_pd(a5, w3, z2);

2631:       /* ninth column of a */
2632:       w0 = _mm256_set1_pd(work[j * 9 + 8]);
2633:       a0 = _mm256_loadu_pd(&v[j * 81 + 72]);
2634:       z0 = _mm256_fmadd_pd(a0, w0, z0);
2635:       a1 = _mm256_loadu_pd(&v[j * 81 + 76]);
2636:       z1 = _mm256_fmadd_pd(a1, w0, z1);
2637:       a2 = _mm256_maskload_pd(&v[j * 81 + 80], mask1);
2638:       z2 = _mm256_fmadd_pd(a2, w0, z2);
2639:     }

2641:     _mm256_storeu_pd(&z[0], z0);
2642:     _mm256_storeu_pd(&z[4], z1);
2643:     _mm256_maskstore_pd(&z[8], mask1, z2);

2645:     v += n * bs2;
2646:     if (!usecprow) z += bs;
2647:   }
2648:   PetscCall(VecRestoreArrayRead(xx, &x));
2649:   PetscCall(VecRestoreArray(zz, &zarray));
2650:   PetscCall(PetscLogFlops(162.0 * a->nz));
2651:   PetscFunctionReturn(PETSC_SUCCESS);
2652: }
2653: #endif

2655: PetscErrorCode MatMultAdd_SeqBAIJ_11(Mat A, Vec xx, Vec yy, Vec zz)
2656: {
2657:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2658:   PetscScalar       *y = NULL, *z = NULL, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8, sum9, sum10, sum11;
2659:   const PetscScalar *x, *xb;
2660:   PetscScalar        x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, *yarray, *zarray;
2661:   const MatScalar   *v;
2662:   PetscInt           mbs = a->mbs, i, j, n;
2663:   const PetscInt    *idx, *ii, *ridx = NULL;
2664:   PetscBool          usecprow = a->compressedrow.use;

2666:   PetscFunctionBegin;
2667:   PetscCall(VecGetArrayRead(xx, &x));
2668:   PetscCall(VecGetArrayPair(yy, zz, &yarray, &zarray));

2670:   idx = a->j;
2671:   v   = a->a;
2672:   if (usecprow) {
2673:     if (zz != yy) PetscCall(PetscArraycpy(zarray, yarray, 7 * mbs));
2674:     mbs  = a->compressedrow.nrows;
2675:     ii   = a->compressedrow.i;
2676:     ridx = a->compressedrow.rindex;
2677:   } else {
2678:     ii = a->i;
2679:     y  = yarray;
2680:     z  = zarray;
2681:   }

2683:   for (i = 0; i < mbs; i++) {
2684:     n = ii[1] - ii[0];
2685:     ii++;
2686:     if (usecprow) {
2687:       z = zarray + 11 * ridx[i];
2688:       y = yarray + 11 * ridx[i];
2689:     }
2690:     sum1  = y[0];
2691:     sum2  = y[1];
2692:     sum3  = y[2];
2693:     sum4  = y[3];
2694:     sum5  = y[4];
2695:     sum6  = y[5];
2696:     sum7  = y[6];
2697:     sum8  = y[7];
2698:     sum9  = y[8];
2699:     sum10 = y[9];
2700:     sum11 = y[10];
2701:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);           /* Indices for the next row (assumes same size as this one) */
2702:     PetscPrefetchBlock(v + 121 * n, 121 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2703:     for (j = 0; j < n; j++) {
2704:       xb  = x + 11 * (*idx++);
2705:       x1  = xb[0];
2706:       x2  = xb[1];
2707:       x3  = xb[2];
2708:       x4  = xb[3];
2709:       x5  = xb[4];
2710:       x6  = xb[5];
2711:       x7  = xb[6];
2712:       x8  = xb[7];
2713:       x9  = xb[8];
2714:       x10 = xb[9];
2715:       x11 = xb[10];
2716:       sum1 += v[0] * x1 + v[11] * x2 + v[2 * 11] * x3 + v[3 * 11] * x4 + v[4 * 11] * x5 + v[5 * 11] * x6 + v[6 * 11] * x7 + v[7 * 11] * x8 + v[8 * 11] * x9 + v[9 * 11] * x10 + v[10 * 11] * x11;
2717:       sum2 += v[1 + 0] * x1 + v[1 + 11] * x2 + v[1 + 2 * 11] * x3 + v[1 + 3 * 11] * x4 + v[1 + 4 * 11] * x5 + v[1 + 5 * 11] * x6 + v[1 + 6 * 11] * x7 + v[1 + 7 * 11] * x8 + v[1 + 8 * 11] * x9 + v[1 + 9 * 11] * x10 + v[1 + 10 * 11] * x11;
2718:       sum3 += v[2 + 0] * x1 + v[2 + 11] * x2 + v[2 + 2 * 11] * x3 + v[2 + 3 * 11] * x4 + v[2 + 4 * 11] * x5 + v[2 + 5 * 11] * x6 + v[2 + 6 * 11] * x7 + v[2 + 7 * 11] * x8 + v[2 + 8 * 11] * x9 + v[2 + 9 * 11] * x10 + v[2 + 10 * 11] * x11;
2719:       sum4 += v[3 + 0] * x1 + v[3 + 11] * x2 + v[3 + 2 * 11] * x3 + v[3 + 3 * 11] * x4 + v[3 + 4 * 11] * x5 + v[3 + 5 * 11] * x6 + v[3 + 6 * 11] * x7 + v[3 + 7 * 11] * x8 + v[3 + 8 * 11] * x9 + v[3 + 9 * 11] * x10 + v[3 + 10 * 11] * x11;
2720:       sum5 += v[4 + 0] * x1 + v[4 + 11] * x2 + v[4 + 2 * 11] * x3 + v[4 + 3 * 11] * x4 + v[4 + 4 * 11] * x5 + v[4 + 5 * 11] * x6 + v[4 + 6 * 11] * x7 + v[4 + 7 * 11] * x8 + v[4 + 8 * 11] * x9 + v[4 + 9 * 11] * x10 + v[4 + 10 * 11] * x11;
2721:       sum6 += v[5 + 0] * x1 + v[5 + 11] * x2 + v[5 + 2 * 11] * x3 + v[5 + 3 * 11] * x4 + v[5 + 4 * 11] * x5 + v[5 + 5 * 11] * x6 + v[5 + 6 * 11] * x7 + v[5 + 7 * 11] * x8 + v[5 + 8 * 11] * x9 + v[5 + 9 * 11] * x10 + v[5 + 10 * 11] * x11;
2722:       sum7 += v[6 + 0] * x1 + v[6 + 11] * x2 + v[6 + 2 * 11] * x3 + v[6 + 3 * 11] * x4 + v[6 + 4 * 11] * x5 + v[6 + 5 * 11] * x6 + v[6 + 6 * 11] * x7 + v[6 + 7 * 11] * x8 + v[6 + 8 * 11] * x9 + v[6 + 9 * 11] * x10 + v[6 + 10 * 11] * x11;
2723:       sum8 += v[7 + 0] * x1 + v[7 + 11] * x2 + v[7 + 2 * 11] * x3 + v[7 + 3 * 11] * x4 + v[7 + 4 * 11] * x5 + v[7 + 5 * 11] * x6 + v[7 + 6 * 11] * x7 + v[7 + 7 * 11] * x8 + v[7 + 8 * 11] * x9 + v[7 + 9 * 11] * x10 + v[7 + 10 * 11] * x11;
2724:       sum9 += v[8 + 0] * x1 + v[8 + 11] * x2 + v[8 + 2 * 11] * x3 + v[8 + 3 * 11] * x4 + v[8 + 4 * 11] * x5 + v[8 + 5 * 11] * x6 + v[8 + 6 * 11] * x7 + v[8 + 7 * 11] * x8 + v[8 + 8 * 11] * x9 + v[8 + 9 * 11] * x10 + v[8 + 10 * 11] * x11;
2725:       sum10 += v[9 + 0] * x1 + v[9 + 11] * x2 + v[9 + 2 * 11] * x3 + v[9 + 3 * 11] * x4 + v[9 + 4 * 11] * x5 + v[9 + 5 * 11] * x6 + v[9 + 6 * 11] * x7 + v[9 + 7 * 11] * x8 + v[9 + 8 * 11] * x9 + v[9 + 9 * 11] * x10 + v[9 + 10 * 11] * x11;
2726:       sum11 += v[10 + 0] * x1 + v[10 + 11] * x2 + v[10 + 2 * 11] * x3 + v[10 + 3 * 11] * x4 + v[10 + 4 * 11] * x5 + v[10 + 5 * 11] * x6 + v[10 + 6 * 11] * x7 + v[10 + 7 * 11] * x8 + v[10 + 8 * 11] * x9 + v[10 + 9 * 11] * x10 + v[10 + 10 * 11] * x11;
2727:       v += 121;
2728:     }
2729:     z[0]  = sum1;
2730:     z[1]  = sum2;
2731:     z[2]  = sum3;
2732:     z[3]  = sum4;
2733:     z[4]  = sum5;
2734:     z[5]  = sum6;
2735:     z[6]  = sum7;
2736:     z[7]  = sum8;
2737:     z[8]  = sum9;
2738:     z[9]  = sum10;
2739:     z[10] = sum11;
2740:     if (!usecprow) {
2741:       z += 11;
2742:       y += 11;
2743:     }
2744:   }
2745:   PetscCall(VecRestoreArrayRead(xx, &x));
2746:   PetscCall(VecRestoreArrayPair(yy, zz, &yarray, &zarray));
2747:   PetscCall(PetscLogFlops(242.0 * a->nz));
2748:   PetscFunctionReturn(PETSC_SUCCESS);
2749: }

2751: PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz)
2752: {
2753:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2754:   PetscScalar       *z = NULL, *work, *workt, *zarray;
2755:   const PetscScalar *x, *xb;
2756:   const MatScalar   *v;
2757:   PetscInt           mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
2758:   PetscInt           ncols, k;
2759:   const PetscInt    *ridx     = NULL, *idx, *ii;
2760:   PetscBool          usecprow = a->compressedrow.use;

2762:   PetscFunctionBegin;
2763:   PetscCall(VecCopy(yy, zz));
2764:   PetscCall(VecGetArrayRead(xx, &x));
2765:   PetscCall(VecGetArray(zz, &zarray));

2767:   idx = a->j;
2768:   v   = a->a;
2769:   if (usecprow) {
2770:     mbs  = a->compressedrow.nrows;
2771:     ii   = a->compressedrow.i;
2772:     ridx = a->compressedrow.rindex;
2773:   } else {
2774:     mbs = a->mbs;
2775:     ii  = a->i;
2776:     z   = zarray;
2777:   }

2779:   if (!a->mult_work) {
2780:     k = PetscMax(A->rmap->n, A->cmap->n);
2781:     PetscCall(PetscMalloc1(k + 1, &a->mult_work));
2782:   }
2783:   work = a->mult_work;
2784:   for (i = 0; i < mbs; i++) {
2785:     n = ii[1] - ii[0];
2786:     ii++;
2787:     ncols = n * bs;
2788:     workt = work;
2789:     for (j = 0; j < n; j++) {
2790:       xb = x + bs * (*idx++);
2791:       for (k = 0; k < bs; k++) workt[k] = xb[k];
2792:       workt += bs;
2793:     }
2794:     if (usecprow) z = zarray + bs * ridx[i];
2795:     PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z);
2796:     v += n * bs2;
2797:     if (!usecprow) z += bs;
2798:   }
2799:   PetscCall(VecRestoreArrayRead(xx, &x));
2800:   PetscCall(VecRestoreArray(zz, &zarray));
2801:   PetscCall(PetscLogFlops(2.0 * a->nz * bs2));
2802:   PetscFunctionReturn(PETSC_SUCCESS);
2803: }

2805: PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A, Vec xx, Vec zz)
2806: {
2807:   PetscScalar zero = 0.0;

2809:   PetscFunctionBegin;
2810:   PetscCall(VecSet(zz, zero));
2811:   PetscCall(MatMultHermitianTransposeAdd_SeqBAIJ(A, xx, zz, zz));
2812:   PetscFunctionReturn(PETSC_SUCCESS);
2813: }

2815: PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A, Vec xx, Vec zz)
2816: {
2817:   PetscScalar zero = 0.0;

2819:   PetscFunctionBegin;
2820:   PetscCall(VecSet(zz, zero));
2821:   PetscCall(MatMultTransposeAdd_SeqBAIJ(A, xx, zz, zz));
2822:   PetscFunctionReturn(PETSC_SUCCESS);
2823: }

2825: PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A, Vec xx, Vec yy, Vec zz)
2826: {
2827:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2828:   PetscScalar       *z, x1, x2, x3, x4, x5;
2829:   const PetscScalar *x, *xb = NULL;
2830:   const MatScalar   *v;
2831:   PetscInt           mbs, i, rval, bs     = A->rmap->bs, j, n;
2832:   const PetscInt    *idx, *ii, *ib, *ridx = NULL;
2833:   Mat_CompressedRow  cprow    = a->compressedrow;
2834:   PetscBool          usecprow = cprow.use;

2836:   PetscFunctionBegin;
2837:   if (yy != zz) PetscCall(VecCopy(yy, zz));
2838:   PetscCall(VecGetArrayRead(xx, &x));
2839:   PetscCall(VecGetArray(zz, &z));

2841:   idx = a->j;
2842:   v   = a->a;
2843:   if (usecprow) {
2844:     mbs  = cprow.nrows;
2845:     ii   = cprow.i;
2846:     ridx = cprow.rindex;
2847:   } else {
2848:     mbs = a->mbs;
2849:     ii  = a->i;
2850:     xb  = x;
2851:   }

2853:   switch (bs) {
2854:   case 1:
2855:     for (i = 0; i < mbs; i++) {
2856:       if (usecprow) xb = x + ridx[i];
2857:       x1 = xb[0];
2858:       ib = idx + ii[0];
2859:       n  = ii[1] - ii[0];
2860:       ii++;
2861:       for (j = 0; j < n; j++) {
2862:         rval = ib[j];
2863:         z[rval] += PetscConj(*v) * x1;
2864:         v++;
2865:       }
2866:       if (!usecprow) xb++;
2867:     }
2868:     break;
2869:   case 2:
2870:     for (i = 0; i < mbs; i++) {
2871:       if (usecprow) xb = x + 2 * ridx[i];
2872:       x1 = xb[0];
2873:       x2 = xb[1];
2874:       ib = idx + ii[0];
2875:       n  = ii[1] - ii[0];
2876:       ii++;
2877:       for (j = 0; j < n; j++) {
2878:         rval = ib[j] * 2;
2879:         z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2;
2880:         z[rval++] += PetscConj(v[2]) * x1 + PetscConj(v[3]) * x2;
2881:         v += 4;
2882:       }
2883:       if (!usecprow) xb += 2;
2884:     }
2885:     break;
2886:   case 3:
2887:     for (i = 0; i < mbs; i++) {
2888:       if (usecprow) xb = x + 3 * ridx[i];
2889:       x1 = xb[0];
2890:       x2 = xb[1];
2891:       x3 = xb[2];
2892:       ib = idx + ii[0];
2893:       n  = ii[1] - ii[0];
2894:       ii++;
2895:       for (j = 0; j < n; j++) {
2896:         rval = ib[j] * 3;
2897:         z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3;
2898:         z[rval++] += PetscConj(v[3]) * x1 + PetscConj(v[4]) * x2 + PetscConj(v[5]) * x3;
2899:         z[rval++] += PetscConj(v[6]) * x1 + PetscConj(v[7]) * x2 + PetscConj(v[8]) * x3;
2900:         v += 9;
2901:       }
2902:       if (!usecprow) xb += 3;
2903:     }
2904:     break;
2905:   case 4:
2906:     for (i = 0; i < mbs; i++) {
2907:       if (usecprow) xb = x + 4 * ridx[i];
2908:       x1 = xb[0];
2909:       x2 = xb[1];
2910:       x3 = xb[2];
2911:       x4 = xb[3];
2912:       ib = idx + ii[0];
2913:       n  = ii[1] - ii[0];
2914:       ii++;
2915:       for (j = 0; j < n; j++) {
2916:         rval = ib[j] * 4;
2917:         z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3 + PetscConj(v[3]) * x4;
2918:         z[rval++] += PetscConj(v[4]) * x1 + PetscConj(v[5]) * x2 + PetscConj(v[6]) * x3 + PetscConj(v[7]) * x4;
2919:         z[rval++] += PetscConj(v[8]) * x1 + PetscConj(v[9]) * x2 + PetscConj(v[10]) * x3 + PetscConj(v[11]) * x4;
2920:         z[rval++] += PetscConj(v[12]) * x1 + PetscConj(v[13]) * x2 + PetscConj(v[14]) * x3 + PetscConj(v[15]) * x4;
2921:         v += 16;
2922:       }
2923:       if (!usecprow) xb += 4;
2924:     }
2925:     break;
2926:   case 5:
2927:     for (i = 0; i < mbs; i++) {
2928:       if (usecprow) xb = x + 5 * ridx[i];
2929:       x1 = xb[0];
2930:       x2 = xb[1];
2931:       x3 = xb[2];
2932:       x4 = xb[3];
2933:       x5 = xb[4];
2934:       ib = idx + ii[0];
2935:       n  = ii[1] - ii[0];
2936:       ii++;
2937:       for (j = 0; j < n; j++) {
2938:         rval = ib[j] * 5;
2939:         z[rval++] += PetscConj(v[0]) * x1 + PetscConj(v[1]) * x2 + PetscConj(v[2]) * x3 + PetscConj(v[3]) * x4 + PetscConj(v[4]) * x5;
2940:         z[rval++] += PetscConj(v[5]) * x1 + PetscConj(v[6]) * x2 + PetscConj(v[7]) * x3 + PetscConj(v[8]) * x4 + PetscConj(v[9]) * x5;
2941:         z[rval++] += PetscConj(v[10]) * x1 + PetscConj(v[11]) * x2 + PetscConj(v[12]) * x3 + PetscConj(v[13]) * x4 + PetscConj(v[14]) * x5;
2942:         z[rval++] += PetscConj(v[15]) * x1 + PetscConj(v[16]) * x2 + PetscConj(v[17]) * x3 + PetscConj(v[18]) * x4 + PetscConj(v[19]) * x5;
2943:         z[rval++] += PetscConj(v[20]) * x1 + PetscConj(v[21]) * x2 + PetscConj(v[22]) * x3 + PetscConj(v[23]) * x4 + PetscConj(v[24]) * x5;
2944:         v += 25;
2945:       }
2946:       if (!usecprow) xb += 5;
2947:     }
2948:     break;
2949:   default: /* block sizes larger than 5 by 5 are handled by BLAS */
2950:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "block size larger than 5 is not supported yet");
2951: #if 0
2952:     {
2953:       PetscInt          ncols,k,bs2=a->bs2;
2954:       PetscScalar       *work,*workt,zb;
2955:       const PetscScalar *xtmp;
2956:       if (!a->mult_work) {
2957:         k    = PetscMax(A->rmap->n,A->cmap->n);
2958:         PetscCall(PetscMalloc1(k+1,&a->mult_work));
2959:       }
2960:       work = a->mult_work;
2961:       xtmp = x;
2962:       for (i=0; i<mbs; i++) {
2963:         n     = ii[1] - ii[0]; ii++;
2964:         ncols = n*bs;
2965:         PetscCall(PetscArrayzero(work,ncols));
2966:         if (usecprow) xtmp = x + bs*ridx[i];
2967:         PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2968:         v += n*bs2;
2969:         if (!usecprow) xtmp += bs;
2970:         workt = work;
2971:         for (j=0; j<n; j++) {
2972:           zb = z + bs*(*idx++);
2973:           for (k=0; k<bs; k++) zb[k] += workt[k] ;
2974:           workt += bs;
2975:         }
2976:       }
2977:     }
2978: #endif
2979:   }
2980:   PetscCall(VecRestoreArrayRead(xx, &x));
2981:   PetscCall(VecRestoreArray(zz, &z));
2982:   PetscCall(PetscLogFlops(2.0 * a->nz * a->bs2));
2983:   PetscFunctionReturn(PETSC_SUCCESS);
2984: }

2986: PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A, Vec xx, Vec yy, Vec zz)
2987: {
2988:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2989:   PetscScalar       *zb, *z, x1, x2, x3, x4, x5;
2990:   const PetscScalar *x, *xb = NULL;
2991:   const MatScalar   *v;
2992:   PetscInt           mbs, i, rval, bs = A->rmap->bs, j, n, bs2 = a->bs2;
2993:   const PetscInt    *idx, *ii, *ib, *ridx = NULL;
2994:   Mat_CompressedRow  cprow    = a->compressedrow;
2995:   PetscBool          usecprow = cprow.use;

2997:   PetscFunctionBegin;
2998:   if (yy != zz) PetscCall(VecCopy(yy, zz));
2999:   PetscCall(VecGetArrayRead(xx, &x));
3000:   PetscCall(VecGetArray(zz, &z));

3002:   idx = a->j;
3003:   v   = a->a;
3004:   if (usecprow) {
3005:     mbs  = cprow.nrows;
3006:     ii   = cprow.i;
3007:     ridx = cprow.rindex;
3008:   } else {
3009:     mbs = a->mbs;
3010:     ii  = a->i;
3011:     xb  = x;
3012:   }

3014:   switch (bs) {
3015:   case 1:
3016:     for (i = 0; i < mbs; i++) {
3017:       if (usecprow) xb = x + ridx[i];
3018:       x1 = xb[0];
3019:       ib = idx + ii[0];
3020:       n  = ii[1] - ii[0];
3021:       ii++;
3022:       for (j = 0; j < n; j++) {
3023:         rval = ib[j];
3024:         z[rval] += *v * x1;
3025:         v++;
3026:       }
3027:       if (!usecprow) xb++;
3028:     }
3029:     break;
3030:   case 2:
3031:     for (i = 0; i < mbs; i++) {
3032:       if (usecprow) xb = x + 2 * ridx[i];
3033:       x1 = xb[0];
3034:       x2 = xb[1];
3035:       ib = idx + ii[0];
3036:       n  = ii[1] - ii[0];
3037:       ii++;
3038:       for (j = 0; j < n; j++) {
3039:         rval = ib[j] * 2;
3040:         z[rval++] += v[0] * x1 + v[1] * x2;
3041:         z[rval++] += v[2] * x1 + v[3] * x2;
3042:         v += 4;
3043:       }
3044:       if (!usecprow) xb += 2;
3045:     }
3046:     break;
3047:   case 3:
3048:     for (i = 0; i < mbs; i++) {
3049:       if (usecprow) xb = x + 3 * ridx[i];
3050:       x1 = xb[0];
3051:       x2 = xb[1];
3052:       x3 = xb[2];
3053:       ib = idx + ii[0];
3054:       n  = ii[1] - ii[0];
3055:       ii++;
3056:       for (j = 0; j < n; j++) {
3057:         rval = ib[j] * 3;
3058:         z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3;
3059:         z[rval++] += v[3] * x1 + v[4] * x2 + v[5] * x3;
3060:         z[rval++] += v[6] * x1 + v[7] * x2 + v[8] * x3;
3061:         v += 9;
3062:       }
3063:       if (!usecprow) xb += 3;
3064:     }
3065:     break;
3066:   case 4:
3067:     for (i = 0; i < mbs; i++) {
3068:       if (usecprow) xb = x + 4 * ridx[i];
3069:       x1 = xb[0];
3070:       x2 = xb[1];
3071:       x3 = xb[2];
3072:       x4 = xb[3];
3073:       ib = idx + ii[0];
3074:       n  = ii[1] - ii[0];
3075:       ii++;
3076:       for (j = 0; j < n; j++) {
3077:         rval = ib[j] * 4;
3078:         z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
3079:         z[rval++] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
3080:         z[rval++] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
3081:         z[rval++] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
3082:         v += 16;
3083:       }
3084:       if (!usecprow) xb += 4;
3085:     }
3086:     break;
3087:   case 5:
3088:     for (i = 0; i < mbs; i++) {
3089:       if (usecprow) xb = x + 5 * ridx[i];
3090:       x1 = xb[0];
3091:       x2 = xb[1];
3092:       x3 = xb[2];
3093:       x4 = xb[3];
3094:       x5 = xb[4];
3095:       ib = idx + ii[0];
3096:       n  = ii[1] - ii[0];
3097:       ii++;
3098:       for (j = 0; j < n; j++) {
3099:         rval = ib[j] * 5;
3100:         z[rval++] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
3101:         z[rval++] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
3102:         z[rval++] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
3103:         z[rval++] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
3104:         z[rval++] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
3105:         v += 25;
3106:       }
3107:       if (!usecprow) xb += 5;
3108:     }
3109:     break;
3110:   default: { /* block sizes larger then 5 by 5 are handled by BLAS */
3111:     PetscInt           ncols, k;
3112:     PetscScalar       *work, *workt;
3113:     const PetscScalar *xtmp;
3114:     if (!a->mult_work) {
3115:       k = PetscMax(A->rmap->n, A->cmap->n);
3116:       PetscCall(PetscMalloc1(k + 1, &a->mult_work));
3117:     }
3118:     work = a->mult_work;
3119:     xtmp = x;
3120:     for (i = 0; i < mbs; i++) {
3121:       n = ii[1] - ii[0];
3122:       ii++;
3123:       ncols = n * bs;
3124:       PetscCall(PetscArrayzero(work, ncols));
3125:       if (usecprow) xtmp = x + bs * ridx[i];
3126:       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, xtmp, v, work);
3127:       v += n * bs2;
3128:       if (!usecprow) xtmp += bs;
3129:       workt = work;
3130:       for (j = 0; j < n; j++) {
3131:         zb = z + bs * (*idx++);
3132:         for (k = 0; k < bs; k++) zb[k] += workt[k];
3133:         workt += bs;
3134:       }
3135:     }
3136:   }
3137:   }
3138:   PetscCall(VecRestoreArrayRead(xx, &x));
3139:   PetscCall(VecRestoreArray(zz, &z));
3140:   PetscCall(PetscLogFlops(2.0 * a->nz * a->bs2));
3141:   PetscFunctionReturn(PETSC_SUCCESS);
3142: }

3144: PetscErrorCode MatScale_SeqBAIJ(Mat inA, PetscScalar alpha)
3145: {
3146:   Mat_SeqBAIJ *a       = (Mat_SeqBAIJ *)inA->data;
3147:   PetscInt     totalnz = a->bs2 * a->nz;
3148:   PetscScalar  oalpha  = alpha;
3149:   PetscBLASInt one     = 1, tnz;

3151:   PetscFunctionBegin;
3152:   PetscCall(PetscBLASIntCast(totalnz, &tnz));
3153:   PetscCallBLAS("BLASscal", BLASscal_(&tnz, &oalpha, a->a, &one));
3154:   PetscCall(PetscLogFlops(totalnz));
3155:   PetscFunctionReturn(PETSC_SUCCESS);
3156: }

3158: PetscErrorCode MatNorm_SeqBAIJ(Mat A, NormType type, PetscReal *norm)
3159: {
3160:   Mat_SeqBAIJ *a   = (Mat_SeqBAIJ *)A->data;
3161:   MatScalar   *v   = a->a;
3162:   PetscReal    sum = 0.0;
3163:   PetscInt     i, j, k, bs = A->rmap->bs, nz = a->nz, bs2 = a->bs2, k1;

3165:   PetscFunctionBegin;
3166:   if (type == NORM_FROBENIUS) {
3167: #if defined(PETSC_USE_REAL___FP16)
3168:     PetscBLASInt one = 1, cnt = bs2 * nz;
3169:     PetscCallBLAS("BLASnrm2", *norm = BLASnrm2_(&cnt, v, &one));
3170: #else
3171:     for (i = 0; i < bs2 * nz; i++) {
3172:       sum += PetscRealPart(PetscConj(*v) * (*v));
3173:       v++;
3174:     }
3175: #endif
3176:     *norm = PetscSqrtReal(sum);
3177:     PetscCall(PetscLogFlops(2.0 * bs2 * nz));
3178:   } else if (type == NORM_1) { /* maximum column sum */
3179:     PetscReal *tmp;
3180:     PetscInt  *bcol = a->j;
3181:     PetscCall(PetscCalloc1(A->cmap->n + 1, &tmp));
3182:     for (i = 0; i < nz; i++) {
3183:       for (j = 0; j < bs; j++) {
3184:         k1 = bs * (*bcol) + j; /* column index */
3185:         for (k = 0; k < bs; k++) {
3186:           tmp[k1] += PetscAbsScalar(*v);
3187:           v++;
3188:         }
3189:       }
3190:       bcol++;
3191:     }
3192:     *norm = 0.0;
3193:     for (j = 0; j < A->cmap->n; j++) {
3194:       if (tmp[j] > *norm) *norm = tmp[j];
3195:     }
3196:     PetscCall(PetscFree(tmp));
3197:     PetscCall(PetscLogFlops(PetscMax(bs2 * nz - 1, 0)));
3198:   } else if (type == NORM_INFINITY) { /* maximum row sum */
3199:     *norm = 0.0;
3200:     for (k = 0; k < bs; k++) {
3201:       for (j = 0; j < a->mbs; j++) {
3202:         v   = a->a + bs2 * a->i[j] + k;
3203:         sum = 0.0;
3204:         for (i = 0; i < a->i[j + 1] - a->i[j]; i++) {
3205:           for (k1 = 0; k1 < bs; k1++) {
3206:             sum += PetscAbsScalar(*v);
3207:             v += bs;
3208:           }
3209:         }
3210:         if (sum > *norm) *norm = sum;
3211:       }
3212:     }
3213:     PetscCall(PetscLogFlops(PetscMax(bs2 * nz - 1, 0)));
3214:   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet");
3215:   PetscFunctionReturn(PETSC_SUCCESS);
3216: }

3218: PetscErrorCode MatEqual_SeqBAIJ(Mat A, Mat B, PetscBool *flg)
3219: {
3220:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)B->data;

3222:   PetscFunctionBegin;
3223:   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
3224:   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs) || (a->nz != b->nz)) {
3225:     *flg = PETSC_FALSE;
3226:     PetscFunctionReturn(PETSC_SUCCESS);
3227:   }

3229:   /* if the a->i are the same */
3230:   PetscCall(PetscArraycmp(a->i, b->i, a->mbs + 1, flg));
3231:   if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);

3233:   /* if a->j are the same */
3234:   PetscCall(PetscArraycmp(a->j, b->j, a->nz, flg));
3235:   if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);

3237:   /* if a->a are the same */
3238:   PetscCall(PetscArraycmp(a->a, b->a, (a->nz) * (A->rmap->bs) * (B->rmap->bs), flg));
3239:   PetscFunctionReturn(PETSC_SUCCESS);
3240: }

3242: PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A, Vec v)
3243: {
3244:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3245:   PetscInt     i, j, k, n, row, bs, *ai, *aj, ambs, bs2;
3246:   PetscScalar *x, zero = 0.0;
3247:   MatScalar   *aa, *aa_j;

3249:   PetscFunctionBegin;
3250:   PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
3251:   bs   = A->rmap->bs;
3252:   aa   = a->a;
3253:   ai   = a->i;
3254:   aj   = a->j;
3255:   ambs = a->mbs;
3256:   bs2  = a->bs2;

3258:   PetscCall(VecSet(v, zero));
3259:   PetscCall(VecGetArray(v, &x));
3260:   PetscCall(VecGetLocalSize(v, &n));
3261:   PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
3262:   for (i = 0; i < ambs; i++) {
3263:     for (j = ai[i]; j < ai[i + 1]; j++) {
3264:       if (aj[j] == i) {
3265:         row  = i * bs;
3266:         aa_j = aa + j * bs2;
3267:         for (k = 0; k < bs2; k += (bs + 1), row++) x[row] = aa_j[k];
3268:         break;
3269:       }
3270:     }
3271:   }
3272:   PetscCall(VecRestoreArray(v, &x));
3273:   PetscFunctionReturn(PETSC_SUCCESS);
3274: }

3276: PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A, Vec ll, Vec rr)
3277: {
3278:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3279:   const PetscScalar *l, *r, *li, *ri;
3280:   PetscScalar        x;
3281:   MatScalar         *aa, *v;
3282:   PetscInt           i, j, k, lm, rn, M, m, n, mbs, tmp, bs, bs2, iai;
3283:   const PetscInt    *ai, *aj;

3285:   PetscFunctionBegin;
3286:   ai  = a->i;
3287:   aj  = a->j;
3288:   aa  = a->a;
3289:   m   = A->rmap->n;
3290:   n   = A->cmap->n;
3291:   bs  = A->rmap->bs;
3292:   mbs = a->mbs;
3293:   bs2 = a->bs2;
3294:   if (ll) {
3295:     PetscCall(VecGetArrayRead(ll, &l));
3296:     PetscCall(VecGetLocalSize(ll, &lm));
3297:     PetscCheck(lm == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
3298:     for (i = 0; i < mbs; i++) { /* for each block row */
3299:       M  = ai[i + 1] - ai[i];
3300:       li = l + i * bs;
3301:       v  = aa + bs2 * ai[i];
3302:       for (j = 0; j < M; j++) { /* for each block */
3303:         for (k = 0; k < bs2; k++) (*v++) *= li[k % bs];
3304:       }
3305:     }
3306:     PetscCall(VecRestoreArrayRead(ll, &l));
3307:     PetscCall(PetscLogFlops(a->nz));
3308:   }

3310:   if (rr) {
3311:     PetscCall(VecGetArrayRead(rr, &r));
3312:     PetscCall(VecGetLocalSize(rr, &rn));
3313:     PetscCheck(rn == n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
3314:     for (i = 0; i < mbs; i++) { /* for each block row */
3315:       iai = ai[i];
3316:       M   = ai[i + 1] - iai;
3317:       v   = aa + bs2 * iai;
3318:       for (j = 0; j < M; j++) { /* for each block */
3319:         ri = r + bs * aj[iai + j];
3320:         for (k = 0; k < bs; k++) {
3321:           x = ri[k];
3322:           for (tmp = 0; tmp < bs; tmp++) v[tmp] *= x;
3323:           v += bs;
3324:         }
3325:       }
3326:     }
3327:     PetscCall(VecRestoreArrayRead(rr, &r));
3328:     PetscCall(PetscLogFlops(a->nz));
3329:   }
3330:   PetscFunctionReturn(PETSC_SUCCESS);
3331: }

3333: PetscErrorCode MatGetInfo_SeqBAIJ(Mat A, MatInfoType flag, MatInfo *info)
3334: {
3335:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;

3337:   PetscFunctionBegin;
3338:   info->block_size   = a->bs2;
3339:   info->nz_allocated = a->bs2 * a->maxnz;
3340:   info->nz_used      = a->bs2 * a->nz;
3341:   info->nz_unneeded  = info->nz_allocated - info->nz_used;
3342:   info->assemblies   = A->num_ass;
3343:   info->mallocs      = A->info.mallocs;
3344:   info->memory       = 0; /* REVIEW ME */
3345:   if (A->factortype) {
3346:     info->fill_ratio_given  = A->info.fill_ratio_given;
3347:     info->fill_ratio_needed = A->info.fill_ratio_needed;
3348:     info->factor_mallocs    = A->info.factor_mallocs;
3349:   } else {
3350:     info->fill_ratio_given  = 0;
3351:     info->fill_ratio_needed = 0;
3352:     info->factor_mallocs    = 0;
3353:   }
3354:   PetscFunctionReturn(PETSC_SUCCESS);
3355: }

3357: PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
3358: {
3359:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;

3361:   PetscFunctionBegin;
3362:   PetscCall(PetscArrayzero(a->a, a->bs2 * a->i[a->mbs]));
3363:   PetscFunctionReturn(PETSC_SUCCESS);
3364: }

3366: PetscErrorCode MatMatMultSymbolic_SeqBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C)
3367: {
3368:   PetscFunctionBegin;
3369:   PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C));
3370:   C->ops->matmultnumeric = MatMatMultNumeric_SeqBAIJ_SeqDense;
3371:   PetscFunctionReturn(PETSC_SUCCESS);
3372: }

3374: static PetscErrorCode MatMatMult_SeqBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3375: {
3376:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3377:   PetscScalar       *z = NULL, sum1;
3378:   const PetscScalar *xb;
3379:   PetscScalar        x1;
3380:   const MatScalar   *v, *vv;
3381:   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3382:   PetscBool          usecprow = a->compressedrow.use;

3384:   PetscFunctionBegin;
3385:   idx = a->j;
3386:   v   = a->a;
3387:   if (usecprow) {
3388:     mbs  = a->compressedrow.nrows;
3389:     ii   = a->compressedrow.i;
3390:     ridx = a->compressedrow.rindex;
3391:   } else {
3392:     mbs = a->mbs;
3393:     ii  = a->i;
3394:     z   = c;
3395:   }

3397:   for (i = 0; i < mbs; i++) {
3398:     n = ii[1] - ii[0];
3399:     ii++;
3400:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
3401:     PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
3402:     if (usecprow) z = c + ridx[i];
3403:     jj = idx;
3404:     vv = v;
3405:     for (k = 0; k < cn; k++) {
3406:       idx  = jj;
3407:       v    = vv;
3408:       sum1 = 0.0;
3409:       for (j = 0; j < n; j++) {
3410:         xb = b + (*idx++);
3411:         x1 = xb[0 + k * bm];
3412:         sum1 += v[0] * x1;
3413:         v += 1;
3414:       }
3415:       z[0 + k * cm] = sum1;
3416:     }
3417:     if (!usecprow) z += 1;
3418:   }
3419:   PetscFunctionReturn(PETSC_SUCCESS);
3420: }

3422: static PetscErrorCode MatMatMult_SeqBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3423: {
3424:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3425:   PetscScalar       *z = NULL, sum1, sum2;
3426:   const PetscScalar *xb;
3427:   PetscScalar        x1, x2;
3428:   const MatScalar   *v, *vv;
3429:   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3430:   PetscBool          usecprow = a->compressedrow.use;

3432:   PetscFunctionBegin;
3433:   idx = a->j;
3434:   v   = a->a;
3435:   if (usecprow) {
3436:     mbs  = a->compressedrow.nrows;
3437:     ii   = a->compressedrow.i;
3438:     ridx = a->compressedrow.rindex;
3439:   } else {
3440:     mbs = a->mbs;
3441:     ii  = a->i;
3442:     z   = c;
3443:   }

3445:   for (i = 0; i < mbs; i++) {
3446:     n = ii[1] - ii[0];
3447:     ii++;
3448:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
3449:     PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3450:     if (usecprow) z = c + 2 * ridx[i];
3451:     jj = idx;
3452:     vv = v;
3453:     for (k = 0; k < cn; k++) {
3454:       idx  = jj;
3455:       v    = vv;
3456:       sum1 = 0.0;
3457:       sum2 = 0.0;
3458:       for (j = 0; j < n; j++) {
3459:         xb = b + 2 * (*idx++);
3460:         x1 = xb[0 + k * bm];
3461:         x2 = xb[1 + k * bm];
3462:         sum1 += v[0] * x1 + v[2] * x2;
3463:         sum2 += v[1] * x1 + v[3] * x2;
3464:         v += 4;
3465:       }
3466:       z[0 + k * cm] = sum1;
3467:       z[1 + k * cm] = sum2;
3468:     }
3469:     if (!usecprow) z += 2;
3470:   }
3471:   PetscFunctionReturn(PETSC_SUCCESS);
3472: }

3474: static PetscErrorCode MatMatMult_SeqBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3475: {
3476:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3477:   PetscScalar       *z = NULL, sum1, sum2, sum3;
3478:   const PetscScalar *xb;
3479:   PetscScalar        x1, x2, x3;
3480:   const MatScalar   *v, *vv;
3481:   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3482:   PetscBool          usecprow = a->compressedrow.use;

3484:   PetscFunctionBegin;
3485:   idx = a->j;
3486:   v   = a->a;
3487:   if (usecprow) {
3488:     mbs  = a->compressedrow.nrows;
3489:     ii   = a->compressedrow.i;
3490:     ridx = a->compressedrow.rindex;
3491:   } else {
3492:     mbs = a->mbs;
3493:     ii  = a->i;
3494:     z   = c;
3495:   }

3497:   for (i = 0; i < mbs; i++) {
3498:     n = ii[1] - ii[0];
3499:     ii++;
3500:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);       /* Indices for the next row (assumes same size as this one) */
3501:     PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3502:     if (usecprow) z = c + 3 * ridx[i];
3503:     jj = idx;
3504:     vv = v;
3505:     for (k = 0; k < cn; k++) {
3506:       idx  = jj;
3507:       v    = vv;
3508:       sum1 = 0.0;
3509:       sum2 = 0.0;
3510:       sum3 = 0.0;
3511:       for (j = 0; j < n; j++) {
3512:         xb = b + 3 * (*idx++);
3513:         x1 = xb[0 + k * bm];
3514:         x2 = xb[1 + k * bm];
3515:         x3 = xb[2 + k * bm];
3516:         sum1 += v[0] * x1 + v[3] * x2 + v[6] * x3;
3517:         sum2 += v[1] * x1 + v[4] * x2 + v[7] * x3;
3518:         sum3 += v[2] * x1 + v[5] * x2 + v[8] * x3;
3519:         v += 9;
3520:       }
3521:       z[0 + k * cm] = sum1;
3522:       z[1 + k * cm] = sum2;
3523:       z[2 + k * cm] = sum3;
3524:     }
3525:     if (!usecprow) z += 3;
3526:   }
3527:   PetscFunctionReturn(PETSC_SUCCESS);
3528: }

3530: static PetscErrorCode MatMatMult_SeqBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3531: {
3532:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3533:   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4;
3534:   const PetscScalar *xb;
3535:   PetscScalar        x1, x2, x3, x4;
3536:   const MatScalar   *v, *vv;
3537:   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3538:   PetscBool          usecprow = a->compressedrow.use;

3540:   PetscFunctionBegin;
3541:   idx = a->j;
3542:   v   = a->a;
3543:   if (usecprow) {
3544:     mbs  = a->compressedrow.nrows;
3545:     ii   = a->compressedrow.i;
3546:     ridx = a->compressedrow.rindex;
3547:   } else {
3548:     mbs = a->mbs;
3549:     ii  = a->i;
3550:     z   = c;
3551:   }

3553:   for (i = 0; i < mbs; i++) {
3554:     n = ii[1] - ii[0];
3555:     ii++;
3556:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
3557:     PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3558:     if (usecprow) z = c + 4 * ridx[i];
3559:     jj = idx;
3560:     vv = v;
3561:     for (k = 0; k < cn; k++) {
3562:       idx  = jj;
3563:       v    = vv;
3564:       sum1 = 0.0;
3565:       sum2 = 0.0;
3566:       sum3 = 0.0;
3567:       sum4 = 0.0;
3568:       for (j = 0; j < n; j++) {
3569:         xb = b + 4 * (*idx++);
3570:         x1 = xb[0 + k * bm];
3571:         x2 = xb[1 + k * bm];
3572:         x3 = xb[2 + k * bm];
3573:         x4 = xb[3 + k * bm];
3574:         sum1 += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
3575:         sum2 += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
3576:         sum3 += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
3577:         sum4 += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
3578:         v += 16;
3579:       }
3580:       z[0 + k * cm] = sum1;
3581:       z[1 + k * cm] = sum2;
3582:       z[2 + k * cm] = sum3;
3583:       z[3 + k * cm] = sum4;
3584:     }
3585:     if (!usecprow) z += 4;
3586:   }
3587:   PetscFunctionReturn(PETSC_SUCCESS);
3588: }

3590: static PetscErrorCode MatMatMult_SeqBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
3591: {
3592:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3593:   PetscScalar       *z = NULL, sum1, sum2, sum3, sum4, sum5;
3594:   const PetscScalar *xb;
3595:   PetscScalar        x1, x2, x3, x4, x5;
3596:   const MatScalar   *v, *vv;
3597:   PetscInt           mbs, i, *idx, *ii, j, *jj, n, k, *ridx = NULL;
3598:   PetscBool          usecprow = a->compressedrow.use;

3600:   PetscFunctionBegin;
3601:   idx = a->j;
3602:   v   = a->a;
3603:   if (usecprow) {
3604:     mbs  = a->compressedrow.nrows;
3605:     ii   = a->compressedrow.i;
3606:     ridx = a->compressedrow.rindex;
3607:   } else {
3608:     mbs = a->mbs;
3609:     ii  = a->i;
3610:     z   = c;
3611:   }

3613:   for (i = 0; i < mbs; i++) {
3614:     n = ii[1] - ii[0];
3615:     ii++;
3616:     PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA);         /* Indices for the next row (assumes same size as this one) */
3617:     PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3618:     if (usecprow) z = c + 5 * ridx[i];
3619:     jj = idx;
3620:     vv = v;
3621:     for (k = 0; k < cn; k++) {
3622:       idx  = jj;
3623:       v    = vv;
3624:       sum1 = 0.0;
3625:       sum2 = 0.0;
3626:       sum3 = 0.0;
3627:       sum4 = 0.0;
3628:       sum5 = 0.0;
3629:       for (j = 0; j < n; j++) {
3630:         xb = b + 5 * (*idx++);
3631:         x1 = xb[0 + k * bm];
3632:         x2 = xb[1 + k * bm];
3633:         x3 = xb[2 + k * bm];
3634:         x4 = xb[3 + k * bm];
3635:         x5 = xb[4 + k * bm];
3636:         sum1 += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
3637:         sum2 += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
3638:         sum3 += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
3639:         sum4 += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
3640:         sum5 += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
3641:         v += 25;
3642:       }
3643:       z[0 + k * cm] = sum1;
3644:       z[1 + k * cm] = sum2;
3645:       z[2 + k * cm] = sum3;
3646:       z[3 + k * cm] = sum4;
3647:       z[4 + k * cm] = sum5;
3648:     }
3649:     if (!usecprow) z += 5;
3650:   }
3651:   PetscFunctionReturn(PETSC_SUCCESS);
3652: }

3654: PetscErrorCode MatMatMultNumeric_SeqBAIJ_SeqDense(Mat A, Mat B, Mat C)
3655: {
3656:   Mat_SeqBAIJ     *a  = (Mat_SeqBAIJ *)A->data;
3657:   Mat_SeqDense    *bd = (Mat_SeqDense *)B->data;
3658:   Mat_SeqDense    *cd = (Mat_SeqDense *)C->data;
3659:   PetscInt         cm = cd->lda, cn = B->cmap->n, bm = bd->lda;
3660:   PetscInt         mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
3661:   PetscBLASInt     bbs, bcn, bbm, bcm;
3662:   PetscScalar     *z = NULL;
3663:   PetscScalar     *c, *b;
3664:   const MatScalar *v;
3665:   const PetscInt  *idx, *ii, *ridx = NULL;
3666:   PetscScalar      _DZero = 0.0, _DOne = 1.0;
3667:   PetscBool        usecprow = a->compressedrow.use;

3669:   PetscFunctionBegin;
3670:   if (!cm || !cn) PetscFunctionReturn(PETSC_SUCCESS);
3671:   PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n);
3672:   PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n);
3673:   PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n);
3674:   b = bd->v;
3675:   if (a->nonzerorowcnt != A->rmap->n) PetscCall(MatZeroEntries(C));
3676:   PetscCall(MatDenseGetArray(C, &c));
3677:   switch (bs) {
3678:   case 1:
3679:     PetscCall(MatMatMult_SeqBAIJ_1_Private(A, b, bm, c, cm, cn));
3680:     break;
3681:   case 2:
3682:     PetscCall(MatMatMult_SeqBAIJ_2_Private(A, b, bm, c, cm, cn));
3683:     break;
3684:   case 3:
3685:     PetscCall(MatMatMult_SeqBAIJ_3_Private(A, b, bm, c, cm, cn));
3686:     break;
3687:   case 4:
3688:     PetscCall(MatMatMult_SeqBAIJ_4_Private(A, b, bm, c, cm, cn));
3689:     break;
3690:   case 5:
3691:     PetscCall(MatMatMult_SeqBAIJ_5_Private(A, b, bm, c, cm, cn));
3692:     break;
3693:   default: /* block sizes larger than 5 by 5 are handled by BLAS */
3694:     PetscCall(PetscBLASIntCast(bs, &bbs));
3695:     PetscCall(PetscBLASIntCast(cn, &bcn));
3696:     PetscCall(PetscBLASIntCast(bm, &bbm));
3697:     PetscCall(PetscBLASIntCast(cm, &bcm));
3698:     idx = a->j;
3699:     v   = a->a;
3700:     if (usecprow) {
3701:       mbs  = a->compressedrow.nrows;
3702:       ii   = a->compressedrow.i;
3703:       ridx = a->compressedrow.rindex;
3704:     } else {
3705:       mbs = a->mbs;
3706:       ii  = a->i;
3707:       z   = c;
3708:     }
3709:     for (i = 0; i < mbs; i++) {
3710:       n = ii[1] - ii[0];
3711:       ii++;
3712:       if (usecprow) z = c + bs * ridx[i];
3713:       if (n) {
3714:         PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DZero, z, &bcm));
3715:         v += bs2;
3716:       }
3717:       for (j = 1; j < n; j++) {
3718:         PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DOne, z, &bcm));
3719:         v += bs2;
3720:       }
3721:       if (!usecprow) z += bs;
3722:     }
3723:   }
3724:   PetscCall(MatDenseRestoreArray(C, &c));
3725:   PetscCall(PetscLogFlops((2.0 * a->nz * bs2 - bs * a->nonzerorowcnt) * cn));
3726:   PetscFunctionReturn(PETSC_SUCCESS);
3727: }