Actual source code: sell.c

  1: /*
  2:   Defines the basic matrix operations for the SELL matrix storage format.
  3: */
  4: #include <../src/mat/impls/sell/seq/sell.h>
  5: #include <petscblaslapack.h>
  6: #include <petsc/private/kernels/blocktranspose.h>

/* BibTeX record for the paper on vectorized SELL sparse matrix-vector products (the AVX kernels below). */
/* NOTE(review): `cited` looks like the "already registered" flag that travels with `citation`
   (e.g. into PetscCitationsRegister()); the use site is not in this part of the file - confirm. */
static PetscBool  cited      = PETSC_FALSE;
static const char citation[] = "@inproceedings{ZhangELLPACK2018,\n"
                               " author = {Hong Zhang and Richard T. Mills and Karl Rupp and Barry F. Smith},\n"
                               " title = {Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}},\n"
                               " booktitle = {Proceedings of the 47th International Conference on Parallel Processing},\n"
                               " year = 2018\n"
                               "}\n";

 16: #if defined(PETSC_HAVE_IMMINTRIN_H) && (defined(__AVX512F__) || (defined(__AVX2__) && defined(__FMA__)) || defined(__AVX__)) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)

 18:   #include <immintrin.h>

 20:   #if !defined(_MM_SCALE_8)
 21:     #define _MM_SCALE_8 8
 22:   #endif

 24:   #if defined(__AVX512F__)
 25:     /* these do not work
 26:    vec_idx  = _mm512_loadunpackhi_epi32(vec_idx,acolidx);
 27:    vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval);
 28:   */
 29:     #define AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \
 30:       /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \
 31:       vec_idx  = _mm256_loadu_si256((__m256i const *)acolidx); \
 32:       vec_vals = _mm512_loadu_pd(aval); \
 33:       vec_x    = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8); \
 34:       vec_y    = _mm512_fmadd_pd(vec_x, vec_vals, vec_y)
 35:   #elif defined(__AVX2__) && defined(__FMA__)
 36:     #define AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \
 37:       vec_vals = _mm256_loadu_pd(aval); \
 38:       vec_idx  = _mm_loadu_si128((__m128i const *)acolidx); /* SSE2 */ \
 39:       vec_x    = _mm256_i32gather_pd(x, vec_idx, _MM_SCALE_8); \
 40:       vec_y    = _mm256_fmadd_pd(vec_x, vec_vals, vec_y)
 41:   #endif
 42: #endif /* PETSC_HAVE_IMMINTRIN_H */

 44: /*@
 45:   MatSeqSELLSetPreallocation - For good matrix assembly performance
 46:   the user should preallocate the matrix storage by setting the parameter `nz`
 47:   (or the array `nnz`).

 49:   Collective

 51:   Input Parameters:
 52: + B       - The `MATSEQSELL` matrix
 53: . rlenmax - number of nonzeros per row (same for all rows), ignored if `rlen` is provided
 54: - rlen    - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`

 56:   Level: intermediate

 58:   Notes:
 59:   Specify the preallocated storage with either `rlenmax` or `rlen` (not both).
 60:   Set `rlenmax` = `PETSC_DEFAULT` and `rlen` = `NULL` for PETSc to control dynamic memory
 61:   allocation.

 63:   You can call `MatGetInfo()` to get information on how effective the preallocation was;
 64:   for example the fields mallocs,nz_allocated,nz_used,nz_unneeded;
 65:   You can also run with the option `-info` and look for messages with the string
 66:   malloc in them to see if additional memory allocation was needed.

 68:   Developer Notes:
 69:   Use `rlenmax` of `MAT_SKIP_ALLOCATION` to not allocate any space for the matrix
 70:   entries or columns indices.

 72:   The maximum number of nonzeos in any row should be as accurate as possible.
 73:   If it is underestimated, you will get bad performance due to reallocation
 74:   (`MatSeqXSELLReallocateSELL()`).

 76: .seealso: `Mat`, `MATSEQSELL`, `MATSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatGetInfo()`
 77:  @*/
 78: PetscErrorCode MatSeqSELLSetPreallocation(Mat B, PetscInt rlenmax, const PetscInt rlen[])
 79: {
 80:   PetscFunctionBegin;
 83:   PetscTryMethod(B, "MatSeqSELLSetPreallocation_C", (Mat, PetscInt, const PetscInt[]), (B, rlenmax, rlen));
 84:   PetscFunctionReturn(PETSC_SUCCESS);
 85: }

 87: PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B, PetscInt maxallocrow, const PetscInt rlen[])
 88: {
 89:   Mat_SeqSELL *b;
 90:   PetscInt     i, j, totalslices;
 91: #if defined(PETSC_HAVE_CUPM)
 92:   PetscInt rlenmax = 0;
 93: #endif
 94:   PetscBool skipallocation = PETSC_FALSE, realalloc = PETSC_FALSE;

 96:   PetscFunctionBegin;
 97:   if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE;
 98:   if (maxallocrow == MAT_SKIP_ALLOCATION) {
 99:     skipallocation = PETSC_TRUE;
100:     maxallocrow    = 0;
101:   }

103:   PetscCall(PetscLayoutSetUp(B->rmap));
104:   PetscCall(PetscLayoutSetUp(B->cmap));

106:   /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */
107:   if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5;
108:   PetscCheck(maxallocrow >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "maxallocrow cannot be less than 0: value %" PetscInt_FMT, maxallocrow);
109:   if (rlen) {
110:     for (i = 0; i < B->rmap->n; i++) {
111:       PetscCheck(rlen[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, rlen[i]);
112:       PetscCheck(rlen[i] <= B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be greater than row length: local row %" PetscInt_FMT " value %" PetscInt_FMT " rowlength %" PetscInt_FMT, i, rlen[i], B->cmap->n);
113:     }
114:   }

116:   B->preallocated = PETSC_TRUE;

118:   b = (Mat_SeqSELL *)B->data;

120:   if (!b->sliceheight) { /* not set yet */
121: #if defined(PETSC_HAVE_CUPM)
122:     b->sliceheight = 16;
123: #else
124:     b->sliceheight = 8;
125: #endif
126:   }
127:   totalslices    = PetscCeilInt(B->rmap->n, b->sliceheight);
128:   b->totalslices = totalslices;
129:   if (!skipallocation) {
130:     if (B->rmap->n % b->sliceheight) PetscCall(PetscInfo(B, "Padding rows to the SEQSELL matrix because the number of rows is not the multiple of the slice height (value %" PetscInt_FMT ")\n", B->rmap->n));

132:     if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */
133:       PetscCall(PetscMalloc1(totalslices + 1, &b->sliidx));
134:     }
135:     if (!rlen) { /* if rlen is not provided, allocate same space for all the slices */
136:       if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10;
137:       else if (maxallocrow < 0) maxallocrow = 1;
138: #if defined(PETSC_HAVE_CUPM)
139:       rlenmax = maxallocrow;
140:       /* Pad the slice to DEVICE_MEM_ALIGN */
141:       while (b->sliceheight * maxallocrow % DEVICE_MEM_ALIGN) maxallocrow++;
142: #endif
143:       for (i = 0; i <= totalslices; i++) b->sliidx[i] = b->sliceheight * i * maxallocrow;
144:     } else {
145: #if defined(PETSC_HAVE_CUPM)
146:       PetscInt mul = DEVICE_MEM_ALIGN / b->sliceheight;
147: #endif
148:       maxallocrow  = 0;
149:       b->sliidx[0] = 0;
150:       for (i = 1; i < totalslices; i++) {
151:         b->sliidx[i] = 0;
152:         for (j = 0; j < b->sliceheight; j++) { b->sliidx[i] = PetscMax(b->sliidx[i], rlen[b->sliceheight * (i - 1) + j]); }
153: #if defined(PETSC_HAVE_CUPM)
154:         if (mul != 0) { /* Pad the slice to DEVICE_MEM_ALIGN if sliceheight < DEVICE_MEM_ALIGN */
155:           rlenmax      = PetscMax(b->sliidx[i], rlenmax);
156:           b->sliidx[i] = ((b->sliidx[i] - 1) / mul + 1) * mul;
157:         }
158: #endif
159:         maxallocrow = PetscMax(b->sliidx[i], maxallocrow);
160:         PetscCall(PetscIntSumError(b->sliidx[i - 1], b->sliceheight * b->sliidx[i], &b->sliidx[i]));
161:       }
162:       /* last slice */
163:       b->sliidx[totalslices] = 0;
164:       for (j = b->sliceheight * (totalslices - 1); j < B->rmap->n; j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices], rlen[j]);
165: #if defined(PETSC_HAVE_CUPM)
166:       if (mul != 0) {
167:         rlenmax                = PetscMax(b->sliidx[i], rlenmax);
168:         b->sliidx[totalslices] = ((b->sliidx[totalslices] - 1) / mul + 1) * mul;
169:       }
170: #endif
171:       maxallocrow            = PetscMax(b->sliidx[totalslices], maxallocrow);
172:       b->sliidx[totalslices] = b->sliidx[totalslices - 1] + b->sliceheight * b->sliidx[totalslices];
173:     }

175:     /* allocate space for val, colidx, rlen */
176:     /* FIXME: should B's old memory be unlogged? */
177:     PetscCall(MatSeqXSELLFreeSELL(B, &b->val, &b->colidx));
178:     /* FIXME: assuming an element of the bit array takes 8 bits */
179:     PetscCall(PetscMalloc2(b->sliidx[totalslices], &b->val, b->sliidx[totalslices], &b->colidx));
180:     /* b->rlen will count nonzeros in each row so far. We dont copy rlen to b->rlen because the matrix has not been set. */
181:     PetscCall(PetscCalloc1(b->sliceheight * totalslices, &b->rlen));

183:     b->singlemalloc = PETSC_TRUE;
184:     b->free_val     = PETSC_TRUE;
185:     b->free_colidx  = PETSC_TRUE;
186:   } else {
187:     b->free_val    = PETSC_FALSE;
188:     b->free_colidx = PETSC_FALSE;
189:   }

191:   b->nz          = 0;
192:   b->maxallocrow = maxallocrow;
193: #if defined(PETSC_HAVE_CUPM)
194:   b->rlenmax = rlenmax;
195: #else
196:   b->rlenmax = maxallocrow;
197: #endif
198:   b->maxallocmat      = b->sliidx[totalslices];
199:   B->info.nz_unneeded = (double)b->maxallocmat;
200:   if (realalloc) PetscCall(MatSetOption(B, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE));
201:   PetscFunctionReturn(PETSC_SUCCESS);
202: }

204: static PetscErrorCode MatGetRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
205: {
206:   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
207:   PetscInt     shift;

209:   PetscFunctionBegin;
210:   PetscCheck(row >= 0 && row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row %" PetscInt_FMT " out of range", row);
211:   if (nz) *nz = a->rlen[row];
212:   shift = a->sliidx[row / a->sliceheight] + (row % a->sliceheight);
213:   if (!a->getrowcols) { PetscCall(PetscMalloc2(a->rlenmax, &a->getrowcols, a->rlenmax, &a->getrowvals)); }
214:   if (idx) {
215:     PetscInt j;
216:     for (j = 0; j < a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift + a->sliceheight * j];
217:     *idx = a->getrowcols;
218:   }
219:   if (v) {
220:     PetscInt j;
221:     for (j = 0; j < a->rlen[row]; j++) a->getrowvals[j] = a->val[shift + a->sliceheight * j];
222:     *v = a->getrowvals;
223:   }
224:   PetscFunctionReturn(PETSC_SUCCESS);
225: }

227: static PetscErrorCode MatRestoreRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
228: {
229:   PetscFunctionBegin;
230:   PetscFunctionReturn(PETSC_SUCCESS);
231: }

233: PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype, MatReuse reuse, Mat *newmat)
234: {
235:   Mat          B;
236:   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
237:   PetscInt     i;

239:   PetscFunctionBegin;
240:   if (reuse == MAT_REUSE_MATRIX) {
241:     B = *newmat;
242:     PetscCall(MatZeroEntries(B));
243:   } else {
244:     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B));
245:     PetscCall(MatSetSizes(B, A->rmap->n, A->cmap->n, A->rmap->N, A->cmap->N));
246:     PetscCall(MatSetType(B, MATSEQAIJ));
247:     PetscCall(MatSeqAIJSetPreallocation(B, 0, a->rlen));
248:   }

250:   for (i = 0; i < A->rmap->n; i++) {
251:     PetscInt     nz = 0, *cols = NULL;
252:     PetscScalar *vals = NULL;

254:     PetscCall(MatGetRow_SeqSELL(A, i, &nz, &cols, &vals));
255:     PetscCall(MatSetValues(B, 1, &i, nz, cols, vals, INSERT_VALUES));
256:     PetscCall(MatRestoreRow_SeqSELL(A, i, &nz, &cols, &vals));
257:   }

259:   PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
260:   PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
261:   B->rmap->bs = A->rmap->bs;

263:   if (reuse == MAT_INPLACE_MATRIX) {
264:     PetscCall(MatHeaderReplace(A, &B));
265:   } else {
266:     *newmat = B;
267:   }
268:   PetscFunctionReturn(PETSC_SUCCESS);
269: }

271: #include <../src/mat/impls/aij/seq/aij.h>

273: PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A, MatType newtype, MatReuse reuse, Mat *newmat)
274: {
275:   Mat                B;
276:   Mat_SeqAIJ        *a  = (Mat_SeqAIJ *)A->data;
277:   PetscInt          *ai = a->i, m = A->rmap->N, n = A->cmap->N, i, *rowlengths, row, ncols;
278:   const PetscInt    *cols;
279:   const PetscScalar *vals;

281:   PetscFunctionBegin;
282:   if (reuse == MAT_REUSE_MATRIX) {
283:     B = *newmat;
284:   } else {
285:     if (PetscDefined(USE_DEBUG) || !a->ilen) {
286:       PetscCall(PetscMalloc1(m, &rowlengths));
287:       for (i = 0; i < m; i++) rowlengths[i] = ai[i + 1] - ai[i];
288:     }
289:     if (PetscDefined(USE_DEBUG) && a->ilen) {
290:       PetscBool eq;
291:       PetscCall(PetscMemcmp(rowlengths, a->ilen, m * sizeof(PetscInt), &eq));
292:       PetscCheck(eq, PETSC_COMM_SELF, PETSC_ERR_PLIB, "SeqAIJ ilen array incorrect");
293:       PetscCall(PetscFree(rowlengths));
294:       rowlengths = a->ilen;
295:     } else if (a->ilen) rowlengths = a->ilen;
296:     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B));
297:     PetscCall(MatSetSizes(B, m, n, m, n));
298:     PetscCall(MatSetType(B, MATSEQSELL));
299:     PetscCall(MatSeqSELLSetPreallocation(B, 0, rowlengths));
300:     if (rowlengths != a->ilen) PetscCall(PetscFree(rowlengths));
301:   }

303:   for (row = 0; row < m; row++) {
304:     PetscCall(MatGetRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals));
305:     PetscCall(MatSetValues_SeqSELL(B, 1, &row, ncols, cols, vals, INSERT_VALUES));
306:     PetscCall(MatRestoreRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals));
307:   }
308:   PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
309:   PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
310:   B->rmap->bs = A->rmap->bs;

312:   if (reuse == MAT_INPLACE_MATRIX) {
313:     PetscCall(MatHeaderReplace(A, &B));
314:   } else {
315:     *newmat = B;
316:   }
317:   PetscFunctionReturn(PETSC_SUCCESS);
318: }

320: PetscErrorCode MatMult_SeqSELL(Mat A, Vec xx, Vec yy)
321: {
322:   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
323:   PetscScalar       *y;
324:   const PetscScalar *x;
325:   const MatScalar   *aval        = a->val;
326:   PetscInt           totalslices = a->totalslices;
327:   const PetscInt    *acolidx     = a->colidx;
328:   PetscInt           i, j;
329: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
330:   __m512d  vec_x, vec_y, vec_vals;
331:   __m256i  vec_idx;
332:   __mmask8 mask;
333:   __m512d  vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4;
334:   __m256i  vec_idx2, vec_idx3, vec_idx4;
335: #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
336:   __m128i   vec_idx;
337:   __m256d   vec_x, vec_y, vec_y2, vec_vals;
338:   MatScalar yval;
339:   PetscInt  r, rows_left, row, nnz_in_row;
340: #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
341:   __m128d   vec_x_tmp;
342:   __m256d   vec_x, vec_y, vec_y2, vec_vals;
343:   MatScalar yval;
344:   PetscInt  r, rows_left, row, nnz_in_row;
345: #else
346:   PetscInt     k, sliceheight = a->sliceheight;
347:   PetscScalar *sum;
348: #endif

350: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
351:   #pragma disjoint(*x, *y, *aval)
352: #endif

354:   PetscFunctionBegin;
355:   PetscCall(VecGetArrayRead(xx, &x));
356:   PetscCall(VecGetArray(yy, &y));
357: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
358:   PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
359:   for (i = 0; i < totalslices; i++) { /* loop over slices */
360:     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
361:     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);

363:     vec_y  = _mm512_setzero_pd();
364:     vec_y2 = _mm512_setzero_pd();
365:     vec_y3 = _mm512_setzero_pd();
366:     vec_y4 = _mm512_setzero_pd();

368:     j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */
369:     switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) {
370:     case 3:
371:       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
372:       acolidx += 8;
373:       aval += 8;
374:       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
375:       acolidx += 8;
376:       aval += 8;
377:       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
378:       acolidx += 8;
379:       aval += 8;
380:       j += 3;
381:       break;
382:     case 2:
383:       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
384:       acolidx += 8;
385:       aval += 8;
386:       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
387:       acolidx += 8;
388:       aval += 8;
389:       j += 2;
390:       break;
391:     case 1:
392:       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
393:       acolidx += 8;
394:       aval += 8;
395:       j += 1;
396:       break;
397:     }
398:   #pragma novector
399:     for (; j < (a->sliidx[i + 1] >> 3); j += 4) {
400:       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
401:       acolidx += 8;
402:       aval += 8;
403:       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
404:       acolidx += 8;
405:       aval += 8;
406:       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
407:       acolidx += 8;
408:       aval += 8;
409:       AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4);
410:       acolidx += 8;
411:       aval += 8;
412:     }

414:     vec_y = _mm512_add_pd(vec_y, vec_y2);
415:     vec_y = _mm512_add_pd(vec_y, vec_y3);
416:     vec_y = _mm512_add_pd(vec_y, vec_y4);
417:     if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
418:       mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07)));
419:       _mm512_mask_storeu_pd(&y[8 * i], mask, vec_y);
420:     } else {
421:       _mm512_storeu_pd(&y[8 * i], vec_y);
422:     }
423:   }
424: #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
425:   PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
426:   for (i = 0; i < totalslices; i++) { /* loop over full slices */
427:     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
428:     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);

430:     /* last slice may have padding rows. Don't use vectorization. */
431:     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
432:       rows_left = A->rmap->n - 8 * i;
433:       for (r = 0; r < rows_left; ++r) {
434:         yval       = (MatScalar)0;
435:         row        = 8 * i + r;
436:         nnz_in_row = a->rlen[row];
437:         for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
438:         y[row] = yval;
439:       }
440:       break;
441:     }

443:     vec_y  = _mm256_setzero_pd();
444:     vec_y2 = _mm256_setzero_pd();

446:   /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
447:   #pragma novector
448:   #pragma unroll(2)
449:     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
450:       AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
451:       aval += 4;
452:       acolidx += 4;
453:       AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y2);
454:       aval += 4;
455:       acolidx += 4;
456:     }

458:     _mm256_storeu_pd(y + i * 8, vec_y);
459:     _mm256_storeu_pd(y + i * 8 + 4, vec_y2);
460:   }
461: #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
462:   PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
463:   for (i = 0; i < totalslices; i++) { /* loop over full slices */
464:     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
465:     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);

467:     vec_y  = _mm256_setzero_pd();
468:     vec_y2 = _mm256_setzero_pd();

470:     /* last slice may have padding rows. Don't use vectorization. */
471:     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
472:       rows_left = A->rmap->n - 8 * i;
473:       for (r = 0; r < rows_left; ++r) {
474:         yval       = (MatScalar)0;
475:         row        = 8 * i + r;
476:         nnz_in_row = a->rlen[row];
477:         for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
478:         y[row] = yval;
479:       }
480:       break;
481:     }

483:   /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
484:   #pragma novector
485:   #pragma unroll(2)
486:     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
487:       vec_vals  = _mm256_loadu_pd(aval);
488:       vec_x_tmp = _mm_setzero_pd();
489:       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
490:       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
491:       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
492:       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
493:       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
494:       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
495:       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y);
496:       aval += 4;

498:       vec_vals  = _mm256_loadu_pd(aval);
499:       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
500:       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
501:       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
502:       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
503:       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
504:       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
505:       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2);
506:       aval += 4;
507:     }

509:     _mm256_storeu_pd(y + i * 8, vec_y);
510:     _mm256_storeu_pd(y + i * 8 + 4, vec_y2);
511:   }
512: #else
513:   PetscCall(PetscMalloc1(sliceheight, &sum));
514:   for (i = 0; i < totalslices; i++) { /* loop over slices */
515:     for (j = 0; j < sliceheight; j++) {
516:       sum[j] = 0.0;
517:       for (k = a->sliidx[i] + j; k < a->sliidx[i + 1]; k += sliceheight) sum[j] += aval[k] * x[acolidx[k]];
518:     }
519:     if (i == totalslices - 1 && (A->rmap->n % sliceheight)) { /* if last slice has padding rows */
520:       for (j = 0; j < (A->rmap->n % sliceheight); j++) y[sliceheight * i + j] = sum[j];
521:     } else {
522:       for (j = 0; j < sliceheight; j++) y[sliceheight * i + j] = sum[j];
523:     }
524:   }
525:   PetscCall(PetscFree(sum));
526: #endif

528:   PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt)); /* theoretical minimal FLOPs */
529:   PetscCall(VecRestoreArrayRead(xx, &x));
530:   PetscCall(VecRestoreArray(yy, &y));
531:   PetscFunctionReturn(PETSC_SUCCESS);
532: }

534: #include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h>
535: PetscErrorCode MatMultAdd_SeqSELL(Mat A, Vec xx, Vec yy, Vec zz)
536: {
537:   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
538:   PetscScalar       *y, *z;
539:   const PetscScalar *x;
540:   const MatScalar   *aval        = a->val;
541:   PetscInt           totalslices = a->totalslices;
542:   const PetscInt    *acolidx     = a->colidx;
543:   PetscInt           i, j;
544: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
545:   __m512d  vec_x, vec_y, vec_vals;
546:   __m256i  vec_idx;
547:   __mmask8 mask = 0;
548:   __m512d  vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4;
549:   __m256i  vec_idx2, vec_idx3, vec_idx4;
550: #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
551:   __m128d   vec_x_tmp;
552:   __m256d   vec_x, vec_y, vec_y2, vec_vals;
553:   MatScalar yval;
554:   PetscInt  r, row, nnz_in_row;
555: #else
556:   PetscInt     k, sliceheight = a->sliceheight;
557:   PetscScalar *sum;
558: #endif

560: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
561:   #pragma disjoint(*x, *y, *aval)
562: #endif

564:   PetscFunctionBegin;
565:   if (!a->nz) {
566:     PetscCall(VecCopy(yy, zz));
567:     PetscFunctionReturn(PETSC_SUCCESS);
568:   }
569:   PetscCall(VecGetArrayRead(xx, &x));
570:   PetscCall(VecGetArrayPair(yy, zz, &y, &z));
571: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
572:   PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
573:   for (i = 0; i < totalslices; i++) { /* loop over slices */
574:     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
575:     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);

577:     if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
578:       mask  = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07)));
579:       vec_y = _mm512_mask_loadu_pd(vec_y, mask, &y[8 * i]);
580:     } else {
581:       vec_y = _mm512_loadu_pd(&y[8 * i]);
582:     }
583:     vec_y2 = _mm512_setzero_pd();
584:     vec_y3 = _mm512_setzero_pd();
585:     vec_y4 = _mm512_setzero_pd();

587:     j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */
588:     switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) {
589:     case 3:
590:       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
591:       acolidx += 8;
592:       aval += 8;
593:       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
594:       acolidx += 8;
595:       aval += 8;
596:       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
597:       acolidx += 8;
598:       aval += 8;
599:       j += 3;
600:       break;
601:     case 2:
602:       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
603:       acolidx += 8;
604:       aval += 8;
605:       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
606:       acolidx += 8;
607:       aval += 8;
608:       j += 2;
609:       break;
610:     case 1:
611:       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
612:       acolidx += 8;
613:       aval += 8;
614:       j += 1;
615:       break;
616:     }
617:   #pragma novector
618:     for (; j < (a->sliidx[i + 1] >> 3); j += 4) {
619:       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
620:       acolidx += 8;
621:       aval += 8;
622:       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
623:       acolidx += 8;
624:       aval += 8;
625:       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
626:       acolidx += 8;
627:       aval += 8;
628:       AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4);
629:       acolidx += 8;
630:       aval += 8;
631:     }

633:     vec_y = _mm512_add_pd(vec_y, vec_y2);
634:     vec_y = _mm512_add_pd(vec_y, vec_y3);
635:     vec_y = _mm512_add_pd(vec_y, vec_y4);
636:     if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
637:       _mm512_mask_storeu_pd(&z[8 * i], mask, vec_y);
638:     } else {
639:       _mm512_storeu_pd(&z[8 * i], vec_y);
640:     }
641:   }
642: #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
643:   PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
644:   for (i = 0; i < totalslices; i++) { /* loop over full slices */
645:     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
646:     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);

648:     /* last slice may have padding rows. Don't use vectorization. */
649:     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
650:       for (r = 0; r < (A->rmap->n & 0x07); ++r) {
651:         row        = 8 * i + r;
652:         yval       = (MatScalar)0.0;
653:         nnz_in_row = a->rlen[row];
654:         for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
655:         z[row] = y[row] + yval;
656:       }
657:       break;
658:     }

660:     vec_y  = _mm256_loadu_pd(y + 8 * i);
661:     vec_y2 = _mm256_loadu_pd(y + 8 * i + 4);

663:     /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
664:     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
665:       vec_vals  = _mm256_loadu_pd(aval);
666:       vec_x_tmp = _mm_setzero_pd();
667:       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
668:       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
669:       vec_x     = _mm256_setzero_pd();
670:       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
671:       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
672:       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
673:       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
674:       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y);
675:       aval += 4;

677:       vec_vals  = _mm256_loadu_pd(aval);
678:       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
679:       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
680:       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
681:       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
682:       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
683:       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
684:       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2);
685:       aval += 4;
686:     }

688:     _mm256_storeu_pd(z + i * 8, vec_y);
689:     _mm256_storeu_pd(z + i * 8 + 4, vec_y2);
690:   }
691: #else
692:   PetscCall(PetscMalloc1(sliceheight, &sum));
693:   for (i = 0; i < totalslices; i++) { /* loop over slices */
694:     for (j = 0; j < sliceheight; j++) {
695:       sum[j] = 0.0;
696:       for (k = a->sliidx[i] + j; k < a->sliidx[i + 1]; k += sliceheight) sum[j] += aval[k] * x[acolidx[k]];
697:     }
698:     if (i == totalslices - 1 && (A->rmap->n % sliceheight)) {
699:       for (j = 0; j < (A->rmap->n % sliceheight); j++) z[sliceheight * i + j] = y[sliceheight * i + j] + sum[j];
700:     } else {
701:       for (j = 0; j < sliceheight; j++) z[sliceheight * i + j] = y[sliceheight * i + j] + sum[j];
702:     }
703:   }
704:   PetscCall(PetscFree(sum));
705: #endif

707:   PetscCall(PetscLogFlops(2.0 * a->nz));
708:   PetscCall(VecRestoreArrayRead(xx, &x));
709:   PetscCall(VecRestoreArrayPair(yy, zz, &y, &z));
710:   PetscFunctionReturn(PETSC_SUCCESS);
711: }

713: PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A, Vec xx, Vec zz, Vec yy)
714: {
715:   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
716:   PetscScalar       *y;
717:   const PetscScalar *x;
718:   const MatScalar   *aval    = a->val;
719:   const PetscInt    *acolidx = a->colidx;
720:   PetscInt           i, j, r, row, nnz_in_row, totalslices = a->totalslices, sliceheight = a->sliceheight;

722: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
723:   #pragma disjoint(*x, *y, *aval)
724: #endif

726:   PetscFunctionBegin;
727:   if (A->symmetric == PETSC_BOOL3_TRUE) {
728:     PetscCall(MatMultAdd_SeqSELL(A, xx, zz, yy));
729:     PetscFunctionReturn(PETSC_SUCCESS);
730:   }
731:   if (zz != yy) PetscCall(VecCopy(zz, yy));

733:   if (a->nz) {
734:     PetscCall(VecGetArrayRead(xx, &x));
735:     PetscCall(VecGetArray(yy, &y));
736:     for (i = 0; i < a->totalslices; i++) { /* loop over slices */
737:       if (i == totalslices - 1 && (A->rmap->n % sliceheight)) {
738:         for (r = 0; r < (A->rmap->n % sliceheight); ++r) {
739:           row        = sliceheight * i + r;
740:           nnz_in_row = a->rlen[row];
741:           for (j = 0; j < nnz_in_row; ++j) y[acolidx[sliceheight * j + r]] += aval[sliceheight * j + r] * x[row];
742:         }
743:         break;
744:       }
745:       for (r = 0; r < sliceheight; ++r)
746:         for (j = a->sliidx[i] + r; j < a->sliidx[i + 1]; j += sliceheight) y[acolidx[j]] += aval[j] * x[sliceheight * i + r];
747:     }
748:     PetscCall(PetscLogFlops(2.0 * a->nz));
749:     PetscCall(VecRestoreArrayRead(xx, &x));
750:     PetscCall(VecRestoreArray(yy, &y));
751:   }
752:   PetscFunctionReturn(PETSC_SUCCESS);
753: }

755: PetscErrorCode MatMultTranspose_SeqSELL(Mat A, Vec xx, Vec yy)
756: {
757:   PetscFunctionBegin;
758:   if (A->symmetric == PETSC_BOOL3_TRUE) {
759:     PetscCall(MatMult_SeqSELL(A, xx, yy));
760:   } else {
761:     PetscCall(VecSet(yy, 0.0));
762:     PetscCall(MatMultTransposeAdd_SeqSELL(A, xx, yy, yy));
763:   }
764:   PetscFunctionReturn(PETSC_SUCCESS);
765: }

767: /*
768:      Checks for missing diagonals
769: */
770: PetscErrorCode MatMissingDiagonal_SeqSELL(Mat A, PetscBool *missing, PetscInt *d)
771: {
772:   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
773:   PetscInt    *diag, i;

775:   PetscFunctionBegin;
776:   *missing = PETSC_FALSE;
777:   if (A->rmap->n > 0 && !a->colidx) {
778:     *missing = PETSC_TRUE;
779:     if (d) *d = 0;
780:     PetscCall(PetscInfo(A, "Matrix has no entries therefore is missing diagonal\n"));
781:   } else {
782:     diag = a->diag;
783:     for (i = 0; i < A->rmap->n; i++) {
784:       if (diag[i] == -1) {
785:         *missing = PETSC_TRUE;
786:         if (d) *d = i;
787:         PetscCall(PetscInfo(A, "Matrix is missing diagonal number %" PetscInt_FMT "\n", i));
788:         break;
789:       }
790:     }
791:   }
792:   PetscFunctionReturn(PETSC_SUCCESS);
793: }

795: PetscErrorCode MatMarkDiagonal_SeqSELL(Mat A)
796: {
797:   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
798:   PetscInt     i, j, m = A->rmap->n, shift;

800:   PetscFunctionBegin;
801:   if (!a->diag) {
802:     PetscCall(PetscMalloc1(m, &a->diag));
803:     a->free_diag = PETSC_TRUE;
804:   }
805:   for (i = 0; i < m; i++) {                                          /* loop over rows */
806:     shift      = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
807:     a->diag[i] = -1;
808:     for (j = 0; j < a->rlen[i]; j++) {
809:       if (a->colidx[shift + a->sliceheight * j] == i) {
810:         a->diag[i] = shift + a->sliceheight * j;
811:         break;
812:       }
813:     }
814:   }
815:   PetscFunctionReturn(PETSC_SUCCESS);
816: }

818: /*
819:   Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways
820: */
821: PetscErrorCode MatInvertDiagonal_SeqSELL(Mat A, PetscScalar omega, PetscScalar fshift)
822: {
823:   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
824:   PetscInt     i, *diag, m = A->rmap->n;
825:   MatScalar   *val = a->val;
826:   PetscScalar *idiag, *mdiag;

828:   PetscFunctionBegin;
829:   if (a->idiagvalid) PetscFunctionReturn(PETSC_SUCCESS);
830:   PetscCall(MatMarkDiagonal_SeqSELL(A));
831:   diag = a->diag;
832:   if (!a->idiag) {
833:     PetscCall(PetscMalloc3(m, &a->idiag, m, &a->mdiag, m, &a->ssor_work));
834:     val = a->val;
835:   }
836:   mdiag = a->mdiag;
837:   idiag = a->idiag;

839:   if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) {
840:     for (i = 0; i < m; i++) {
841:       mdiag[i] = val[diag[i]];
842:       if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */
843:         PetscCheck(PetscRealPart(fshift), PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Zero diagonal on row %" PetscInt_FMT, i);
844:         PetscCall(PetscInfo(A, "Zero diagonal on row %" PetscInt_FMT "\n", i));
845:         A->factorerrortype             = MAT_FACTOR_NUMERIC_ZEROPIVOT;
846:         A->factorerror_zeropivot_value = 0.0;
847:         A->factorerror_zeropivot_row   = i;
848:       }
849:       idiag[i] = 1.0 / val[diag[i]];
850:     }
851:     PetscCall(PetscLogFlops(m));
852:   } else {
853:     for (i = 0; i < m; i++) {
854:       mdiag[i] = val[diag[i]];
855:       idiag[i] = omega / (fshift + val[diag[i]]);
856:     }
857:     PetscCall(PetscLogFlops(2.0 * m));
858:   }
859:   a->idiagvalid = PETSC_TRUE;
860:   PetscFunctionReturn(PETSC_SUCCESS);
861: }

863: PetscErrorCode MatZeroEntries_SeqSELL(Mat A)
864: {
865:   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;

867:   PetscFunctionBegin;
868:   PetscCall(PetscArrayzero(a->val, a->sliidx[a->totalslices]));
869:   PetscCall(MatSeqSELLInvalidateDiagonal(A));
870:   PetscFunctionReturn(PETSC_SUCCESS);
871: }

873: PetscErrorCode MatDestroy_SeqSELL(Mat A)
874: {
875:   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;

877:   PetscFunctionBegin;
878:   PetscCall(PetscLogObjectState((PetscObject)A, "Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT ", NZ=%" PetscInt_FMT, A->rmap->n, A->cmap->n, a->nz));
879:   PetscCall(MatSeqXSELLFreeSELL(A, &a->val, &a->colidx));
880:   PetscCall(ISDestroy(&a->row));
881:   PetscCall(ISDestroy(&a->col));
882:   PetscCall(PetscFree(a->diag));
883:   PetscCall(PetscFree(a->rlen));
884:   PetscCall(PetscFree(a->sliidx));
885:   PetscCall(PetscFree3(a->idiag, a->mdiag, a->ssor_work));
886:   PetscCall(PetscFree(a->solve_work));
887:   PetscCall(ISDestroy(&a->icol));
888:   PetscCall(PetscFree(a->saved_values));
889:   PetscCall(PetscFree2(a->getrowcols, a->getrowvals));
890:   PetscCall(PetscFree(A->data));
891: #if defined(PETSC_HAVE_CUPM)
892:   PetscCall(PetscFree(a->chunk_slice_map));
893: #endif

895:   PetscCall(PetscObjectChangeTypeName((PetscObject)A, NULL));
896:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatStoreValues_C", NULL));
897:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatRetrieveValues_C", NULL));
898:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetPreallocation_C", NULL));
899:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetArray_C", NULL));
900:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLRestoreArray_C", NULL));
901:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqaij_C", NULL));
902: #if defined(PETSC_HAVE_CUDA)
903:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqsellcuda_C", NULL));
904: #endif
905: #if defined(PETSC_HAVE_HIP)
906:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqsellhip_C", NULL));
907: #endif
908:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetFillRatio_C", NULL));
909:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetMaxSliceWidth_C", NULL));
910:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetAvgSliceWidth_C", NULL));
911:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetVarSliceSize_C", NULL));
912:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetSliceHeight_C", NULL));
913:   PetscFunctionReturn(PETSC_SUCCESS);
914: }

916: PetscErrorCode MatSetOption_SeqSELL(Mat A, MatOption op, PetscBool flg)
917: {
918:   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;

920:   PetscFunctionBegin;
921:   switch (op) {
922:   case MAT_ROW_ORIENTED:
923:     a->roworiented = flg;
924:     break;
925:   case MAT_KEEP_NONZERO_PATTERN:
926:     a->keepnonzeropattern = flg;
927:     break;
928:   case MAT_NEW_NONZERO_LOCATIONS:
929:     a->nonew = (flg ? 0 : 1);
930:     break;
931:   case MAT_NEW_NONZERO_LOCATION_ERR:
932:     a->nonew = (flg ? -1 : 0);
933:     break;
934:   case MAT_NEW_NONZERO_ALLOCATION_ERR:
935:     a->nonew = (flg ? -2 : 0);
936:     break;
937:   case MAT_UNUSED_NONZERO_LOCATION_ERR:
938:     a->nounused = (flg ? -1 : 0);
939:     break;
940:   case MAT_FORCE_DIAGONAL_ENTRIES:
941:   case MAT_IGNORE_OFF_PROC_ENTRIES:
942:   case MAT_USE_HASH_TABLE:
943:   case MAT_SORTED_FULL:
944:     PetscCall(PetscInfo(A, "Option %s ignored\n", MatOptions[op]));
945:     break;
946:   case MAT_SPD:
947:   case MAT_SYMMETRIC:
948:   case MAT_STRUCTURALLY_SYMMETRIC:
949:   case MAT_HERMITIAN:
950:   case MAT_SYMMETRY_ETERNAL:
951:   case MAT_STRUCTURAL_SYMMETRY_ETERNAL:
952:   case MAT_SPD_ETERNAL:
953:     /* These options are handled directly by MatSetOption() */
954:     break;
955:   default:
956:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unknown option %d", op);
957:   }
958:   PetscFunctionReturn(PETSC_SUCCESS);
959: }

961: PetscErrorCode MatGetDiagonal_SeqSELL(Mat A, Vec v)
962: {
963:   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
964:   PetscInt     i, j, n, shift;
965:   PetscScalar *x, zero = 0.0;

967:   PetscFunctionBegin;
968:   PetscCall(VecGetLocalSize(v, &n));
969:   PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");

971:   if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) {
972:     PetscInt *diag = a->diag;
973:     PetscCall(VecGetArray(v, &x));
974:     for (i = 0; i < n; i++) x[i] = 1.0 / a->val[diag[i]];
975:     PetscCall(VecRestoreArray(v, &x));
976:     PetscFunctionReturn(PETSC_SUCCESS);
977:   }

979:   PetscCall(VecSet(v, zero));
980:   PetscCall(VecGetArray(v, &x));
981:   for (i = 0; i < n; i++) {                                     /* loop over rows */
982:     shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
983:     x[i]  = 0;
984:     for (j = 0; j < a->rlen[i]; j++) {
985:       if (a->colidx[shift + a->sliceheight * j] == i) {
986:         x[i] = a->val[shift + a->sliceheight * j];
987:         break;
988:       }
989:     }
990:   }
991:   PetscCall(VecRestoreArray(v, &x));
992:   PetscFunctionReturn(PETSC_SUCCESS);
993: }

995: PetscErrorCode MatDiagonalScale_SeqSELL(Mat A, Vec ll, Vec rr)
996: {
997:   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
998:   const PetscScalar *l, *r;
999:   PetscInt           i, j, m, n, row;

1001:   PetscFunctionBegin;
1002:   if (ll) {
1003:     /* The local size is used so that VecMPI can be passed to this routine
1004:        by MatDiagonalScale_MPISELL */
1005:     PetscCall(VecGetLocalSize(ll, &m));
1006:     PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
1007:     PetscCall(VecGetArrayRead(ll, &l));
1008:     for (i = 0; i < a->totalslices; i++) {                            /* loop over slices */
1009:       if (i == a->totalslices - 1 && (A->rmap->n % a->sliceheight)) { /* if last slice has padding rows */
1010:         for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) {
1011:           if (row < (A->rmap->n % a->sliceheight)) a->val[j] *= l[a->sliceheight * i + row];
1012:         }
1013:       } else {
1014:         for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) { a->val[j] *= l[a->sliceheight * i + row]; }
1015:       }
1016:     }
1017:     PetscCall(VecRestoreArrayRead(ll, &l));
1018:     PetscCall(PetscLogFlops(a->nz));
1019:   }
1020:   if (rr) {
1021:     PetscCall(VecGetLocalSize(rr, &n));
1022:     PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
1023:     PetscCall(VecGetArrayRead(rr, &r));
1024:     for (i = 0; i < a->totalslices; i++) {                            /* loop over slices */
1025:       if (i == a->totalslices - 1 && (A->rmap->n % a->sliceheight)) { /* if last slice has padding rows */
1026:         for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) % a->sliceheight)) {
1027:           if (row < (A->rmap->n % a->sliceheight)) a->val[j] *= r[a->colidx[j]];
1028:         }
1029:       } else {
1030:         for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j++) a->val[j] *= r[a->colidx[j]];
1031:       }
1032:     }
1033:     PetscCall(VecRestoreArrayRead(rr, &r));
1034:     PetscCall(PetscLogFlops(a->nz));
1035:   }
1036:   PetscCall(MatSeqSELLInvalidateDiagonal(A));
1037: #if defined(PETSC_HAVE_CUPM)
1038:   if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU;
1039: #endif
1040:   PetscFunctionReturn(PETSC_SUCCESS);
1041: }

1043: PetscErrorCode MatGetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], PetscScalar v[])
1044: {
1045:   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1046:   PetscInt    *cp, i, k, low, high, t, row, col, l;
1047:   PetscInt     shift;
1048:   MatScalar   *vp;

1050:   PetscFunctionBegin;
1051:   for (k = 0; k < m; k++) { /* loop over requested rows */
1052:     row = im[k];
1053:     if (row < 0) continue;
1054:     PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1);
1055:     shift = a->sliidx[row / a->sliceheight] + (row % a->sliceheight); /* starting index of the row */
1056:     cp    = a->colidx + shift;                                        /* pointer to the row */
1057:     vp    = a->val + shift;                                           /* pointer to the row */
1058:     for (l = 0; l < n; l++) {                                         /* loop over requested columns */
1059:       col = in[l];
1060:       if (col < 0) continue;
1061:       PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1);
1062:       high = a->rlen[row];
1063:       low  = 0; /* assume unsorted */
1064:       while (high - low > 5) {
1065:         t = (low + high) / 2;
1066:         if (*(cp + a->sliceheight * t) > col) high = t;
1067:         else low = t;
1068:       }
1069:       for (i = low; i < high; i++) {
1070:         if (*(cp + a->sliceheight * i) > col) break;
1071:         if (*(cp + a->sliceheight * i) == col) {
1072:           *v++ = *(vp + a->sliceheight * i);
1073:           goto finished;
1074:         }
1075:       }
1076:       *v++ = 0.0;
1077:     finished:;
1078:     }
1079:   }
1080:   PetscFunctionReturn(PETSC_SUCCESS);
1081: }

1083: static PetscErrorCode MatView_SeqSELL_ASCII(Mat A, PetscViewer viewer)
1084: {
1085:   Mat_SeqSELL      *a = (Mat_SeqSELL *)A->data;
1086:   PetscInt          i, j, m = A->rmap->n, shift;
1087:   const char       *name;
1088:   PetscViewerFormat format;

1090:   PetscFunctionBegin;
1091:   PetscCall(PetscViewerGetFormat(viewer, &format));
1092:   if (format == PETSC_VIEWER_ASCII_MATLAB) {
1093:     PetscInt nofinalvalue = 0;
1094:     /*
1095:     if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) {
1096:       nofinalvalue = 1;
1097:     }
1098:     */
1099:     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1100:     PetscCall(PetscViewerASCIIPrintf(viewer, "%% Size = %" PetscInt_FMT " %" PetscInt_FMT " \n", m, A->cmap->n));
1101:     PetscCall(PetscViewerASCIIPrintf(viewer, "%% Nonzeros = %" PetscInt_FMT " \n", a->nz));
1102: #if defined(PETSC_USE_COMPLEX)
1103:     PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",4);\n", a->nz + nofinalvalue));
1104: #else
1105:     PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",3);\n", a->nz + nofinalvalue));
1106: #endif
1107:     PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = [\n"));

1109:     for (i = 0; i < m; i++) {
1110:       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1111:       for (j = 0; j < a->rlen[i]; j++) {
1112: #if defined(PETSC_USE_COMPLEX)
1113:         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e %18.16e\n", i + 1, a->colidx[shift + a->sliceheight * j] + 1, (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1114: #else
1115:         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e\n", i + 1, a->colidx[shift + a->sliceheight * j] + 1, (double)a->val[shift + a->sliceheight * j]));
1116: #endif
1117:       }
1118:     }
1119:     /*
1120:     if (nofinalvalue) {
1121: #if defined(PETSC_USE_COMPLEX)
1122:       PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e %18.16e\n",m,A->cmap->n,0.,0.));
1123: #else
1124:       PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e\n",m,A->cmap->n,0.0));
1125: #endif
1126:     }
1127:     */
1128:     PetscCall(PetscObjectGetName((PetscObject)A, &name));
1129:     PetscCall(PetscViewerASCIIPrintf(viewer, "];\n %s = spconvert(zzz);\n", name));
1130:     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1131:   } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) {
1132:     PetscFunctionReturn(PETSC_SUCCESS);
1133:   } else if (format == PETSC_VIEWER_ASCII_COMMON) {
1134:     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1135:     for (i = 0; i < m; i++) {
1136:       PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1137:       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1138:       for (j = 0; j < a->rlen[i]; j++) {
1139: #if defined(PETSC_USE_COMPLEX)
1140:         if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) > 0.0 && PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) {
1141:           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1142:         } else if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) < 0.0 && PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) {
1143:           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)-PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1144:         } else if (PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) {
1145:           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j])));
1146:         }
1147: #else
1148:         if (a->val[shift + a->sliceheight * j] != 0.0) PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)a->val[shift + a->sliceheight * j]));
1149: #endif
1150:       }
1151:       PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1152:     }
1153:     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1154:   } else if (format == PETSC_VIEWER_ASCII_DENSE) {
1155:     PetscInt    cnt = 0, jcnt;
1156:     PetscScalar value;
1157: #if defined(PETSC_USE_COMPLEX)
1158:     PetscBool realonly = PETSC_TRUE;
1159:     for (i = 0; i < a->sliidx[a->totalslices]; i++) {
1160:       if (PetscImaginaryPart(a->val[i]) != 0.0) {
1161:         realonly = PETSC_FALSE;
1162:         break;
1163:       }
1164:     }
1165: #endif

1167:     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1168:     for (i = 0; i < m; i++) {
1169:       jcnt  = 0;
1170:       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1171:       for (j = 0; j < A->cmap->n; j++) {
1172:         if (jcnt < a->rlen[i] && j == a->colidx[shift + a->sliceheight * j]) {
1173:           value = a->val[cnt++];
1174:           jcnt++;
1175:         } else {
1176:           value = 0.0;
1177:         }
1178: #if defined(PETSC_USE_COMPLEX)
1179:         if (realonly) {
1180:           PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)PetscRealPart(value)));
1181:         } else {
1182:           PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e+%7.5e i ", (double)PetscRealPart(value), (double)PetscImaginaryPart(value)));
1183:         }
1184: #else
1185:         PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)value));
1186: #endif
1187:       }
1188:       PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1189:     }
1190:     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1191:   } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) {
1192:     PetscInt fshift = 1;
1193:     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1194: #if defined(PETSC_USE_COMPLEX)
1195:     PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate complex general\n"));
1196: #else
1197:     PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate real general\n"));
1198: #endif
1199:     PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT "\n", m, A->cmap->n, a->nz));
1200:     for (i = 0; i < m; i++) {
1201:       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1202:       for (j = 0; j < a->rlen[i]; j++) {
1203: #if defined(PETSC_USE_COMPLEX)
1204:         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g %g\n", i + fshift, a->colidx[shift + a->sliceheight * j] + fshift, (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1205: #else
1206:         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g\n", i + fshift, a->colidx[shift + a->sliceheight * j] + fshift, (double)a->val[shift + a->sliceheight * j]));
1207: #endif
1208:       }
1209:     }
1210:     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1211:   } else if (format == PETSC_VIEWER_NATIVE) {
1212:     for (i = 0; i < a->totalslices; i++) { /* loop over slices */
1213:       PetscInt row;
1214:       PetscCall(PetscViewerASCIIPrintf(viewer, "slice %" PetscInt_FMT ": %" PetscInt_FMT " %" PetscInt_FMT "\n", i, a->sliidx[i], a->sliidx[i + 1]));
1215:       for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) {
1216: #if defined(PETSC_USE_COMPLEX)
1217:         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1218:           PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g + %g i\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1219:         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1220:           PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g - %g i\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), -(double)PetscImaginaryPart(a->val[j])));
1221:         } else {
1222:           PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j])));
1223:         }
1224: #else
1225:         PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g\n", a->sliceheight * i + row, a->colidx[j], (double)a->val[j]));
1226: #endif
1227:       }
1228:     }
1229:   } else {
1230:     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1231:     if (A->factortype) {
1232:       for (i = 0; i < m; i++) {
1233:         shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1234:         PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1235:         /* L part */
1236:         for (j = shift; j < a->diag[i]; j += a->sliceheight) {
1237: #if defined(PETSC_USE_COMPLEX)
1238:           if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) > 0.0) {
1239:             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1240:           } else if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) < 0.0) {
1241:             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j]))));
1242:           } else {
1243:             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j])));
1244:           }
1245: #else
1246:           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j]));
1247: #endif
1248:         }
1249:         /* diagonal */
1250:         j = a->diag[i];
1251: #if defined(PETSC_USE_COMPLEX)
1252:         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1253:           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)PetscImaginaryPart(1.0 / a->val[j])));
1254:         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1255:           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)(-PetscImaginaryPart(1.0 / a->val[j]))));
1256:         } else {
1257:           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j])));
1258:         }
1259: #else
1260:         PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)(1 / a->val[j])));
1261: #endif

1263:         /* U part */
1264:         for (j = a->diag[i] + 1; j < shift + a->sliceheight * a->rlen[i]; j += a->sliceheight) {
1265: #if defined(PETSC_USE_COMPLEX)
1266:           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1267:             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1268:           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1269:             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j]))));
1270:           } else {
1271:             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j])));
1272:           }
1273: #else
1274:           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j]));
1275: #endif
1276:         }
1277:         PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1278:       }
1279:     } else {
1280:       for (i = 0; i < m; i++) {
1281:         shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1282:         PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1283:         for (j = 0; j < a->rlen[i]; j++) {
1284: #if defined(PETSC_USE_COMPLEX)
1285:           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1286:             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1287:           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1288:             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)-PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1289:           } else {
1290:             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j])));
1291:           }
1292: #else
1293:           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)a->val[shift + a->sliceheight * j]));
1294: #endif
1295:         }
1296:         PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1297:       }
1298:     }
1299:     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1300:   }
1301:   PetscCall(PetscViewerFlush(viewer));
1302:   PetscFunctionReturn(PETSC_SUCCESS);
1303: }

1305: #include <petscdraw.h>
1306: static PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw, void *Aa)
1307: {
1308:   Mat               A = (Mat)Aa;
1309:   Mat_SeqSELL      *a = (Mat_SeqSELL *)A->data;
1310:   PetscInt          i, j, m = A->rmap->n, shift;
1311:   int               color;
1312:   PetscReal         xl, yl, xr, yr, x_l, x_r, y_l, y_r;
1313:   PetscViewer       viewer;
1314:   PetscViewerFormat format;

1316:   PetscFunctionBegin;
1317:   PetscCall(PetscObjectQuery((PetscObject)A, "Zoomviewer", (PetscObject *)&viewer));
1318:   PetscCall(PetscViewerGetFormat(viewer, &format));
1319:   PetscCall(PetscDrawGetCoordinates(draw, &xl, &yl, &xr, &yr));

1321:   /* loop over matrix elements drawing boxes */

1323:   if (format != PETSC_VIEWER_DRAW_CONTOUR) {
1324:     PetscDrawCollectiveBegin(draw);
1325:     /* Blue for negative, Cyan for zero and  Red for positive */
1326:     color = PETSC_DRAW_BLUE;
1327:     for (i = 0; i < m; i++) {
1328:       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1329:       y_l   = m - i - 1.0;
1330:       y_r   = y_l + 1.0;
1331:       for (j = 0; j < a->rlen[i]; j++) {
1332:         x_l = a->colidx[shift + a->sliceheight * j];
1333:         x_r = x_l + 1.0;
1334:         if (PetscRealPart(a->val[shift + a->sliceheight * j]) >= 0.) continue;
1335:         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1336:       }
1337:     }
1338:     color = PETSC_DRAW_CYAN;
1339:     for (i = 0; i < m; i++) {
1340:       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1341:       y_l   = m - i - 1.0;
1342:       y_r   = y_l + 1.0;
1343:       for (j = 0; j < a->rlen[i]; j++) {
1344:         x_l = a->colidx[shift + a->sliceheight * j];
1345:         x_r = x_l + 1.0;
1346:         if (a->val[shift + a->sliceheight * j] != 0.) continue;
1347:         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1348:       }
1349:     }
1350:     color = PETSC_DRAW_RED;
1351:     for (i = 0; i < m; i++) {
1352:       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1353:       y_l   = m - i - 1.0;
1354:       y_r   = y_l + 1.0;
1355:       for (j = 0; j < a->rlen[i]; j++) {
1356:         x_l = a->colidx[shift + a->sliceheight * j];
1357:         x_r = x_l + 1.0;
1358:         if (PetscRealPart(a->val[shift + a->sliceheight * j]) <= 0.) continue;
1359:         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1360:       }
1361:     }
1362:     PetscDrawCollectiveEnd(draw);
1363:   } else {
1364:     /* use contour shading to indicate magnitude of values */
1365:     /* first determine max of all nonzero values */
1366:     PetscReal minv = 0.0, maxv = 0.0;
1367:     PetscInt  count = 0;
1368:     PetscDraw popup;
1369:     for (i = 0; i < a->sliidx[a->totalslices]; i++) {
1370:       if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]);
1371:     }
1372:     if (minv >= maxv) maxv = minv + PETSC_SMALL;
1373:     PetscCall(PetscDrawGetPopup(draw, &popup));
1374:     PetscCall(PetscDrawScalePopup(popup, minv, maxv));

1376:     PetscDrawCollectiveBegin(draw);
1377:     for (i = 0; i < m; i++) {
1378:       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1379:       y_l   = m - i - 1.0;
1380:       y_r   = y_l + 1.0;
1381:       for (j = 0; j < a->rlen[i]; j++) {
1382:         x_l   = a->colidx[shift + a->sliceheight * j];
1383:         x_r   = x_l + 1.0;
1384:         color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]), minv, maxv);
1385:         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1386:         count++;
1387:       }
1388:     }
1389:     PetscDrawCollectiveEnd(draw);
1390:   }
1391:   PetscFunctionReturn(PETSC_SUCCESS);
1392: }

1394: #include <petscdraw.h>
1395: static PetscErrorCode MatView_SeqSELL_Draw(Mat A, PetscViewer viewer)
1396: {
1397:   PetscDraw draw;
1398:   PetscReal xr, yr, xl, yl, h, w;
1399:   PetscBool isnull;

1401:   PetscFunctionBegin;
1402:   PetscCall(PetscViewerDrawGetDraw(viewer, 0, &draw));
1403:   PetscCall(PetscDrawIsNull(draw, &isnull));
1404:   if (isnull) PetscFunctionReturn(PETSC_SUCCESS);

1406:   xr = A->cmap->n;
1407:   yr = A->rmap->n;
1408:   h  = yr / 10.0;
1409:   w  = xr / 10.0;
1410:   xr += w;
1411:   yr += h;
1412:   xl = -w;
1413:   yl = -h;
1414:   PetscCall(PetscDrawSetCoordinates(draw, xl, yl, xr, yr));
1415:   PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", (PetscObject)viewer));
1416:   PetscCall(PetscDrawZoom(draw, MatView_SeqSELL_Draw_Zoom, A));
1417:   PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", NULL));
1418:   PetscCall(PetscDrawSave(draw));
1419:   PetscFunctionReturn(PETSC_SUCCESS);
1420: }

1422: PetscErrorCode MatView_SeqSELL(Mat A, PetscViewer viewer)
1423: {
1424:   PetscBool iascii, isbinary, isdraw;

1426:   PetscFunctionBegin;
1427:   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii));
1428:   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary));
1429:   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw));
1430:   if (iascii) {
1431:     PetscCall(MatView_SeqSELL_ASCII(A, viewer));
1432:   } else if (isbinary) {
1433:     /* PetscCall(MatView_SeqSELL_Binary(A,viewer)); */
1434:   } else if (isdraw) PetscCall(MatView_SeqSELL_Draw(A, viewer));
1435:   PetscFunctionReturn(PETSC_SUCCESS);
1436: }

1438: PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A, MatAssemblyType mode)
1439: {
1440:   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1441:   PetscInt     i, shift, row_in_slice, row, nrow, *cp, lastcol, j, k;
1442:   MatScalar   *vp;
1443: #if defined(PETSC_HAVE_CUPM)
1444:   PetscInt totalchunks = 0;
1445: #endif

1447:   PetscFunctionBegin;
1448:   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
1449:   /* To do: compress out the unused elements */
1450:   PetscCall(MatMarkDiagonal_SeqSELL(A));
1451:   PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: %" PetscInt_FMT " allocated %" PetscInt_FMT " used (%" PetscInt_FMT " nonzeros+%" PetscInt_FMT " paddedzeros)\n", A->rmap->n, A->cmap->n, a->maxallocmat, a->sliidx[a->totalslices], a->nz, a->sliidx[a->totalslices] - a->nz));
1452:   PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is %" PetscInt_FMT "\n", a->reallocs));
1453:   PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rlenmax));
1454:   a->nonzerorowcnt = 0;
1455:   /* Set unused slots for column indices to last valid column index. Set unused slots for values to zero. This allows for a use of unmasked intrinsics -> higher performance */
1456:   for (i = 0; i < a->totalslices; ++i) {
1457:     shift = a->sliidx[i];                                                   /* starting index of the slice */
1458:     cp    = PetscSafePointerPlusOffset(a->colidx, shift);                   /* pointer to the column indices of the slice */
1459:     vp    = PetscSafePointerPlusOffset(a->val, shift);                      /* pointer to the nonzero values of the slice */
1460:     for (row_in_slice = 0; row_in_slice < a->sliceheight; ++row_in_slice) { /* loop over rows in the slice */
1461:       row  = a->sliceheight * i + row_in_slice;
1462:       nrow = a->rlen[row]; /* number of nonzeros in row */
1463:       /*
1464:         Search for the nearest nonzero. Normally setting the index to zero may cause extra communication.
1465:         But if the entire slice are empty, it is fine to use 0 since the index will not be loaded.
1466:       */
1467:       lastcol = 0;
1468:       if (nrow > 0) { /* nonempty row */
1469:         a->nonzerorowcnt++;
1470:         lastcol = cp[a->sliceheight * (nrow - 1) + row_in_slice]; /* use the index from the last nonzero at current row */
1471:       } else if (!row_in_slice) {                                 /* first row of the correct slice is empty */
1472:         for (j = 1; j < a->sliceheight; j++) {
1473:           if (a->rlen[a->sliceheight * i + j]) {
1474:             lastcol = cp[j];
1475:             break;
1476:           }
1477:         }
1478:       } else {
1479:         if (a->sliidx[i + 1] != shift) lastcol = cp[row_in_slice - 1]; /* use the index from the previous row */
1480:       }

1482:       for (k = nrow; k < (a->sliidx[i + 1] - shift) / a->sliceheight; ++k) {
1483:         cp[a->sliceheight * k + row_in_slice] = lastcol;
1484:         vp[a->sliceheight * k + row_in_slice] = (MatScalar)0;
1485:       }
1486:     }
1487:   }

1489:   A->info.mallocs += a->reallocs;
1490:   a->reallocs = 0;

1492:   PetscCall(MatSeqSELLInvalidateDiagonal(A));
1493: #if defined(PETSC_HAVE_CUPM)
1494:   if (!a->chunksize && a->totalslices) {
1495:     a->chunksize = 64;
1496:     while (a->chunksize < 1024 && 2 * a->chunksize <= a->sliidx[a->totalslices] / a->totalslices) a->chunksize *= 2;
1497:     totalchunks = 1 + (a->sliidx[a->totalslices] - 1) / a->chunksize;
1498:   }
1499:   if (totalchunks != a->totalchunks) {
1500:     PetscCall(PetscFree(a->chunk_slice_map));
1501:     PetscCall(PetscMalloc1(totalchunks, &a->chunk_slice_map));
1502:     a->totalchunks = totalchunks;
1503:   }
1504:   j = 0;
1505:   for (i = 0; i < totalchunks; i++) {
1506:     while (a->sliidx[j + 1] <= i * a->chunksize && j < a->totalslices) j++;
1507:     a->chunk_slice_map[i] = j;
1508:   }
1509: #endif
1510:   PetscFunctionReturn(PETSC_SUCCESS);
1511: }

1513: PetscErrorCode MatGetInfo_SeqSELL(Mat A, MatInfoType flag, MatInfo *info)
1514: {
1515:   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;

1517:   PetscFunctionBegin;
1518:   info->block_size   = 1.0;
1519:   info->nz_allocated = a->maxallocmat;
1520:   info->nz_used      = a->sliidx[a->totalslices]; /* include padding zeros */
1521:   info->nz_unneeded  = (a->maxallocmat - a->sliidx[a->totalslices]);
1522:   info->assemblies   = A->num_ass;
1523:   info->mallocs      = A->info.mallocs;
1524:   info->memory       = 0; /* REVIEW ME */
1525:   if (A->factortype) {
1526:     info->fill_ratio_given  = A->info.fill_ratio_given;
1527:     info->fill_ratio_needed = A->info.fill_ratio_needed;
1528:     info->factor_mallocs    = A->info.factor_mallocs;
1529:   } else {
1530:     info->fill_ratio_given  = 0;
1531:     info->fill_ratio_needed = 0;
1532:     info->factor_mallocs    = 0;
1533:   }
1534:   PetscFunctionReturn(PETSC_SUCCESS);
1535: }

1537: PetscErrorCode MatSetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], const PetscScalar v[], InsertMode is)
1538: {
1539:   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1540:   PetscInt     shift, i, k, l, low, high, t, ii, row, col, nrow;
1541:   PetscInt    *cp, nonew = a->nonew, lastcol = -1;
1542:   MatScalar   *vp, value;
1543: #if defined(PETSC_HAVE_CUPM)
1544:   PetscBool inserted = PETSC_FALSE;
1545:   PetscInt  mul      = DEVICE_MEM_ALIGN / a->sliceheight;
1546: #endif

1548:   PetscFunctionBegin;
1549:   for (k = 0; k < m; k++) { /* loop over added rows */
1550:     row = im[k];
1551:     if (row < 0) continue;
1552:     PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1);
1553:     shift = a->sliidx[row / a->sliceheight] + row % a->sliceheight; /* starting index of the row */
1554:     cp    = a->colidx + shift;                                      /* pointer to the row */
1555:     vp    = a->val + shift;                                         /* pointer to the row */
1556:     nrow  = a->rlen[row];
1557:     low   = 0;
1558:     high  = nrow;

1560:     for (l = 0; l < n; l++) { /* loop over added columns */
1561:       col = in[l];
1562:       if (col < 0) continue;
1563:       PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Col too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1);
1564:       if (a->roworiented) {
1565:         value = v[l + k * n];
1566:       } else {
1567:         value = v[k + l * m];
1568:       }
1569:       if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue;

1571:       /* search in this row for the specified column, i indicates the column to be set */
1572:       if (col <= lastcol) low = 0;
1573:       else high = nrow;
1574:       lastcol = col;
1575:       while (high - low > 5) {
1576:         t = (low + high) / 2;
1577:         if (*(cp + a->sliceheight * t) > col) high = t;
1578:         else low = t;
1579:       }
1580:       for (i = low; i < high; i++) {
1581:         if (*(cp + a->sliceheight * i) > col) break;
1582:         if (*(cp + a->sliceheight * i) == col) {
1583:           if (is == ADD_VALUES) *(vp + a->sliceheight * i) += value;
1584:           else *(vp + a->sliceheight * i) = value;
1585: #if defined(PETSC_HAVE_CUPM)
1586:           inserted = PETSC_TRUE;
1587: #endif
1588:           low = i + 1;
1589:           goto noinsert;
1590:         }
1591:       }
1592:       if (value == 0.0 && a->ignorezeroentries) goto noinsert;
1593:       if (nonew == 1) goto noinsert;
1594:       PetscCheck(nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero (%" PetscInt_FMT ", %" PetscInt_FMT ") in the matrix", row, col);
1595: #if defined(PETSC_HAVE_CUPM)
1596:       MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, a->sliceheight, row / a->sliceheight, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar, mul);
1597: #else
1598:       /* If the current row length exceeds the slice width (e.g. nrow==slice_width), allocate a new space, otherwise do nothing */
1599:       MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, a->sliceheight, row / a->sliceheight, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar, 1);
1600: #endif
1601:       /* add the new nonzero to the high position, shift the remaining elements in current row to the right by one slot */
1602:       for (ii = nrow - 1; ii >= i; ii--) {
1603:         *(cp + a->sliceheight * (ii + 1)) = *(cp + a->sliceheight * ii);
1604:         *(vp + a->sliceheight * (ii + 1)) = *(vp + a->sliceheight * ii);
1605:       }
1606:       a->rlen[row]++;
1607:       *(cp + a->sliceheight * i) = col;
1608:       *(vp + a->sliceheight * i) = value;
1609:       a->nz++;
1610: #if defined(PETSC_HAVE_CUPM)
1611:       inserted = PETSC_TRUE;
1612: #endif
1613:       low = i + 1;
1614:       high++;
1615:       nrow++;
1616:     noinsert:;
1617:     }
1618:     a->rlen[row] = nrow;
1619:   }
1620: #if defined(PETSC_HAVE_CUPM)
1621:   if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED && inserted) A->offloadmask = PETSC_OFFLOAD_CPU;
1622: #endif
1623:   PetscFunctionReturn(PETSC_SUCCESS);
1624: }

1626: PetscErrorCode MatCopy_SeqSELL(Mat A, Mat B, MatStructure str)
1627: {
1628:   PetscFunctionBegin;
1629:   /* If the two matrices have the same copy implementation, use fast copy. */
1630:   if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) {
1631:     Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1632:     Mat_SeqSELL *b = (Mat_SeqSELL *)B->data;

1634:     PetscCheck(a->sliidx[a->totalslices] == b->sliidx[b->totalslices], PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Number of nonzeros in two matrices are different");
1635:     PetscCall(PetscArraycpy(b->val, a->val, a->sliidx[a->totalslices]));
1636:   } else {
1637:     PetscCall(MatCopy_Basic(A, B, str));
1638:   }
1639:   PetscFunctionReturn(PETSC_SUCCESS);
1640: }

1642: PetscErrorCode MatSetUp_SeqSELL(Mat A)
1643: {
1644:   PetscFunctionBegin;
1645:   PetscCall(MatSeqSELLSetPreallocation(A, PETSC_DEFAULT, NULL));
1646:   PetscFunctionReturn(PETSC_SUCCESS);
1647: }

1649: PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A, PetscScalar *array[])
1650: {
1651:   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;

1653:   PetscFunctionBegin;
1654:   *array = a->val;
1655:   PetscFunctionReturn(PETSC_SUCCESS);
1656: }

1658: PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A, PetscScalar *array[])
1659: {
1660:   PetscFunctionBegin;
1661:   PetscFunctionReturn(PETSC_SUCCESS);
1662: }

1664: PetscErrorCode MatScale_SeqSELL(Mat inA, PetscScalar alpha)
1665: {
1666:   Mat_SeqSELL *a      = (Mat_SeqSELL *)inA->data;
1667:   MatScalar   *aval   = a->val;
1668:   PetscScalar  oalpha = alpha;
1669:   PetscBLASInt one    = 1, size;

1671:   PetscFunctionBegin;
1672:   PetscCall(PetscBLASIntCast(a->sliidx[a->totalslices], &size));
1673:   PetscCallBLAS("BLASscal", BLASscal_(&size, &oalpha, aval, &one));
1674:   PetscCall(PetscLogFlops(a->nz));
1675:   PetscCall(MatSeqSELLInvalidateDiagonal(inA));
1676: #if defined(PETSC_HAVE_CUPM)
1677:   if (inA->offloadmask != PETSC_OFFLOAD_UNALLOCATED) inA->offloadmask = PETSC_OFFLOAD_CPU;
1678: #endif
1679:   PetscFunctionReturn(PETSC_SUCCESS);
1680: }

1682: PetscErrorCode MatShift_SeqSELL(Mat Y, PetscScalar a)
1683: {
1684:   Mat_SeqSELL *y = (Mat_SeqSELL *)Y->data;

1686:   PetscFunctionBegin;
1687:   if (!Y->preallocated || !y->nz) PetscCall(MatSeqSELLSetPreallocation(Y, 1, NULL));
1688:   PetscCall(MatShift_Basic(Y, a));
1689:   PetscFunctionReturn(PETSC_SUCCESS);
1690: }

1692: PetscErrorCode MatSOR_SeqSELL(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx)
1693: {
1694:   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
1695:   PetscScalar       *x, sum, *t;
1696:   const MatScalar   *idiag = NULL, *mdiag;
1697:   const PetscScalar *b, *xb;
1698:   PetscInt           n, m = A->rmap->n, i, j, shift;
1699:   const PetscInt    *diag;

1701:   PetscFunctionBegin;
1702:   its = its * lits;

1704:   if (fshift != a->fshift || omega != a->omega) a->idiagvalid = PETSC_FALSE; /* must recompute idiag[] */
1705:   if (!a->idiagvalid) PetscCall(MatInvertDiagonal_SeqSELL(A, omega, fshift));
1706:   a->fshift = fshift;
1707:   a->omega  = omega;

1709:   diag  = a->diag;
1710:   t     = a->ssor_work;
1711:   idiag = a->idiag;
1712:   mdiag = a->mdiag;

1714:   PetscCall(VecGetArray(xx, &x));
1715:   PetscCall(VecGetArrayRead(bb, &b));
1716:   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
1717:   PetscCheck(flag != SOR_APPLY_UPPER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_UPPER is not implemented");
1718:   PetscCheck(flag != SOR_APPLY_LOWER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_LOWER is not implemented");
1719:   PetscCheck(!(flag & SOR_EISENSTAT), PETSC_COMM_SELF, PETSC_ERR_SUP, "No support yet for Eisenstat");

1721:   if (flag & SOR_ZERO_INITIAL_GUESS) {
1722:     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1723:       for (i = 0; i < m; i++) {
1724:         shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1725:         sum   = b[i];
1726:         n     = (diag[i] - shift) / a->sliceheight;
1727:         for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]];
1728:         t[i] = sum;
1729:         x[i] = sum * idiag[i];
1730:       }
1731:       xb = t;
1732:       PetscCall(PetscLogFlops(a->nz));
1733:     } else xb = b;
1734:     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1735:       for (i = m - 1; i >= 0; i--) {
1736:         shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1737:         sum   = xb[i];
1738:         n     = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1;
1739:         for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]];
1740:         if (xb == b) {
1741:           x[i] = sum * idiag[i];
1742:         } else {
1743:           x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1744:         }
1745:       }
1746:       PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */
1747:     }
1748:     its--;
1749:   }
1750:   while (its--) {
1751:     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1752:       for (i = 0; i < m; i++) {
1753:         /* lower */
1754:         shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1755:         sum   = b[i];
1756:         n     = (diag[i] - shift) / a->sliceheight;
1757:         for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]];
1758:         t[i] = sum; /* save application of the lower-triangular part */
1759:         /* upper */
1760:         n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1;
1761:         for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]];
1762:         x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1763:       }
1764:       xb = t;
1765:       PetscCall(PetscLogFlops(2.0 * a->nz));
1766:     } else xb = b;
1767:     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1768:       for (i = m - 1; i >= 0; i--) {
1769:         shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1770:         sum   = xb[i];
1771:         if (xb == b) {
1772:           /* whole matrix (no checkpointing available) */
1773:           n = a->rlen[i];
1774:           for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]];
1775:           x[i] = (1. - omega) * x[i] + (sum + mdiag[i] * x[i]) * idiag[i];
1776:         } else { /* lower-triangular part has been saved, so only apply upper-triangular */
1777:           n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1;
1778:           for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]];
1779:           x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1780:         }
1781:       }
1782:       if (xb == b) {
1783:         PetscCall(PetscLogFlops(2.0 * a->nz));
1784:       } else {
1785:         PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */
1786:       }
1787:     }
1788:   }
1789:   PetscCall(VecRestoreArray(xx, &x));
1790:   PetscCall(VecRestoreArrayRead(bb, &b));
1791:   PetscFunctionReturn(PETSC_SUCCESS);
1792: }

1794: static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL,
1795:                                        MatGetRow_SeqSELL,
1796:                                        MatRestoreRow_SeqSELL,
1797:                                        MatMult_SeqSELL,
1798:                                        /* 4*/ MatMultAdd_SeqSELL,
1799:                                        MatMultTranspose_SeqSELL,
1800:                                        MatMultTransposeAdd_SeqSELL,
1801:                                        NULL,
1802:                                        NULL,
1803:                                        NULL,
1804:                                        /* 10*/ NULL,
1805:                                        NULL,
1806:                                        NULL,
1807:                                        MatSOR_SeqSELL,
1808:                                        NULL,
1809:                                        /* 15*/ MatGetInfo_SeqSELL,
1810:                                        MatEqual_SeqSELL,
1811:                                        MatGetDiagonal_SeqSELL,
1812:                                        MatDiagonalScale_SeqSELL,
1813:                                        NULL,
1814:                                        /* 20*/ NULL,
1815:                                        MatAssemblyEnd_SeqSELL,
1816:                                        MatSetOption_SeqSELL,
1817:                                        MatZeroEntries_SeqSELL,
1818:                                        /* 24*/ NULL,
1819:                                        NULL,
1820:                                        NULL,
1821:                                        NULL,
1822:                                        NULL,
1823:                                        /* 29*/ MatSetUp_SeqSELL,
1824:                                        NULL,
1825:                                        NULL,
1826:                                        NULL,
1827:                                        NULL,
1828:                                        /* 34*/ MatDuplicate_SeqSELL,
1829:                                        NULL,
1830:                                        NULL,
1831:                                        NULL,
1832:                                        NULL,
1833:                                        /* 39*/ NULL,
1834:                                        NULL,
1835:                                        NULL,
1836:                                        MatGetValues_SeqSELL,
1837:                                        MatCopy_SeqSELL,
1838:                                        /* 44*/ NULL,
1839:                                        MatScale_SeqSELL,
1840:                                        MatShift_SeqSELL,
1841:                                        NULL,
1842:                                        NULL,
1843:                                        /* 49*/ NULL,
1844:                                        NULL,
1845:                                        NULL,
1846:                                        NULL,
1847:                                        NULL,
1848:                                        /* 54*/ MatFDColoringCreate_SeqXAIJ,
1849:                                        NULL,
1850:                                        NULL,
1851:                                        NULL,
1852:                                        NULL,
1853:                                        /* 59*/ NULL,
1854:                                        MatDestroy_SeqSELL,
1855:                                        MatView_SeqSELL,
1856:                                        NULL,
1857:                                        NULL,
1858:                                        /* 64*/ NULL,
1859:                                        NULL,
1860:                                        NULL,
1861:                                        NULL,
1862:                                        NULL,
1863:                                        /* 69*/ NULL,
1864:                                        NULL,
1865:                                        NULL,
1866:                                        NULL,
1867:                                        NULL,
1868:                                        /* 74*/ NULL,
1869:                                        MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */
1870:                                        NULL,
1871:                                        NULL,
1872:                                        NULL,
1873:                                        /* 79*/ NULL,
1874:                                        NULL,
1875:                                        NULL,
1876:                                        NULL,
1877:                                        NULL,
1878:                                        /* 84*/ NULL,
1879:                                        NULL,
1880:                                        NULL,
1881:                                        NULL,
1882:                                        NULL,
1883:                                        /* 89*/ NULL,
1884:                                        NULL,
1885:                                        NULL,
1886:                                        NULL,
1887:                                        NULL,
1888:                                        /* 94*/ NULL,
1889:                                        NULL,
1890:                                        NULL,
1891:                                        NULL,
1892:                                        NULL,
1893:                                        /* 99*/ NULL,
1894:                                        NULL,
1895:                                        NULL,
1896:                                        MatConjugate_SeqSELL,
1897:                                        NULL,
1898:                                        /*104*/ NULL,
1899:                                        NULL,
1900:                                        NULL,
1901:                                        NULL,
1902:                                        NULL,
1903:                                        /*109*/ NULL,
1904:                                        NULL,
1905:                                        NULL,
1906:                                        NULL,
1907:                                        MatMissingDiagonal_SeqSELL,
1908:                                        /*114*/ NULL,
1909:                                        NULL,
1910:                                        NULL,
1911:                                        NULL,
1912:                                        NULL,
1913:                                        /*119*/ NULL,
1914:                                        NULL,
1915:                                        NULL,
1916:                                        NULL,
1917:                                        NULL,
1918:                                        /*124*/ NULL,
1919:                                        NULL,
1920:                                        NULL,
1921:                                        NULL,
1922:                                        NULL,
1923:                                        /*129*/ NULL,
1924:                                        NULL,
1925:                                        NULL,
1926:                                        NULL,
1927:                                        NULL,
1928:                                        /*134*/ NULL,
1929:                                        NULL,
1930:                                        NULL,
1931:                                        NULL,
1932:                                        NULL,
1933:                                        /*139*/ NULL,
1934:                                        NULL,
1935:                                        NULL,
1936:                                        MatFDColoringSetUp_SeqXAIJ,
1937:                                        NULL,
1938:                                        /*144*/ NULL,
1939:                                        NULL,
1940:                                        NULL,
1941:                                        NULL,
1942:                                        NULL,
1943:                                        NULL,
1944:                                        /*150*/ NULL,
1945:                                        NULL,
1946:                                        NULL,
1947:                                        NULL,
1948:                                        NULL,
1949:                                        /*155*/ NULL,
1950:                                        NULL};

1952: static PetscErrorCode MatStoreValues_SeqSELL(Mat mat)
1953: {
1954:   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;

1956:   PetscFunctionBegin;
1957:   PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");

1959:   /* allocate space for values if not already there */
1960:   if (!a->saved_values) PetscCall(PetscMalloc1(a->sliidx[a->totalslices] + 1, &a->saved_values));

1962:   /* copy values over */
1963:   PetscCall(PetscArraycpy(a->saved_values, a->val, a->sliidx[a->totalslices]));
1964:   PetscFunctionReturn(PETSC_SUCCESS);
1965: }

1967: static PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat)
1968: {
1969:   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;

1971:   PetscFunctionBegin;
1972:   PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1973:   PetscCheck(a->saved_values, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatStoreValues(A);first");
1974:   PetscCall(PetscArraycpy(a->val, a->saved_values, a->sliidx[a->totalslices]));
1975:   PetscFunctionReturn(PETSC_SUCCESS);
1976: }

1978: static PetscErrorCode MatSeqSELLGetFillRatio_SeqSELL(Mat mat, PetscReal *ratio)
1979: {
1980:   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;

1982:   PetscFunctionBegin;
1983:   if (a->totalslices && a->sliidx[a->totalslices]) {
1984:     *ratio = (PetscReal)(a->sliidx[a->totalslices] - a->nz) / a->sliidx[a->totalslices];
1985:   } else {
1986:     *ratio = 0.0;
1987:   }
1988:   PetscFunctionReturn(PETSC_SUCCESS);
1989: }

1991: static PetscErrorCode MatSeqSELLGetMaxSliceWidth_SeqSELL(Mat mat, PetscInt *slicewidth)
1992: {
1993:   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1994:   PetscInt     i, current_slicewidth;

1996:   PetscFunctionBegin;
1997:   *slicewidth = 0;
1998:   for (i = 0; i < a->totalslices; i++) {
1999:     current_slicewidth = (a->sliidx[i + 1] - a->sliidx[i]) / a->sliceheight;
2000:     if (current_slicewidth > *slicewidth) *slicewidth = current_slicewidth;
2001:   }
2002:   PetscFunctionReturn(PETSC_SUCCESS);
2003: }

2005: static PetscErrorCode MatSeqSELLGetAvgSliceWidth_SeqSELL(Mat mat, PetscReal *slicewidth)
2006: {
2007:   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;

2009:   PetscFunctionBegin;
2010:   *slicewidth = 0;
2011:   if (a->totalslices) { *slicewidth = (PetscReal)a->sliidx[a->totalslices] / a->sliceheight / a->totalslices; }
2012:   PetscFunctionReturn(PETSC_SUCCESS);
2013: }

2015: static PetscErrorCode MatSeqSELLGetVarSliceSize_SeqSELL(Mat mat, PetscReal *variance)
2016: {
2017:   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
2018:   PetscReal    mean;
2019:   PetscInt     i, totalslices = a->totalslices, *sliidx = a->sliidx;

2021:   PetscFunctionBegin;
2022:   *variance = 0;
2023:   if (totalslices) {
2024:     mean = (PetscReal)sliidx[totalslices] / totalslices;
2025:     for (i = 1; i <= totalslices; i++) { *variance += ((PetscReal)(sliidx[i] - sliidx[i - 1]) - mean) * ((PetscReal)(sliidx[i] - sliidx[i - 1]) - mean) / totalslices; }
2026:   }
2027:   PetscFunctionReturn(PETSC_SUCCESS);
2028: }

2030: static PetscErrorCode MatSeqSELLSetSliceHeight_SeqSELL(Mat A, PetscInt sliceheight)
2031: {
2032:   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;

2034:   PetscFunctionBegin;
2035:   if (A->preallocated) PetscFunctionReturn(PETSC_SUCCESS);
2036:   PetscCheck(a->sliceheight <= 0 || a->sliceheight == sliceheight, PETSC_COMM_SELF, PETSC_ERR_SUP, "Cannot change slice height %" PetscInt_FMT " to %" PetscInt_FMT, a->sliceheight, sliceheight);
2037:   a->sliceheight = sliceheight;
2038: #if defined(PETSC_HAVE_CUPM)
2039:   PetscCheck(PetscMax(DEVICE_MEM_ALIGN, sliceheight) % PetscMin(DEVICE_MEM_ALIGN, sliceheight) == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The slice height is not compatible with DEVICE_MEM_ALIGN (one must be divisible by the other) %" PetscInt_FMT, sliceheight);
2040: #endif
2041:   PetscFunctionReturn(PETSC_SUCCESS);
2042: }

/*@
  MatSeqSELLGetFillRatio - returns a ratio that indicates the irregularity of the matrix.

  Not Collective

  Input Parameter:
. A - a `MATSEQSELL` matrix

  Output Parameter:
. ratio - ratio of the number of padded zeros to the number of allocated elements

  Level: intermediate

.seealso: `Mat`, `MATSEQSELL`, `MatSeqSELLGetAvgSliceWidth()`
@*/
PetscErrorCode MatSeqSELLGetFillRatio(Mat A, PetscReal *ratio)
{
  PetscFunctionBegin;
  /* dispatch to the implementation composed on A under "MatSeqSELLGetFillRatio_C" */
  PetscUseMethod(A, "MatSeqSELLGetFillRatio_C", (Mat, PetscReal *), (A, ratio));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@
  MatSeqSELLGetMaxSliceWidth - returns the maximum slice width.

  Not Collective

  Input Parameter:
. A - a `MATSEQSELL` matrix

  Output Parameter:
. slicewidth - maximum slice width

  Level: intermediate

.seealso: `Mat`, `MATSEQSELL`, `MatSeqSELLGetAvgSliceWidth()`
@*/
PetscErrorCode MatSeqSELLGetMaxSliceWidth(Mat A, PetscInt *slicewidth)
{
  PetscFunctionBegin;
  /* dispatch to the implementation composed on A under "MatSeqSELLGetMaxSliceWidth_C" */
  PetscUseMethod(A, "MatSeqSELLGetMaxSliceWidth_C", (Mat, PetscInt *), (A, slicewidth));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@
  MatSeqSELLGetAvgSliceWidth - returns the average slice width.

  Not Collective

  Input Parameter:
. A - a `MATSEQSELL` matrix

  Output Parameter:
. slicewidth - average slice width

  Level: intermediate

  Note:
  For `MATSEQSELL` the value is 0 when the matrix has no slices.

.seealso: `Mat`, `MATSEQSELL`, `MatSeqSELLGetMaxSliceWidth()`
@*/
PetscErrorCode MatSeqSELLGetAvgSliceWidth(Mat A, PetscReal *slicewidth)
{
  PetscFunctionBegin;
  /* dispatch to the implementation composed on A under "MatSeqSELLGetAvgSliceWidth_C" */
  PetscUseMethod(A, "MatSeqSELLGetAvgSliceWidth_C", (Mat, PetscReal *), (A, slicewidth));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@
  MatSeqSELLSetSliceHeight - sets the slice height.

  Not Collective

  Input Parameters:
+ A           - a `MATSEQSELL` matrix
- sliceheight - slice height

  Options Database Key:
. -mat_sell_slice_height <sliceheight> - sets the slice height when the `MATSEQSELL` matrix is created

  Notes:
  You cannot change the slice height once it has been set.

  The slice height must be set before `MatSetUp()` or MatXXXSetPreallocation() is called.
  For `MATSEQSELL`, a call made after the matrix has been preallocated is silently ignored.

  Level: intermediate

.seealso: `Mat`, `MATSEQSELL`, `MatSeqSELLGetVarSliceSize()`
@*/
PetscErrorCode MatSeqSELLSetSliceHeight(Mat A, PetscInt sliceheight)
{
  PetscFunctionBegin;
  /* dispatch to the implementation composed on A under "MatSeqSELLSetSliceHeight_C" */
  PetscUseMethod(A, "MatSeqSELLSetSliceHeight_C", (Mat, PetscInt), (A, sliceheight));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@
  MatSeqSELLGetVarSliceSize - returns the variance of the slice size.

  Not Collective

  Input Parameter:
. A - a `MATSEQSELL` matrix

  Output Parameter:
. variance - variance of the slice size

  Level: intermediate

  Note:
  For `MATSEQSELL` the value is 0 when the matrix has no slices.

.seealso: `Mat`, `MATSEQSELL`, `MatSeqSELLSetSliceHeight()`
@*/
PetscErrorCode MatSeqSELLGetVarSliceSize(Mat A, PetscReal *variance)
{
  PetscFunctionBegin;
  /* dispatch to the implementation composed on A under "MatSeqSELLGetVarSliceSize_C" */
  PetscUseMethod(A, "MatSeqSELLGetVarSliceSize_C", (Mat, PetscReal *), (A, variance));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Converters to the device matrix types; they are defined outside this file and only declared when the matching backend is enabled */
#if defined(PETSC_HAVE_CUDA)
PETSC_EXTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat);
#endif
#if defined(PETSC_HAVE_HIP)
PETSC_EXTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLHIP(Mat);
#endif

2164: PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B)
2165: {
2166:   Mat_SeqSELL *b;
2167:   PetscMPIInt  size;

2169:   PetscFunctionBegin;
2170:   PetscCall(PetscCitationsRegister(citation, &cited));
2171:   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
2172:   PetscCheck(size <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Comm must be of size 1");

2174:   PetscCall(PetscNew(&b));

2176:   B->data   = (void *)b;
2177:   B->ops[0] = MatOps_Values;

2179:   b->row                = NULL;
2180:   b->col                = NULL;
2181:   b->icol               = NULL;
2182:   b->reallocs           = 0;
2183:   b->ignorezeroentries  = PETSC_FALSE;
2184:   b->roworiented        = PETSC_TRUE;
2185:   b->nonew              = 0;
2186:   b->diag               = NULL;
2187:   b->solve_work         = NULL;
2188:   B->spptr              = NULL;
2189:   b->saved_values       = NULL;
2190:   b->idiag              = NULL;
2191:   b->mdiag              = NULL;
2192:   b->ssor_work          = NULL;
2193:   b->omega              = 1.0;
2194:   b->fshift             = 0.0;
2195:   b->idiagvalid         = PETSC_FALSE;
2196:   b->keepnonzeropattern = PETSC_FALSE;
2197:   b->sliceheight        = 0;

2199:   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELL));
2200:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetArray_C", MatSeqSELLGetArray_SeqSELL));
2201:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLRestoreArray_C", MatSeqSELLRestoreArray_SeqSELL));
2202:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatStoreValues_C", MatStoreValues_SeqSELL));
2203:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatRetrieveValues_C", MatRetrieveValues_SeqSELL));
2204:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetPreallocation_C", MatSeqSELLSetPreallocation_SeqSELL));
2205:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqaij_C", MatConvert_SeqSELL_SeqAIJ));
2206: #if defined(PETSC_HAVE_CUDA)
2207:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqsellcuda_C", MatConvert_SeqSELL_SeqSELLCUDA));
2208: #endif
2209: #if defined(PETSC_HAVE_HIP)
2210:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqsellhip_C", MatConvert_SeqSELL_SeqSELLHIP));
2211: #endif
2212:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetFillRatio_C", MatSeqSELLGetFillRatio_SeqSELL));
2213:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetMaxSliceWidth_C", MatSeqSELLGetMaxSliceWidth_SeqSELL));
2214:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetAvgSliceWidth_C", MatSeqSELLGetAvgSliceWidth_SeqSELL));
2215:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetVarSliceSize_C", MatSeqSELLGetVarSliceSize_SeqSELL));
2216:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetSliceHeight_C", MatSeqSELLSetSliceHeight_SeqSELL));

2218:   PetscObjectOptionsBegin((PetscObject)B);
2219:   {
2220:     PetscInt  newsh = -1;
2221:     PetscBool flg;
2222: #if defined(PETSC_HAVE_CUPM)
2223:     PetscInt chunksize = 0;
2224: #endif

2226:     PetscCall(PetscOptionsInt("-mat_sell_slice_height", "Set the slice height used to store SELL matrix", "MatSELLSetSliceHeight", newsh, &newsh, &flg));
2227:     if (flg) { PetscCall(MatSeqSELLSetSliceHeight(B, newsh)); }
2228: #if defined(PETSC_HAVE_CUPM)
2229:     PetscCall(PetscOptionsInt("-mat_sell_chunk_size", "Set the chunksize for load-balanced CUDA/HIP kernels. Choices include 64,128,256,512,1024", NULL, chunksize, &chunksize, &flg));
2230:     if (flg) {
2231:       PetscCheck(chunksize >= 64 && chunksize <= 1024 && chunksize % 64 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "chunksize must be a number in {64,128,256,512,1024}: value %" PetscInt_FMT, chunksize);
2232:       b->chunksize = chunksize;
2233:     }
2234: #endif
2235:   }
2236:   PetscOptionsEnd();
2237:   PetscFunctionReturn(PETSC_SUCCESS);
2238: }

2240: /*
2241:  Given a matrix generated with MatGetFactor() duplicates all the information in A into B
2242:  */
2243: static PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C, Mat A, MatDuplicateOption cpvalues, PetscBool mallocmatspace)
2244: {
2245:   Mat_SeqSELL *c = (Mat_SeqSELL *)C->data, *a = (Mat_SeqSELL *)A->data;
2246:   PetscInt     i, m                           = A->rmap->n;
2247:   PetscInt     totalslices = a->totalslices;

2249:   PetscFunctionBegin;
2250:   C->factortype = A->factortype;
2251:   c->row        = NULL;
2252:   c->col        = NULL;
2253:   c->icol       = NULL;
2254:   c->reallocs   = 0;
2255:   C->assembled  = PETSC_TRUE;

2257:   PetscCall(PetscLayoutReference(A->rmap, &C->rmap));
2258:   PetscCall(PetscLayoutReference(A->cmap, &C->cmap));

2260:   c->sliceheight = a->sliceheight;
2261:   PetscCall(PetscMalloc1(c->sliceheight * totalslices, &c->rlen));
2262:   PetscCall(PetscMalloc1(totalslices + 1, &c->sliidx));

2264:   for (i = 0; i < m; i++) c->rlen[i] = a->rlen[i];
2265:   for (i = 0; i < totalslices + 1; i++) c->sliidx[i] = a->sliidx[i];

2267:   /* allocate the matrix space */
2268:   if (mallocmatspace) {
2269:     PetscCall(PetscMalloc2(a->maxallocmat, &c->val, a->maxallocmat, &c->colidx));

2271:     c->singlemalloc = PETSC_TRUE;

2273:     if (m > 0) {
2274:       PetscCall(PetscArraycpy(c->colidx, a->colidx, a->maxallocmat));
2275:       if (cpvalues == MAT_COPY_VALUES) {
2276:         PetscCall(PetscArraycpy(c->val, a->val, a->maxallocmat));
2277:       } else {
2278:         PetscCall(PetscArrayzero(c->val, a->maxallocmat));
2279:       }
2280:     }
2281:   }

2283:   c->ignorezeroentries = a->ignorezeroentries;
2284:   c->roworiented       = a->roworiented;
2285:   c->nonew             = a->nonew;
2286:   if (a->diag) {
2287:     PetscCall(PetscMalloc1(m, &c->diag));
2288:     for (i = 0; i < m; i++) c->diag[i] = a->diag[i];
2289:   } else c->diag = NULL;

2291:   c->solve_work         = NULL;
2292:   c->saved_values       = NULL;
2293:   c->idiag              = NULL;
2294:   c->ssor_work          = NULL;
2295:   c->keepnonzeropattern = a->keepnonzeropattern;
2296:   c->free_val           = PETSC_TRUE;
2297:   c->free_colidx        = PETSC_TRUE;

2299:   c->maxallocmat  = a->maxallocmat;
2300:   c->maxallocrow  = a->maxallocrow;
2301:   c->rlenmax      = a->rlenmax;
2302:   c->nz           = a->nz;
2303:   C->preallocated = PETSC_TRUE;

2305:   c->nonzerorowcnt = a->nonzerorowcnt;
2306:   C->nonzerostate  = A->nonzerostate;

2308:   PetscCall(PetscFunctionListDuplicate(((PetscObject)A)->qlist, &((PetscObject)C)->qlist));
2309:   PetscFunctionReturn(PETSC_SUCCESS);
2310: }

2312: PetscErrorCode MatDuplicate_SeqSELL(Mat A, MatDuplicateOption cpvalues, Mat *B)
2313: {
2314:   PetscFunctionBegin;
2315:   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
2316:   PetscCall(MatSetSizes(*B, A->rmap->n, A->cmap->n, A->rmap->n, A->cmap->n));
2317:   if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) PetscCall(MatSetBlockSizesFromMats(*B, A, A));
2318:   PetscCall(MatSetType(*B, ((PetscObject)A)->type_name));
2319:   PetscCall(MatDuplicateNoCreate_SeqSELL(*B, A, cpvalues, PETSC_TRUE));
2320:   PetscFunctionReturn(PETSC_SUCCESS);
2321: }

/*MC
   MATSEQSELL - MATSEQSELL = "seqsell" - A matrix type to be used for sequential sparse matrices,
   based on the sliced Ellpack format, {cite}`zhangellpack2018`

   Options Database Key:
. -mat_type seqsell - sets the matrix type to `MATSEQSELL` during a call to `MatSetFromOptions()`

   Level: beginner

.seealso: `Mat`, `MatCreateSeqSELL()`, `MATSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATAIJ`, `MATMPIAIJ`
M*/

2335: /*MC
2336:    MATSELL - MATSELL = "sell" - A matrix type to be used for sparse matrices, {cite}`zhangellpack2018`

2338:    This matrix type is identical to `MATSEQSELL` when constructed with a single process communicator,
2339:    and `MATMPISELL` otherwise.  As a result, for single process communicators,
2340:   `MatSeqSELLSetPreallocation()` is supported, and similarly `MatMPISELLSetPreallocation()` is supported
2341:   for communicators controlling multiple processes.  It is recommended that you call both of
2342:   the above preallocation routines for simplicity.

2344:    Options Database Key:
. -mat_type sell - sets the matrix type to "sell" during a call to `MatSetFromOptions()`

2347:   Level: beginner

2349:   Notes:
2350:   This format is only supported for real scalars, double precision, and 32-bit indices (the defaults).

  It can provide better performance on Intel and AMD processors with AVX2 or AVX512 support for matrices that have a similar number of
  non-zeros in contiguous groups of rows. However, if the computation is memory bandwidth limited it may not provide much improvement.

2355:   Developer Notes:
2356:   On Intel (and AMD) systems some of the matrix operations use SIMD (AVX) instructions to achieve higher performance.

2358:   The sparse matrix format is as follows. For simplicity we assume a slice size of 2, it is actually 8
2359: .vb
2360:                             (2 0  3 4)
2361:    Consider the matrix A =  (5 0  6 0)
2362:                             (0 0  7 8)
2363:                             (0 0  9 9)

2365:    symbolically the Ellpack format can be written as

2367:         (2 3 4 |)           (0 2 3 |)
2368:    v =  (5 6 0 |)  colidx = (0 2 2 |)
2369:         --------            ---------
2370:         (7 8 |)             (2 3 |)
2371:         (9 9 |)             (2 3 |)

    The data for 2 contiguous rows of the matrix are stored together (in column-major format) (with any left-over rows handled as a special case).
    Any rows in a slice that have fewer columns than the rest of the slice (row 1 above) are padded with a previous valid column in their "extra" colidx[] locations and
    zeros in their "extra" v locations so that the matrix operations do not need special code to handle different length rows within the 2 rows in a slice.

2377:     The one-dimensional representation of v used in the code is (2 5 3 6 4 0 7 9 8 9)  and for colidx is (0 0 2 2 3 2 2 2 3 3)

2379: .ve

2381:     See `MatMult_SeqSELL()` for how this format is used with the SIMD operations to achieve high performance.

2383: .seealso: `Mat`, `MatCreateSeqSELL()`, `MatCreateSeqAIJ()`, `MatCreateSELL()`, `MATSEQSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATMPIAIJ`, `MATAIJ`
2384: M*/

/*@
  MatCreateSeqSELL - Creates a sparse matrix in `MATSEQSELL` format.

  Collective

  Input Parameters:
+ comm    - MPI communicator, set to `PETSC_COMM_SELF`
. m       - number of rows
. n       - number of columns
. rlenmax - maximum number of nonzeros in a row, ignored if `rlen` is provided
- rlen    - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`

  Output Parameter:
. A - the matrix

  Level: intermediate

  Notes:
  It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
  MatXXXXSetPreallocation() paradigm instead of this routine directly.
  [MatXXXXSetPreallocation() is, for example, `MatSeqSELLSetPreallocation()`]

  Specify the preallocated storage with either `rlenmax` or `rlen` (not both).
  Set `rlenmax` = `PETSC_DEFAULT` and `rlen` = `NULL` for PETSc to control dynamic memory
  allocation.

.seealso: `Mat`, `MATSEQSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatSeqSELLSetPreallocation()`, `MATSELL`, `MATMPISELL`
 @*/
PetscErrorCode MatCreateSeqSELL(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt rlenmax, const PetscInt rlen[], Mat *A)
{
  PetscFunctionBegin;
  PetscCall(MatCreate(comm, A));
  /* local and global sizes are both given as m by n */
  PetscCall(MatSetSizes(*A, m, n, m, n));
  PetscCall(MatSetType(*A, MATSEQSELL));
  /* the type was just set to MATSEQSELL, so the type-specific preallocation routine is called directly */
  PetscCall(MatSeqSELLSetPreallocation_SeqSELL(*A, rlenmax, rlen));
  PetscFunctionReturn(PETSC_SUCCESS);
}

2424: PetscErrorCode MatEqual_SeqSELL(Mat A, Mat B, PetscBool *flg)
2425: {
2426:   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data, *b = (Mat_SeqSELL *)B->data;
2427:   PetscInt     totalslices = a->totalslices;

2429:   PetscFunctionBegin;
2430:   /* If the  matrix dimensions are not equal,or no of nonzeros */
2431:   if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) || (a->nz != b->nz) || (a->rlenmax != b->rlenmax)) {
2432:     *flg = PETSC_FALSE;
2433:     PetscFunctionReturn(PETSC_SUCCESS);
2434:   }
2435:   /* if the a->colidx are the same */
2436:   PetscCall(PetscArraycmp(a->colidx, b->colidx, a->sliidx[totalslices], flg));
2437:   if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);
2438:   /* if a->val are the same */
2439:   PetscCall(PetscArraycmp(a->val, b->val, a->sliidx[totalslices], flg));
2440:   PetscFunctionReturn(PETSC_SUCCESS);
2441: }

2443: PetscErrorCode MatSeqSELLInvalidateDiagonal(Mat A)
2444: {
2445:   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;

2447:   PetscFunctionBegin;
2448:   a->idiagvalid = PETSC_FALSE;
2449:   PetscFunctionReturn(PETSC_SUCCESS);
2450: }

2452: PetscErrorCode MatConjugate_SeqSELL(Mat A)
2453: {
2454: #if defined(PETSC_USE_COMPLEX)
2455:   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
2456:   PetscInt     i;
2457:   PetscScalar *val = a->val;

2459:   PetscFunctionBegin;
2460:   for (i = 0; i < a->sliidx[a->totalslices]; i++) { val[i] = PetscConj(val[i]); }
2461:   #if defined(PETSC_HAVE_CUPM)
2462:   if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU;
2463:   #endif
2464: #else
2465:   PetscFunctionBegin;
2466: #endif
2467:   PetscFunctionReturn(PETSC_SUCCESS);
2468: }