Actual source code: sell.c
1: /*
2: Defines the basic matrix operations for the SELL matrix storage format.
3: */
4: #include <../src/mat/impls/sell/seq/sell.h>
5: #include <petscblaslapack.h>
6: #include <petsc/private/kernels/blocktranspose.h>
/* Flag paired with `citation` below; starts out PETSC_FALSE.
   NOTE(review): presumably handed to PetscCitationsRegister() so the entry is only registered once -
   neither variable is referenced in this part of the file, confirm at the point of use. */
static PetscBool  cited      = PETSC_FALSE;
/* BibTeX entry for the paper on vectorized sparse matrix-vector multiplication in PETSc using AVX-512 */
static const char citation[] = "@inproceedings{ZhangELLPACK2018,\n"
                               " author = {Hong Zhang and Richard T. Mills and Karl Rupp and Barry F. Smith},\n"
                               " title = {Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}},\n"
                               " booktitle = {Proceedings of the 47th International Conference on Parallel Processing},\n"
                               " year = 2018\n"
                               "}\n";
#if defined(PETSC_HAVE_IMMINTRIN_H) && (defined(__AVX512F__) || (defined(__AVX2__) && defined(__FMA__)) || defined(__AVX__)) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)

  #include <immintrin.h>

  /* Scale (bytes per double) passed to the gather intrinsics below */
  #if !defined(_MM_SCALE_8)
    #define _MM_SCALE_8 8
  #endif

  #if defined(__AVX512F__)
    /* these do not work
   vec_idx  = _mm512_loadunpackhi_epi32(vec_idx,acolidx);
   vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval);
*/

    /*
      AVX512_Mult_Private - one 8-entry step of the AVX-512 kernels: vec_y += vec_vals * x[vec_idx].

      Loads 8 column indices from `acolidx` and 8 matrix values from `aval` (both unaligned, unmasked loads),
      gathers the matching entries of `x`, and accumulates the products into vec_y with a fused multiply-add.

      `acolidx`, `aval` and `x` are NOT macro arguments: they are taken from the scope of the caller, which is
      also responsible for advancing `acolidx` and `aval` after each use.

      The body is wrapped in do { } while (0) so the macro behaves as a single statement.
    */
    #define AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \
      do { \
        vec_idx  = _mm256_loadu_si256((__m256i const *)acolidx); \
        vec_vals = _mm512_loadu_pd(aval); \
        vec_x    = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8); \
        vec_y    = _mm512_fmadd_pd(vec_x, vec_vals, vec_y); \
      } while (0)
  #elif defined(__AVX2__) && defined(__FMA__)
    /*
      AVX2_Mult_Private - one 4-entry step of the AVX2 kernel: vec_y += vec_vals * x[vec_idx].

      Same contract as the AVX-512 variant, but on 4 values / 4 indices: `acolidx`, `aval` and `x` come from the
      caller's scope and the caller advances the pointers.
    */
    #define AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \
      do { \
        vec_vals = _mm256_loadu_pd(aval); \
        vec_idx  = _mm_loadu_si128((__m128i const *)acolidx); /* SSE2 */ \
        vec_x    = _mm256_i32gather_pd(x, vec_idx, _MM_SCALE_8); \
        vec_y    = _mm256_fmadd_pd(vec_x, vec_vals, vec_y); \
      } while (0)
  #endif
#endif /* PETSC_HAVE_IMMINTRIN_H */
44: /*@
45: MatSeqSELLSetPreallocation - For good matrix assembly performance
46: the user should preallocate the matrix storage by setting the parameter `nz`
47: (or the array `nnz`).
49: Collective
51: Input Parameters:
52: + B - The `MATSEQSELL` matrix
53: . rlenmax - number of nonzeros per row (same for all rows), ignored if `rlen` is provided
54: - rlen - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
56: Level: intermediate
58: Notes:
59: Specify the preallocated storage with either `rlenmax` or `rlen` (not both).
60: Set `rlenmax` = `PETSC_DEFAULT` and `rlen` = `NULL` for PETSc to control dynamic memory
61: allocation.
63: You can call `MatGetInfo()` to get information on how effective the preallocation was;
64: for example the fields mallocs,nz_allocated,nz_used,nz_unneeded;
65: You can also run with the option `-info` and look for messages with the string
66: malloc in them to see if additional memory allocation was needed.
68: Developer Notes:
69: Use `rlenmax` of `MAT_SKIP_ALLOCATION` to not allocate any space for the matrix
70: entries or columns indices.
72: The maximum number of nonzeos in any row should be as accurate as possible.
73: If it is underestimated, you will get bad performance due to reallocation
74: (`MatSeqXSELLReallocateSELL()`).
76: .seealso: `Mat`, `MATSEQSELL`, `MATSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatGetInfo()`
77: @*/
78: PetscErrorCode MatSeqSELLSetPreallocation(Mat B, PetscInt rlenmax, const PetscInt rlen[])
79: {
80: PetscFunctionBegin;
83: PetscTryMethod(B, "MatSeqSELLSetPreallocation_C", (Mat, PetscInt, const PetscInt[]), (B, rlenmax, rlen));
84: PetscFunctionReturn(PETSC_SUCCESS);
85: }
87: PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B, PetscInt maxallocrow, const PetscInt rlen[])
88: {
89: Mat_SeqSELL *b;
90: PetscInt i, j, totalslices;
91: #if defined(PETSC_HAVE_CUPM)
92: PetscInt rlenmax = 0;
93: #endif
94: PetscBool skipallocation = PETSC_FALSE, realalloc = PETSC_FALSE;
96: PetscFunctionBegin;
97: if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE;
98: if (maxallocrow == MAT_SKIP_ALLOCATION) {
99: skipallocation = PETSC_TRUE;
100: maxallocrow = 0;
101: }
103: PetscCall(PetscLayoutSetUp(B->rmap));
104: PetscCall(PetscLayoutSetUp(B->cmap));
106: /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */
107: if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5;
108: PetscCheck(maxallocrow >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "maxallocrow cannot be less than 0: value %" PetscInt_FMT, maxallocrow);
109: if (rlen) {
110: for (i = 0; i < B->rmap->n; i++) {
111: PetscCheck(rlen[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, rlen[i]);
112: PetscCheck(rlen[i] <= B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be greater than row length: local row %" PetscInt_FMT " value %" PetscInt_FMT " rowlength %" PetscInt_FMT, i, rlen[i], B->cmap->n);
113: }
114: }
116: B->preallocated = PETSC_TRUE;
118: b = (Mat_SeqSELL *)B->data;
120: if (!b->sliceheight) { /* not set yet */
121: #if defined(PETSC_HAVE_CUPM)
122: b->sliceheight = 16;
123: #else
124: b->sliceheight = 8;
125: #endif
126: }
127: totalslices = PetscCeilInt(B->rmap->n, b->sliceheight);
128: b->totalslices = totalslices;
129: if (!skipallocation) {
130: if (B->rmap->n % b->sliceheight) PetscCall(PetscInfo(B, "Padding rows to the SEQSELL matrix because the number of rows is not the multiple of the slice height (value %" PetscInt_FMT ")\n", B->rmap->n));
132: if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */
133: PetscCall(PetscMalloc1(totalslices + 1, &b->sliidx));
134: }
135: if (!rlen) { /* if rlen is not provided, allocate same space for all the slices */
136: if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10;
137: else if (maxallocrow < 0) maxallocrow = 1;
138: #if defined(PETSC_HAVE_CUPM)
139: rlenmax = maxallocrow;
140: /* Pad the slice to DEVICE_MEM_ALIGN */
141: while (b->sliceheight * maxallocrow % DEVICE_MEM_ALIGN) maxallocrow++;
142: #endif
143: for (i = 0; i <= totalslices; i++) b->sliidx[i] = b->sliceheight * i * maxallocrow;
144: } else {
145: #if defined(PETSC_HAVE_CUPM)
146: PetscInt mul = DEVICE_MEM_ALIGN / b->sliceheight;
147: #endif
148: maxallocrow = 0;
149: b->sliidx[0] = 0;
150: for (i = 1; i < totalslices; i++) {
151: b->sliidx[i] = 0;
152: for (j = 0; j < b->sliceheight; j++) { b->sliidx[i] = PetscMax(b->sliidx[i], rlen[b->sliceheight * (i - 1) + j]); }
153: #if defined(PETSC_HAVE_CUPM)
154: if (mul != 0) { /* Pad the slice to DEVICE_MEM_ALIGN if sliceheight < DEVICE_MEM_ALIGN */
155: rlenmax = PetscMax(b->sliidx[i], rlenmax);
156: b->sliidx[i] = ((b->sliidx[i] - 1) / mul + 1) * mul;
157: }
158: #endif
159: maxallocrow = PetscMax(b->sliidx[i], maxallocrow);
160: PetscCall(PetscIntSumError(b->sliidx[i - 1], b->sliceheight * b->sliidx[i], &b->sliidx[i]));
161: }
162: /* last slice */
163: b->sliidx[totalslices] = 0;
164: for (j = b->sliceheight * (totalslices - 1); j < B->rmap->n; j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices], rlen[j]);
165: #if defined(PETSC_HAVE_CUPM)
166: if (mul != 0) {
167: rlenmax = PetscMax(b->sliidx[i], rlenmax);
168: b->sliidx[totalslices] = ((b->sliidx[totalslices] - 1) / mul + 1) * mul;
169: }
170: #endif
171: maxallocrow = PetscMax(b->sliidx[totalslices], maxallocrow);
172: b->sliidx[totalslices] = b->sliidx[totalslices - 1] + b->sliceheight * b->sliidx[totalslices];
173: }
175: /* allocate space for val, colidx, rlen */
176: /* FIXME: should B's old memory be unlogged? */
177: PetscCall(MatSeqXSELLFreeSELL(B, &b->val, &b->colidx));
178: /* FIXME: assuming an element of the bit array takes 8 bits */
179: PetscCall(PetscMalloc2(b->sliidx[totalslices], &b->val, b->sliidx[totalslices], &b->colidx));
180: /* b->rlen will count nonzeros in each row so far. We dont copy rlen to b->rlen because the matrix has not been set. */
181: PetscCall(PetscCalloc1(b->sliceheight * totalslices, &b->rlen));
183: b->singlemalloc = PETSC_TRUE;
184: b->free_val = PETSC_TRUE;
185: b->free_colidx = PETSC_TRUE;
186: } else {
187: b->free_val = PETSC_FALSE;
188: b->free_colidx = PETSC_FALSE;
189: }
191: b->nz = 0;
192: b->maxallocrow = maxallocrow;
193: #if defined(PETSC_HAVE_CUPM)
194: b->rlenmax = rlenmax;
195: #else
196: b->rlenmax = maxallocrow;
197: #endif
198: b->maxallocmat = b->sliidx[totalslices];
199: B->info.nz_unneeded = (double)b->maxallocmat;
200: if (realalloc) PetscCall(MatSetOption(B, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE));
201: PetscFunctionReturn(PETSC_SUCCESS);
202: }
204: static PetscErrorCode MatGetRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
205: {
206: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
207: PetscInt shift;
209: PetscFunctionBegin;
210: PetscCheck(row >= 0 && row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row %" PetscInt_FMT " out of range", row);
211: if (nz) *nz = a->rlen[row];
212: shift = a->sliidx[row / a->sliceheight] + (row % a->sliceheight);
213: if (!a->getrowcols) { PetscCall(PetscMalloc2(a->rlenmax, &a->getrowcols, a->rlenmax, &a->getrowvals)); }
214: if (idx) {
215: PetscInt j;
216: for (j = 0; j < a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift + a->sliceheight * j];
217: *idx = a->getrowcols;
218: }
219: if (v) {
220: PetscInt j;
221: for (j = 0; j < a->rlen[row]; j++) a->getrowvals[j] = a->val[shift + a->sliceheight * j];
222: *v = a->getrowvals;
223: }
224: PetscFunctionReturn(PETSC_SUCCESS);
225: }
/*
  MatRestoreRow_SeqSELL - Counterpart of MatGetRow_SeqSELL(); does nothing and always succeeds.

  None of the arguments is read or modified. MatGetRow_SeqSELL() hands out buffers stored in the matrix data
  structure (getrowcols/getrowvals), so there is nothing to release per call.
*/
static PetscErrorCode MatRestoreRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
{
  PetscFunctionBegin;
  PetscFunctionReturn(PETSC_SUCCESS);
}
233: PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype, MatReuse reuse, Mat *newmat)
234: {
235: Mat B;
236: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
237: PetscInt i;
239: PetscFunctionBegin;
240: if (reuse == MAT_REUSE_MATRIX) {
241: B = *newmat;
242: PetscCall(MatZeroEntries(B));
243: } else {
244: PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B));
245: PetscCall(MatSetSizes(B, A->rmap->n, A->cmap->n, A->rmap->N, A->cmap->N));
246: PetscCall(MatSetType(B, MATSEQAIJ));
247: PetscCall(MatSeqAIJSetPreallocation(B, 0, a->rlen));
248: }
250: for (i = 0; i < A->rmap->n; i++) {
251: PetscInt nz = 0, *cols = NULL;
252: PetscScalar *vals = NULL;
254: PetscCall(MatGetRow_SeqSELL(A, i, &nz, &cols, &vals));
255: PetscCall(MatSetValues(B, 1, &i, nz, cols, vals, INSERT_VALUES));
256: PetscCall(MatRestoreRow_SeqSELL(A, i, &nz, &cols, &vals));
257: }
259: PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
260: PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
261: B->rmap->bs = A->rmap->bs;
263: if (reuse == MAT_INPLACE_MATRIX) {
264: PetscCall(MatHeaderReplace(A, &B));
265: } else {
266: *newmat = B;
267: }
268: PetscFunctionReturn(PETSC_SUCCESS);
269: }
271: #include <../src/mat/impls/aij/seq/aij.h>
273: PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A, MatType newtype, MatReuse reuse, Mat *newmat)
274: {
275: Mat B;
276: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
277: PetscInt *ai = a->i, m = A->rmap->N, n = A->cmap->N, i, *rowlengths, row, ncols;
278: const PetscInt *cols;
279: const PetscScalar *vals;
281: PetscFunctionBegin;
282: if (reuse == MAT_REUSE_MATRIX) {
283: B = *newmat;
284: } else {
285: if (PetscDefined(USE_DEBUG) || !a->ilen) {
286: PetscCall(PetscMalloc1(m, &rowlengths));
287: for (i = 0; i < m; i++) rowlengths[i] = ai[i + 1] - ai[i];
288: }
289: if (PetscDefined(USE_DEBUG) && a->ilen) {
290: PetscBool eq;
291: PetscCall(PetscMemcmp(rowlengths, a->ilen, m * sizeof(PetscInt), &eq));
292: PetscCheck(eq, PETSC_COMM_SELF, PETSC_ERR_PLIB, "SeqAIJ ilen array incorrect");
293: PetscCall(PetscFree(rowlengths));
294: rowlengths = a->ilen;
295: } else if (a->ilen) rowlengths = a->ilen;
296: PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B));
297: PetscCall(MatSetSizes(B, m, n, m, n));
298: PetscCall(MatSetType(B, MATSEQSELL));
299: PetscCall(MatSeqSELLSetPreallocation(B, 0, rowlengths));
300: if (rowlengths != a->ilen) PetscCall(PetscFree(rowlengths));
301: }
303: for (row = 0; row < m; row++) {
304: PetscCall(MatGetRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals));
305: PetscCall(MatSetValues_SeqSELL(B, 1, &row, ncols, cols, vals, INSERT_VALUES));
306: PetscCall(MatRestoreRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals));
307: }
308: PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
309: PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
310: B->rmap->bs = A->rmap->bs;
312: if (reuse == MAT_INPLACE_MATRIX) {
313: PetscCall(MatHeaderReplace(A, &B));
314: } else {
315: *newmat = B;
316: }
317: PetscFunctionReturn(PETSC_SUCCESS);
318: }
320: PetscErrorCode MatMult_SeqSELL(Mat A, Vec xx, Vec yy)
321: {
322: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
323: PetscScalar *y;
324: const PetscScalar *x;
325: const MatScalar *aval = a->val;
326: PetscInt totalslices = a->totalslices;
327: const PetscInt *acolidx = a->colidx;
328: PetscInt i, j;
329: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
330: __m512d vec_x, vec_y, vec_vals;
331: __m256i vec_idx;
332: __mmask8 mask;
333: __m512d vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4;
334: __m256i vec_idx2, vec_idx3, vec_idx4;
335: #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
336: __m128i vec_idx;
337: __m256d vec_x, vec_y, vec_y2, vec_vals;
338: MatScalar yval;
339: PetscInt r, rows_left, row, nnz_in_row;
340: #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
341: __m128d vec_x_tmp;
342: __m256d vec_x, vec_y, vec_y2, vec_vals;
343: MatScalar yval;
344: PetscInt r, rows_left, row, nnz_in_row;
345: #else
346: PetscInt k, sliceheight = a->sliceheight;
347: PetscScalar *sum;
348: #endif
350: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
351: #pragma disjoint(*x, *y, *aval)
352: #endif
354: PetscFunctionBegin;
355: PetscCall(VecGetArrayRead(xx, &x));
356: PetscCall(VecGetArray(yy, &y));
357: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
358: PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
359: for (i = 0; i < totalslices; i++) { /* loop over slices */
360: PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
361: PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
363: vec_y = _mm512_setzero_pd();
364: vec_y2 = _mm512_setzero_pd();
365: vec_y3 = _mm512_setzero_pd();
366: vec_y4 = _mm512_setzero_pd();
368: j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */
369: switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) {
370: case 3:
371: AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
372: acolidx += 8;
373: aval += 8;
374: AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
375: acolidx += 8;
376: aval += 8;
377: AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
378: acolidx += 8;
379: aval += 8;
380: j += 3;
381: break;
382: case 2:
383: AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
384: acolidx += 8;
385: aval += 8;
386: AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
387: acolidx += 8;
388: aval += 8;
389: j += 2;
390: break;
391: case 1:
392: AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
393: acolidx += 8;
394: aval += 8;
395: j += 1;
396: break;
397: }
398: #pragma novector
399: for (; j < (a->sliidx[i + 1] >> 3); j += 4) {
400: AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
401: acolidx += 8;
402: aval += 8;
403: AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
404: acolidx += 8;
405: aval += 8;
406: AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
407: acolidx += 8;
408: aval += 8;
409: AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4);
410: acolidx += 8;
411: aval += 8;
412: }
414: vec_y = _mm512_add_pd(vec_y, vec_y2);
415: vec_y = _mm512_add_pd(vec_y, vec_y3);
416: vec_y = _mm512_add_pd(vec_y, vec_y4);
417: if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
418: mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07)));
419: _mm512_mask_storeu_pd(&y[8 * i], mask, vec_y);
420: } else {
421: _mm512_storeu_pd(&y[8 * i], vec_y);
422: }
423: }
424: #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
425: PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
426: for (i = 0; i < totalslices; i++) { /* loop over full slices */
427: PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
428: PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
430: /* last slice may have padding rows. Don't use vectorization. */
431: if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
432: rows_left = A->rmap->n - 8 * i;
433: for (r = 0; r < rows_left; ++r) {
434: yval = (MatScalar)0;
435: row = 8 * i + r;
436: nnz_in_row = a->rlen[row];
437: for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
438: y[row] = yval;
439: }
440: break;
441: }
443: vec_y = _mm256_setzero_pd();
444: vec_y2 = _mm256_setzero_pd();
446: /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
447: #pragma novector
448: #pragma unroll(2)
449: for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
450: AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
451: aval += 4;
452: acolidx += 4;
453: AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y2);
454: aval += 4;
455: acolidx += 4;
456: }
458: _mm256_storeu_pd(y + i * 8, vec_y);
459: _mm256_storeu_pd(y + i * 8 + 4, vec_y2);
460: }
461: #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
462: PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
463: for (i = 0; i < totalslices; i++) { /* loop over full slices */
464: PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
465: PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
467: vec_y = _mm256_setzero_pd();
468: vec_y2 = _mm256_setzero_pd();
470: /* last slice may have padding rows. Don't use vectorization. */
471: if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
472: rows_left = A->rmap->n - 8 * i;
473: for (r = 0; r < rows_left; ++r) {
474: yval = (MatScalar)0;
475: row = 8 * i + r;
476: nnz_in_row = a->rlen[row];
477: for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
478: y[row] = yval;
479: }
480: break;
481: }
483: /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
484: #pragma novector
485: #pragma unroll(2)
486: for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
487: vec_vals = _mm256_loadu_pd(aval);
488: vec_x_tmp = _mm_setzero_pd();
489: vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
490: vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
491: vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
492: vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
493: vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
494: vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
495: vec_y = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y);
496: aval += 4;
498: vec_vals = _mm256_loadu_pd(aval);
499: vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
500: vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
501: vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
502: vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
503: vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
504: vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
505: vec_y2 = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2);
506: aval += 4;
507: }
509: _mm256_storeu_pd(y + i * 8, vec_y);
510: _mm256_storeu_pd(y + i * 8 + 4, vec_y2);
511: }
512: #else
513: PetscCall(PetscMalloc1(sliceheight, &sum));
514: for (i = 0; i < totalslices; i++) { /* loop over slices */
515: for (j = 0; j < sliceheight; j++) {
516: sum[j] = 0.0;
517: for (k = a->sliidx[i] + j; k < a->sliidx[i + 1]; k += sliceheight) sum[j] += aval[k] * x[acolidx[k]];
518: }
519: if (i == totalslices - 1 && (A->rmap->n % sliceheight)) { /* if last slice has padding rows */
520: for (j = 0; j < (A->rmap->n % sliceheight); j++) y[sliceheight * i + j] = sum[j];
521: } else {
522: for (j = 0; j < sliceheight; j++) y[sliceheight * i + j] = sum[j];
523: }
524: }
525: PetscCall(PetscFree(sum));
526: #endif
528: PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt)); /* theoretical minimal FLOPs */
529: PetscCall(VecRestoreArrayRead(xx, &x));
530: PetscCall(VecRestoreArray(yy, &y));
531: PetscFunctionReturn(PETSC_SUCCESS);
532: }
534: #include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h>
535: PetscErrorCode MatMultAdd_SeqSELL(Mat A, Vec xx, Vec yy, Vec zz)
536: {
537: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
538: PetscScalar *y, *z;
539: const PetscScalar *x;
540: const MatScalar *aval = a->val;
541: PetscInt totalslices = a->totalslices;
542: const PetscInt *acolidx = a->colidx;
543: PetscInt i, j;
544: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
545: __m512d vec_x, vec_y, vec_vals;
546: __m256i vec_idx;
547: __mmask8 mask = 0;
548: __m512d vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4;
549: __m256i vec_idx2, vec_idx3, vec_idx4;
550: #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
551: __m128d vec_x_tmp;
552: __m256d vec_x, vec_y, vec_y2, vec_vals;
553: MatScalar yval;
554: PetscInt r, row, nnz_in_row;
555: #else
556: PetscInt k, sliceheight = a->sliceheight;
557: PetscScalar *sum;
558: #endif
560: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
561: #pragma disjoint(*x, *y, *aval)
562: #endif
564: PetscFunctionBegin;
565: if (!a->nz) {
566: PetscCall(VecCopy(yy, zz));
567: PetscFunctionReturn(PETSC_SUCCESS);
568: }
569: PetscCall(VecGetArrayRead(xx, &x));
570: PetscCall(VecGetArrayPair(yy, zz, &y, &z));
571: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
572: PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
573: for (i = 0; i < totalslices; i++) { /* loop over slices */
574: PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
575: PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
577: if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
578: mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07)));
579: vec_y = _mm512_mask_loadu_pd(vec_y, mask, &y[8 * i]);
580: } else {
581: vec_y = _mm512_loadu_pd(&y[8 * i]);
582: }
583: vec_y2 = _mm512_setzero_pd();
584: vec_y3 = _mm512_setzero_pd();
585: vec_y4 = _mm512_setzero_pd();
587: j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */
588: switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) {
589: case 3:
590: AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
591: acolidx += 8;
592: aval += 8;
593: AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
594: acolidx += 8;
595: aval += 8;
596: AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
597: acolidx += 8;
598: aval += 8;
599: j += 3;
600: break;
601: case 2:
602: AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
603: acolidx += 8;
604: aval += 8;
605: AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
606: acolidx += 8;
607: aval += 8;
608: j += 2;
609: break;
610: case 1:
611: AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
612: acolidx += 8;
613: aval += 8;
614: j += 1;
615: break;
616: }
617: #pragma novector
618: for (; j < (a->sliidx[i + 1] >> 3); j += 4) {
619: AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
620: acolidx += 8;
621: aval += 8;
622: AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
623: acolidx += 8;
624: aval += 8;
625: AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
626: acolidx += 8;
627: aval += 8;
628: AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4);
629: acolidx += 8;
630: aval += 8;
631: }
633: vec_y = _mm512_add_pd(vec_y, vec_y2);
634: vec_y = _mm512_add_pd(vec_y, vec_y3);
635: vec_y = _mm512_add_pd(vec_y, vec_y4);
636: if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
637: _mm512_mask_storeu_pd(&z[8 * i], mask, vec_y);
638: } else {
639: _mm512_storeu_pd(&z[8 * i], vec_y);
640: }
641: }
642: #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
643: PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
644: for (i = 0; i < totalslices; i++) { /* loop over full slices */
645: PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
646: PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
648: /* last slice may have padding rows. Don't use vectorization. */
649: if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
650: for (r = 0; r < (A->rmap->n & 0x07); ++r) {
651: row = 8 * i + r;
652: yval = (MatScalar)0.0;
653: nnz_in_row = a->rlen[row];
654: for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
655: z[row] = y[row] + yval;
656: }
657: break;
658: }
660: vec_y = _mm256_loadu_pd(y + 8 * i);
661: vec_y2 = _mm256_loadu_pd(y + 8 * i + 4);
663: /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
664: for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
665: vec_vals = _mm256_loadu_pd(aval);
666: vec_x_tmp = _mm_setzero_pd();
667: vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
668: vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
669: vec_x = _mm256_setzero_pd();
670: vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
671: vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
672: vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
673: vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
674: vec_y = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y);
675: aval += 4;
677: vec_vals = _mm256_loadu_pd(aval);
678: vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
679: vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
680: vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
681: vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
682: vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
683: vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
684: vec_y2 = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2);
685: aval += 4;
686: }
688: _mm256_storeu_pd(z + i * 8, vec_y);
689: _mm256_storeu_pd(z + i * 8 + 4, vec_y2);
690: }
691: #else
692: PetscCall(PetscMalloc1(sliceheight, &sum));
693: for (i = 0; i < totalslices; i++) { /* loop over slices */
694: for (j = 0; j < sliceheight; j++) {
695: sum[j] = 0.0;
696: for (k = a->sliidx[i] + j; k < a->sliidx[i + 1]; k += sliceheight) sum[j] += aval[k] * x[acolidx[k]];
697: }
698: if (i == totalslices - 1 && (A->rmap->n % sliceheight)) {
699: for (j = 0; j < (A->rmap->n % sliceheight); j++) z[sliceheight * i + j] = y[sliceheight * i + j] + sum[j];
700: } else {
701: for (j = 0; j < sliceheight; j++) z[sliceheight * i + j] = y[sliceheight * i + j] + sum[j];
702: }
703: }
704: PetscCall(PetscFree(sum));
705: #endif
707: PetscCall(PetscLogFlops(2.0 * a->nz));
708: PetscCall(VecRestoreArrayRead(xx, &x));
709: PetscCall(VecRestoreArrayPair(yy, zz, &y, &z));
710: PetscFunctionReturn(PETSC_SUCCESS);
711: }
713: PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A, Vec xx, Vec zz, Vec yy)
714: {
715: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
716: PetscScalar *y;
717: const PetscScalar *x;
718: const MatScalar *aval = a->val;
719: const PetscInt *acolidx = a->colidx;
720: PetscInt i, j, r, row, nnz_in_row, totalslices = a->totalslices, sliceheight = a->sliceheight;
722: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
723: #pragma disjoint(*x, *y, *aval)
724: #endif
726: PetscFunctionBegin;
727: if (A->symmetric == PETSC_BOOL3_TRUE) {
728: PetscCall(MatMultAdd_SeqSELL(A, xx, zz, yy));
729: PetscFunctionReturn(PETSC_SUCCESS);
730: }
731: if (zz != yy) PetscCall(VecCopy(zz, yy));
733: if (a->nz) {
734: PetscCall(VecGetArrayRead(xx, &x));
735: PetscCall(VecGetArray(yy, &y));
736: for (i = 0; i < a->totalslices; i++) { /* loop over slices */
737: if (i == totalslices - 1 && (A->rmap->n % sliceheight)) {
738: for (r = 0; r < (A->rmap->n % sliceheight); ++r) {
739: row = sliceheight * i + r;
740: nnz_in_row = a->rlen[row];
741: for (j = 0; j < nnz_in_row; ++j) y[acolidx[sliceheight * j + r]] += aval[sliceheight * j + r] * x[row];
742: }
743: break;
744: }
745: for (r = 0; r < sliceheight; ++r)
746: for (j = a->sliidx[i] + r; j < a->sliidx[i + 1]; j += sliceheight) y[acolidx[j]] += aval[j] * x[sliceheight * i + r];
747: }
748: PetscCall(PetscLogFlops(2.0 * a->nz));
749: PetscCall(VecRestoreArrayRead(xx, &x));
750: PetscCall(VecRestoreArray(yy, &y));
751: }
752: PetscFunctionReturn(PETSC_SUCCESS);
753: }
755: PetscErrorCode MatMultTranspose_SeqSELL(Mat A, Vec xx, Vec yy)
756: {
757: PetscFunctionBegin;
758: if (A->symmetric == PETSC_BOOL3_TRUE) {
759: PetscCall(MatMult_SeqSELL(A, xx, yy));
760: } else {
761: PetscCall(VecSet(yy, 0.0));
762: PetscCall(MatMultTransposeAdd_SeqSELL(A, xx, yy, yy));
763: }
764: PetscFunctionReturn(PETSC_SUCCESS);
765: }
767: /*
768: Checks for missing diagonals
769: */
770: PetscErrorCode MatMissingDiagonal_SeqSELL(Mat A, PetscBool *missing, PetscInt *d)
771: {
772: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
773: PetscInt *diag, i;
775: PetscFunctionBegin;
776: *missing = PETSC_FALSE;
777: if (A->rmap->n > 0 && !a->colidx) {
778: *missing = PETSC_TRUE;
779: if (d) *d = 0;
780: PetscCall(PetscInfo(A, "Matrix has no entries therefore is missing diagonal\n"));
781: } else {
782: diag = a->diag;
783: for (i = 0; i < A->rmap->n; i++) {
784: if (diag[i] == -1) {
785: *missing = PETSC_TRUE;
786: if (d) *d = i;
787: PetscCall(PetscInfo(A, "Matrix is missing diagonal number %" PetscInt_FMT "\n", i));
788: break;
789: }
790: }
791: }
792: PetscFunctionReturn(PETSC_SUCCESS);
793: }
795: PetscErrorCode MatMarkDiagonal_SeqSELL(Mat A)
796: {
797: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
798: PetscInt i, j, m = A->rmap->n, shift;
800: PetscFunctionBegin;
801: if (!a->diag) {
802: PetscCall(PetscMalloc1(m, &a->diag));
803: a->free_diag = PETSC_TRUE;
804: }
805: for (i = 0; i < m; i++) { /* loop over rows */
806: shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
807: a->diag[i] = -1;
808: for (j = 0; j < a->rlen[i]; j++) {
809: if (a->colidx[shift + a->sliceheight * j] == i) {
810: a->diag[i] = shift + a->sliceheight * j;
811: break;
812: }
813: }
814: }
815: PetscFunctionReturn(PETSC_SUCCESS);
816: }
818: /*
819: Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways
820: */
821: PetscErrorCode MatInvertDiagonal_SeqSELL(Mat A, PetscScalar omega, PetscScalar fshift)
822: {
823: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
824: PetscInt i, *diag, m = A->rmap->n;
825: MatScalar *val = a->val;
826: PetscScalar *idiag, *mdiag;
828: PetscFunctionBegin;
829: if (a->idiagvalid) PetscFunctionReturn(PETSC_SUCCESS);
830: PetscCall(MatMarkDiagonal_SeqSELL(A));
831: diag = a->diag;
832: if (!a->idiag) {
833: PetscCall(PetscMalloc3(m, &a->idiag, m, &a->mdiag, m, &a->ssor_work));
834: val = a->val;
835: }
836: mdiag = a->mdiag;
837: idiag = a->idiag;
839: if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) {
840: for (i = 0; i < m; i++) {
841: mdiag[i] = val[diag[i]];
842: if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */
843: PetscCheck(PetscRealPart(fshift), PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Zero diagonal on row %" PetscInt_FMT, i);
844: PetscCall(PetscInfo(A, "Zero diagonal on row %" PetscInt_FMT "\n", i));
845: A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
846: A->factorerror_zeropivot_value = 0.0;
847: A->factorerror_zeropivot_row = i;
848: }
849: idiag[i] = 1.0 / val[diag[i]];
850: }
851: PetscCall(PetscLogFlops(m));
852: } else {
853: for (i = 0; i < m; i++) {
854: mdiag[i] = val[diag[i]];
855: idiag[i] = omega / (fshift + val[diag[i]]);
856: }
857: PetscCall(PetscLogFlops(2.0 * m));
858: }
859: a->idiagvalid = PETSC_TRUE;
860: PetscFunctionReturn(PETSC_SUCCESS);
861: }
863: PetscErrorCode MatZeroEntries_SeqSELL(Mat A)
864: {
865: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
867: PetscFunctionBegin;
868: PetscCall(PetscArrayzero(a->val, a->sliidx[a->totalslices]));
869: PetscCall(MatSeqSELLInvalidateDiagonal(A));
870: PetscFunctionReturn(PETSC_SUCCESS);
871: }
873: PetscErrorCode MatDestroy_SeqSELL(Mat A)
874: {
875: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
877: PetscFunctionBegin;
878: PetscCall(PetscLogObjectState((PetscObject)A, "Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT ", NZ=%" PetscInt_FMT, A->rmap->n, A->cmap->n, a->nz));
879: PetscCall(MatSeqXSELLFreeSELL(A, &a->val, &a->colidx));
880: PetscCall(ISDestroy(&a->row));
881: PetscCall(ISDestroy(&a->col));
882: PetscCall(PetscFree(a->diag));
883: PetscCall(PetscFree(a->rlen));
884: PetscCall(PetscFree(a->sliidx));
885: PetscCall(PetscFree3(a->idiag, a->mdiag, a->ssor_work));
886: PetscCall(PetscFree(a->solve_work));
887: PetscCall(ISDestroy(&a->icol));
888: PetscCall(PetscFree(a->saved_values));
889: PetscCall(PetscFree2(a->getrowcols, a->getrowvals));
890: PetscCall(PetscFree(A->data));
891: #if defined(PETSC_HAVE_CUPM)
892: PetscCall(PetscFree(a->chunk_slice_map));
893: #endif
895: PetscCall(PetscObjectChangeTypeName((PetscObject)A, NULL));
896: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatStoreValues_C", NULL));
897: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatRetrieveValues_C", NULL));
898: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetPreallocation_C", NULL));
899: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetArray_C", NULL));
900: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLRestoreArray_C", NULL));
901: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqaij_C", NULL));
902: #if defined(PETSC_HAVE_CUDA)
903: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqsellcuda_C", NULL));
904: #endif
905: #if defined(PETSC_HAVE_HIP)
906: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqsellhip_C", NULL));
907: #endif
908: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetFillRatio_C", NULL));
909: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetMaxSliceWidth_C", NULL));
910: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetAvgSliceWidth_C", NULL));
911: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetVarSliceSize_C", NULL));
912: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetSliceHeight_C", NULL));
913: PetscFunctionReturn(PETSC_SUCCESS);
914: }
916: PetscErrorCode MatSetOption_SeqSELL(Mat A, MatOption op, PetscBool flg)
917: {
918: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
920: PetscFunctionBegin;
921: switch (op) {
922: case MAT_ROW_ORIENTED:
923: a->roworiented = flg;
924: break;
925: case MAT_KEEP_NONZERO_PATTERN:
926: a->keepnonzeropattern = flg;
927: break;
928: case MAT_NEW_NONZERO_LOCATIONS:
929: a->nonew = (flg ? 0 : 1);
930: break;
931: case MAT_NEW_NONZERO_LOCATION_ERR:
932: a->nonew = (flg ? -1 : 0);
933: break;
934: case MAT_NEW_NONZERO_ALLOCATION_ERR:
935: a->nonew = (flg ? -2 : 0);
936: break;
937: case MAT_UNUSED_NONZERO_LOCATION_ERR:
938: a->nounused = (flg ? -1 : 0);
939: break;
940: case MAT_FORCE_DIAGONAL_ENTRIES:
941: case MAT_IGNORE_OFF_PROC_ENTRIES:
942: case MAT_USE_HASH_TABLE:
943: case MAT_SORTED_FULL:
944: PetscCall(PetscInfo(A, "Option %s ignored\n", MatOptions[op]));
945: break;
946: case MAT_SPD:
947: case MAT_SYMMETRIC:
948: case MAT_STRUCTURALLY_SYMMETRIC:
949: case MAT_HERMITIAN:
950: case MAT_SYMMETRY_ETERNAL:
951: case MAT_STRUCTURAL_SYMMETRY_ETERNAL:
952: case MAT_SPD_ETERNAL:
953: /* These options are handled directly by MatSetOption() */
954: break;
955: default:
956: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unknown option %d", op);
957: }
958: PetscFunctionReturn(PETSC_SUCCESS);
959: }
961: PetscErrorCode MatGetDiagonal_SeqSELL(Mat A, Vec v)
962: {
963: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
964: PetscInt i, j, n, shift;
965: PetscScalar *x, zero = 0.0;
967: PetscFunctionBegin;
968: PetscCall(VecGetLocalSize(v, &n));
969: PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
971: if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) {
972: PetscInt *diag = a->diag;
973: PetscCall(VecGetArray(v, &x));
974: for (i = 0; i < n; i++) x[i] = 1.0 / a->val[diag[i]];
975: PetscCall(VecRestoreArray(v, &x));
976: PetscFunctionReturn(PETSC_SUCCESS);
977: }
979: PetscCall(VecSet(v, zero));
980: PetscCall(VecGetArray(v, &x));
981: for (i = 0; i < n; i++) { /* loop over rows */
982: shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
983: x[i] = 0;
984: for (j = 0; j < a->rlen[i]; j++) {
985: if (a->colidx[shift + a->sliceheight * j] == i) {
986: x[i] = a->val[shift + a->sliceheight * j];
987: break;
988: }
989: }
990: }
991: PetscCall(VecRestoreArray(v, &x));
992: PetscFunctionReturn(PETSC_SUCCESS);
993: }
995: PetscErrorCode MatDiagonalScale_SeqSELL(Mat A, Vec ll, Vec rr)
996: {
997: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
998: const PetscScalar *l, *r;
999: PetscInt i, j, m, n, row;
1001: PetscFunctionBegin;
1002: if (ll) {
1003: /* The local size is used so that VecMPI can be passed to this routine
1004: by MatDiagonalScale_MPISELL */
1005: PetscCall(VecGetLocalSize(ll, &m));
1006: PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
1007: PetscCall(VecGetArrayRead(ll, &l));
1008: for (i = 0; i < a->totalslices; i++) { /* loop over slices */
1009: if (i == a->totalslices - 1 && (A->rmap->n % a->sliceheight)) { /* if last slice has padding rows */
1010: for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) {
1011: if (row < (A->rmap->n % a->sliceheight)) a->val[j] *= l[a->sliceheight * i + row];
1012: }
1013: } else {
1014: for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) { a->val[j] *= l[a->sliceheight * i + row]; }
1015: }
1016: }
1017: PetscCall(VecRestoreArrayRead(ll, &l));
1018: PetscCall(PetscLogFlops(a->nz));
1019: }
1020: if (rr) {
1021: PetscCall(VecGetLocalSize(rr, &n));
1022: PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
1023: PetscCall(VecGetArrayRead(rr, &r));
1024: for (i = 0; i < a->totalslices; i++) { /* loop over slices */
1025: if (i == a->totalslices - 1 && (A->rmap->n % a->sliceheight)) { /* if last slice has padding rows */
1026: for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) % a->sliceheight)) {
1027: if (row < (A->rmap->n % a->sliceheight)) a->val[j] *= r[a->colidx[j]];
1028: }
1029: } else {
1030: for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j++) a->val[j] *= r[a->colidx[j]];
1031: }
1032: }
1033: PetscCall(VecRestoreArrayRead(rr, &r));
1034: PetscCall(PetscLogFlops(a->nz));
1035: }
1036: PetscCall(MatSeqSELLInvalidateDiagonal(A));
1037: #if defined(PETSC_HAVE_CUPM)
1038: if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU;
1039: #endif
1040: PetscFunctionReturn(PETSC_SUCCESS);
1041: }
1043: PetscErrorCode MatGetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], PetscScalar v[])
1044: {
1045: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1046: PetscInt *cp, i, k, low, high, t, row, col, l;
1047: PetscInt shift;
1048: MatScalar *vp;
1050: PetscFunctionBegin;
1051: for (k = 0; k < m; k++) { /* loop over requested rows */
1052: row = im[k];
1053: if (row < 0) continue;
1054: PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1);
1055: shift = a->sliidx[row / a->sliceheight] + (row % a->sliceheight); /* starting index of the row */
1056: cp = a->colidx + shift; /* pointer to the row */
1057: vp = a->val + shift; /* pointer to the row */
1058: for (l = 0; l < n; l++) { /* loop over requested columns */
1059: col = in[l];
1060: if (col < 0) continue;
1061: PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1);
1062: high = a->rlen[row];
1063: low = 0; /* assume unsorted */
1064: while (high - low > 5) {
1065: t = (low + high) / 2;
1066: if (*(cp + a->sliceheight * t) > col) high = t;
1067: else low = t;
1068: }
1069: for (i = low; i < high; i++) {
1070: if (*(cp + a->sliceheight * i) > col) break;
1071: if (*(cp + a->sliceheight * i) == col) {
1072: *v++ = *(vp + a->sliceheight * i);
1073: goto finished;
1074: }
1075: }
1076: *v++ = 0.0;
1077: finished:;
1078: }
1079: }
1080: PetscFunctionReturn(PETSC_SUCCESS);
1081: }
1083: static PetscErrorCode MatView_SeqSELL_ASCII(Mat A, PetscViewer viewer)
1084: {
1085: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1086: PetscInt i, j, m = A->rmap->n, shift;
1087: const char *name;
1088: PetscViewerFormat format;
1090: PetscFunctionBegin;
1091: PetscCall(PetscViewerGetFormat(viewer, &format));
1092: if (format == PETSC_VIEWER_ASCII_MATLAB) {
1093: PetscInt nofinalvalue = 0;
1094: /*
1095: if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) {
1096: nofinalvalue = 1;
1097: }
1098: */
1099: PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1100: PetscCall(PetscViewerASCIIPrintf(viewer, "%% Size = %" PetscInt_FMT " %" PetscInt_FMT " \n", m, A->cmap->n));
1101: PetscCall(PetscViewerASCIIPrintf(viewer, "%% Nonzeros = %" PetscInt_FMT " \n", a->nz));
1102: #if defined(PETSC_USE_COMPLEX)
1103: PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",4);\n", a->nz + nofinalvalue));
1104: #else
1105: PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",3);\n", a->nz + nofinalvalue));
1106: #endif
1107: PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = [\n"));
1109: for (i = 0; i < m; i++) {
1110: shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1111: for (j = 0; j < a->rlen[i]; j++) {
1112: #if defined(PETSC_USE_COMPLEX)
1113: PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %18.16e %18.16e\n", i + 1, a->colidx[shift + a->sliceheight * j] + 1, (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1114: #else
1115: PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %18.16e\n", i + 1, a->colidx[shift + a->sliceheight * j] + 1, (double)a->val[shift + a->sliceheight * j]));
1116: #endif
1117: }
1118: }
1119: /*
1120: if (nofinalvalue) {
1121: #if defined(PETSC_USE_COMPLEX)
1122: PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e %18.16e\n",m,A->cmap->n,0.,0.));
1123: #else
1124: PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e\n",m,A->cmap->n,0.0));
1125: #endif
1126: }
1127: */
1128: PetscCall(PetscObjectGetName((PetscObject)A, &name));
1129: PetscCall(PetscViewerASCIIPrintf(viewer, "];\n %s = spconvert(zzz);\n", name));
1130: PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1131: } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) {
1132: PetscFunctionReturn(PETSC_SUCCESS);
1133: } else if (format == PETSC_VIEWER_ASCII_COMMON) {
1134: PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1135: for (i = 0; i < m; i++) {
1136: PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1137: shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1138: for (j = 0; j < a->rlen[i]; j++) {
1139: #if defined(PETSC_USE_COMPLEX)
1140: if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) > 0.0 && PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) {
1141: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1142: } else if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) < 0.0 && PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) {
1143: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)-PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1144: } else if (PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) {
1145: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j])));
1146: }
1147: #else
1148: if (a->val[shift + a->sliceheight * j] != 0.0) PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)a->val[shift + a->sliceheight * j]));
1149: #endif
1150: }
1151: PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1152: }
1153: PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1154: } else if (format == PETSC_VIEWER_ASCII_DENSE) {
1155: PetscInt cnt = 0, jcnt;
1156: PetscScalar value;
1157: #if defined(PETSC_USE_COMPLEX)
1158: PetscBool realonly = PETSC_TRUE;
1159: for (i = 0; i < a->sliidx[a->totalslices]; i++) {
1160: if (PetscImaginaryPart(a->val[i]) != 0.0) {
1161: realonly = PETSC_FALSE;
1162: break;
1163: }
1164: }
1165: #endif
1167: PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1168: for (i = 0; i < m; i++) {
1169: jcnt = 0;
1170: shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1171: for (j = 0; j < A->cmap->n; j++) {
1172: if (jcnt < a->rlen[i] && j == a->colidx[shift + a->sliceheight * j]) {
1173: value = a->val[cnt++];
1174: jcnt++;
1175: } else {
1176: value = 0.0;
1177: }
1178: #if defined(PETSC_USE_COMPLEX)
1179: if (realonly) {
1180: PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)PetscRealPart(value)));
1181: } else {
1182: PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e+%7.5e i ", (double)PetscRealPart(value), (double)PetscImaginaryPart(value)));
1183: }
1184: #else
1185: PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)value));
1186: #endif
1187: }
1188: PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1189: }
1190: PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1191: } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) {
1192: PetscInt fshift = 1;
1193: PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1194: #if defined(PETSC_USE_COMPLEX)
1195: PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate complex general\n"));
1196: #else
1197: PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate real general\n"));
1198: #endif
1199: PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT "\n", m, A->cmap->n, a->nz));
1200: for (i = 0; i < m; i++) {
1201: shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1202: for (j = 0; j < a->rlen[i]; j++) {
1203: #if defined(PETSC_USE_COMPLEX)
1204: PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g %g\n", i + fshift, a->colidx[shift + a->sliceheight * j] + fshift, (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1205: #else
1206: PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g\n", i + fshift, a->colidx[shift + a->sliceheight * j] + fshift, (double)a->val[shift + a->sliceheight * j]));
1207: #endif
1208: }
1209: }
1210: PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1211: } else if (format == PETSC_VIEWER_NATIVE) {
1212: for (i = 0; i < a->totalslices; i++) { /* loop over slices */
1213: PetscInt row;
1214: PetscCall(PetscViewerASCIIPrintf(viewer, "slice %" PetscInt_FMT ": %" PetscInt_FMT " %" PetscInt_FMT "\n", i, a->sliidx[i], a->sliidx[i + 1]));
1215: for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) {
1216: #if defined(PETSC_USE_COMPLEX)
1217: if (PetscImaginaryPart(a->val[j]) > 0.0) {
1218: PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g + %g i\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1219: } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1220: PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g - %g i\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), -(double)PetscImaginaryPart(a->val[j])));
1221: } else {
1222: PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j])));
1223: }
1224: #else
1225: PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g\n", a->sliceheight * i + row, a->colidx[j], (double)a->val[j]));
1226: #endif
1227: }
1228: }
1229: } else {
1230: PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1231: if (A->factortype) {
1232: for (i = 0; i < m; i++) {
1233: shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1234: PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1235: /* L part */
1236: for (j = shift; j < a->diag[i]; j += a->sliceheight) {
1237: #if defined(PETSC_USE_COMPLEX)
1238: if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) > 0.0) {
1239: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1240: } else if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) < 0.0) {
1241: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j]))));
1242: } else {
1243: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j])));
1244: }
1245: #else
1246: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j]));
1247: #endif
1248: }
1249: /* diagonal */
1250: j = a->diag[i];
1251: #if defined(PETSC_USE_COMPLEX)
1252: if (PetscImaginaryPart(a->val[j]) > 0.0) {
1253: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)PetscImaginaryPart(1.0 / a->val[j])));
1254: } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1255: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)(-PetscImaginaryPart(1.0 / a->val[j]))));
1256: } else {
1257: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j])));
1258: }
1259: #else
1260: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)(1 / a->val[j])));
1261: #endif
1263: /* U part */
1264: for (j = a->diag[i] + 1; j < shift + a->sliceheight * a->rlen[i]; j += a->sliceheight) {
1265: #if defined(PETSC_USE_COMPLEX)
1266: if (PetscImaginaryPart(a->val[j]) > 0.0) {
1267: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1268: } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1269: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j]))));
1270: } else {
1271: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j])));
1272: }
1273: #else
1274: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j]));
1275: #endif
1276: }
1277: PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1278: }
1279: } else {
1280: for (i = 0; i < m; i++) {
1281: shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1282: PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1283: for (j = 0; j < a->rlen[i]; j++) {
1284: #if defined(PETSC_USE_COMPLEX)
1285: if (PetscImaginaryPart(a->val[j]) > 0.0) {
1286: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1287: } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1288: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)-PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1289: } else {
1290: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j])));
1291: }
1292: #else
1293: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)a->val[shift + a->sliceheight * j]));
1294: #endif
1295: }
1296: PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1297: }
1298: }
1299: PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1300: }
1301: PetscCall(PetscViewerFlush(viewer));
1302: PetscFunctionReturn(PETSC_SUCCESS);
1303: }
1305: #include <petscdraw.h>
1306: static PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw, void *Aa)
1307: {
1308: Mat A = (Mat)Aa;
1309: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1310: PetscInt i, j, m = A->rmap->n, shift;
1311: int color;
1312: PetscReal xl, yl, xr, yr, x_l, x_r, y_l, y_r;
1313: PetscViewer viewer;
1314: PetscViewerFormat format;
1316: PetscFunctionBegin;
1317: PetscCall(PetscObjectQuery((PetscObject)A, "Zoomviewer", (PetscObject *)&viewer));
1318: PetscCall(PetscViewerGetFormat(viewer, &format));
1319: PetscCall(PetscDrawGetCoordinates(draw, &xl, &yl, &xr, &yr));
1321: /* loop over matrix elements drawing boxes */
1323: if (format != PETSC_VIEWER_DRAW_CONTOUR) {
1324: PetscDrawCollectiveBegin(draw);
1325: /* Blue for negative, Cyan for zero and Red for positive */
1326: color = PETSC_DRAW_BLUE;
1327: for (i = 0; i < m; i++) {
1328: shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1329: y_l = m - i - 1.0;
1330: y_r = y_l + 1.0;
1331: for (j = 0; j < a->rlen[i]; j++) {
1332: x_l = a->colidx[shift + a->sliceheight * j];
1333: x_r = x_l + 1.0;
1334: if (PetscRealPart(a->val[shift + a->sliceheight * j]) >= 0.) continue;
1335: PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1336: }
1337: }
1338: color = PETSC_DRAW_CYAN;
1339: for (i = 0; i < m; i++) {
1340: shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1341: y_l = m - i - 1.0;
1342: y_r = y_l + 1.0;
1343: for (j = 0; j < a->rlen[i]; j++) {
1344: x_l = a->colidx[shift + a->sliceheight * j];
1345: x_r = x_l + 1.0;
1346: if (a->val[shift + a->sliceheight * j] != 0.) continue;
1347: PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1348: }
1349: }
1350: color = PETSC_DRAW_RED;
1351: for (i = 0; i < m; i++) {
1352: shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1353: y_l = m - i - 1.0;
1354: y_r = y_l + 1.0;
1355: for (j = 0; j < a->rlen[i]; j++) {
1356: x_l = a->colidx[shift + a->sliceheight * j];
1357: x_r = x_l + 1.0;
1358: if (PetscRealPart(a->val[shift + a->sliceheight * j]) <= 0.) continue;
1359: PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1360: }
1361: }
1362: PetscDrawCollectiveEnd(draw);
1363: } else {
1364: /* use contour shading to indicate magnitude of values */
1365: /* first determine max of all nonzero values */
1366: PetscReal minv = 0.0, maxv = 0.0;
1367: PetscInt count = 0;
1368: PetscDraw popup;
1369: for (i = 0; i < a->sliidx[a->totalslices]; i++) {
1370: if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]);
1371: }
1372: if (minv >= maxv) maxv = minv + PETSC_SMALL;
1373: PetscCall(PetscDrawGetPopup(draw, &popup));
1374: PetscCall(PetscDrawScalePopup(popup, minv, maxv));
1376: PetscDrawCollectiveBegin(draw);
1377: for (i = 0; i < m; i++) {
1378: shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1379: y_l = m - i - 1.0;
1380: y_r = y_l + 1.0;
1381: for (j = 0; j < a->rlen[i]; j++) {
1382: x_l = a->colidx[shift + a->sliceheight * j];
1383: x_r = x_l + 1.0;
1384: color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]), minv, maxv);
1385: PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1386: count++;
1387: }
1388: }
1389: PetscDrawCollectiveEnd(draw);
1390: }
1391: PetscFunctionReturn(PETSC_SUCCESS);
1392: }
1394: #include <petscdraw.h>
1395: static PetscErrorCode MatView_SeqSELL_Draw(Mat A, PetscViewer viewer)
1396: {
1397: PetscDraw draw;
1398: PetscReal xr, yr, xl, yl, h, w;
1399: PetscBool isnull;
1401: PetscFunctionBegin;
1402: PetscCall(PetscViewerDrawGetDraw(viewer, 0, &draw));
1403: PetscCall(PetscDrawIsNull(draw, &isnull));
1404: if (isnull) PetscFunctionReturn(PETSC_SUCCESS);
1406: xr = A->cmap->n;
1407: yr = A->rmap->n;
1408: h = yr / 10.0;
1409: w = xr / 10.0;
1410: xr += w;
1411: yr += h;
1412: xl = -w;
1413: yl = -h;
1414: PetscCall(PetscDrawSetCoordinates(draw, xl, yl, xr, yr));
1415: PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", (PetscObject)viewer));
1416: PetscCall(PetscDrawZoom(draw, MatView_SeqSELL_Draw_Zoom, A));
1417: PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", NULL));
1418: PetscCall(PetscDrawSave(draw));
1419: PetscFunctionReturn(PETSC_SUCCESS);
1420: }
1422: PetscErrorCode MatView_SeqSELL(Mat A, PetscViewer viewer)
1423: {
1424: PetscBool iascii, isbinary, isdraw;
1426: PetscFunctionBegin;
1427: PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii));
1428: PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary));
1429: PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw));
1430: if (iascii) {
1431: PetscCall(MatView_SeqSELL_ASCII(A, viewer));
1432: } else if (isbinary) {
1433: /* PetscCall(MatView_SeqSELL_Binary(A,viewer)); */
1434: } else if (isdraw) PetscCall(MatView_SeqSELL_Draw(A, viewer));
1435: PetscFunctionReturn(PETSC_SUCCESS);
1436: }
1438: PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A, MatAssemblyType mode)
1439: {
1440: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1441: PetscInt i, shift, row_in_slice, row, nrow, *cp, lastcol, j, k;
1442: MatScalar *vp;
1443: #if defined(PETSC_HAVE_CUPM)
1444: PetscInt totalchunks = 0;
1445: #endif
1447: PetscFunctionBegin;
1448: if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
1449: /* To do: compress out the unused elements */
1450: PetscCall(MatMarkDiagonal_SeqSELL(A));
1451: PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: %" PetscInt_FMT " allocated %" PetscInt_FMT " used (%" PetscInt_FMT " nonzeros+%" PetscInt_FMT " paddedzeros)\n", A->rmap->n, A->cmap->n, a->maxallocmat, a->sliidx[a->totalslices], a->nz, a->sliidx[a->totalslices] - a->nz));
1452: PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is %" PetscInt_FMT "\n", a->reallocs));
1453: PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rlenmax));
1454: a->nonzerorowcnt = 0;
1455: /* Set unused slots for column indices to last valid column index. Set unused slots for values to zero. This allows for a use of unmasked intrinsics -> higher performance */
1456: for (i = 0; i < a->totalslices; ++i) {
1457: shift = a->sliidx[i]; /* starting index of the slice */
1458: cp = PetscSafePointerPlusOffset(a->colidx, shift); /* pointer to the column indices of the slice */
1459: vp = PetscSafePointerPlusOffset(a->val, shift); /* pointer to the nonzero values of the slice */
1460: for (row_in_slice = 0; row_in_slice < a->sliceheight; ++row_in_slice) { /* loop over rows in the slice */
1461: row = a->sliceheight * i + row_in_slice;
1462: nrow = a->rlen[row]; /* number of nonzeros in row */
1463: /*
1464: Search for the nearest nonzero. Normally setting the index to zero may cause extra communication.
1465: But if the entire slice are empty, it is fine to use 0 since the index will not be loaded.
1466: */
1467: lastcol = 0;
1468: if (nrow > 0) { /* nonempty row */
1469: a->nonzerorowcnt++;
1470: lastcol = cp[a->sliceheight * (nrow - 1) + row_in_slice]; /* use the index from the last nonzero at current row */
1471: } else if (!row_in_slice) { /* first row of the correct slice is empty */
1472: for (j = 1; j < a->sliceheight; j++) {
1473: if (a->rlen[a->sliceheight * i + j]) {
1474: lastcol = cp[j];
1475: break;
1476: }
1477: }
1478: } else {
1479: if (a->sliidx[i + 1] != shift) lastcol = cp[row_in_slice - 1]; /* use the index from the previous row */
1480: }
1482: for (k = nrow; k < (a->sliidx[i + 1] - shift) / a->sliceheight; ++k) {
1483: cp[a->sliceheight * k + row_in_slice] = lastcol;
1484: vp[a->sliceheight * k + row_in_slice] = (MatScalar)0;
1485: }
1486: }
1487: }
1489: A->info.mallocs += a->reallocs;
1490: a->reallocs = 0;
1492: PetscCall(MatSeqSELLInvalidateDiagonal(A));
1493: #if defined(PETSC_HAVE_CUPM)
1494: if (!a->chunksize && a->totalslices) {
1495: a->chunksize = 64;
1496: while (a->chunksize < 1024 && 2 * a->chunksize <= a->sliidx[a->totalslices] / a->totalslices) a->chunksize *= 2;
1497: totalchunks = 1 + (a->sliidx[a->totalslices] - 1) / a->chunksize;
1498: }
1499: if (totalchunks != a->totalchunks) {
1500: PetscCall(PetscFree(a->chunk_slice_map));
1501: PetscCall(PetscMalloc1(totalchunks, &a->chunk_slice_map));
1502: a->totalchunks = totalchunks;
1503: }
1504: j = 0;
1505: for (i = 0; i < totalchunks; i++) {
1506: while (a->sliidx[j + 1] <= i * a->chunksize && j < a->totalslices) j++;
1507: a->chunk_slice_map[i] = j;
1508: }
1509: #endif
1510: PetscFunctionReturn(PETSC_SUCCESS);
1511: }
1513: PetscErrorCode MatGetInfo_SeqSELL(Mat A, MatInfoType flag, MatInfo *info)
1514: {
1515: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1517: PetscFunctionBegin;
1518: info->block_size = 1.0;
1519: info->nz_allocated = a->maxallocmat;
1520: info->nz_used = a->sliidx[a->totalslices]; /* include padding zeros */
1521: info->nz_unneeded = (a->maxallocmat - a->sliidx[a->totalslices]);
1522: info->assemblies = A->num_ass;
1523: info->mallocs = A->info.mallocs;
1524: info->memory = 0; /* REVIEW ME */
1525: if (A->factortype) {
1526: info->fill_ratio_given = A->info.fill_ratio_given;
1527: info->fill_ratio_needed = A->info.fill_ratio_needed;
1528: info->factor_mallocs = A->info.factor_mallocs;
1529: } else {
1530: info->fill_ratio_given = 0;
1531: info->fill_ratio_needed = 0;
1532: info->factor_mallocs = 0;
1533: }
1534: PetscFunctionReturn(PETSC_SUCCESS);
1535: }
1537: PetscErrorCode MatSetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], const PetscScalar v[], InsertMode is)
1538: {
1539: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1540: PetscInt shift, i, k, l, low, high, t, ii, row, col, nrow;
1541: PetscInt *cp, nonew = a->nonew, lastcol = -1;
1542: MatScalar *vp, value;
1543: #if defined(PETSC_HAVE_CUPM)
1544: PetscBool inserted = PETSC_FALSE;
1545: PetscInt mul = DEVICE_MEM_ALIGN / a->sliceheight;
1546: #endif
1548: PetscFunctionBegin;
1549: for (k = 0; k < m; k++) { /* loop over added rows */
1550: row = im[k];
1551: if (row < 0) continue;
1552: PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1);
1553: shift = a->sliidx[row / a->sliceheight] + row % a->sliceheight; /* starting index of the row */
1554: cp = a->colidx + shift; /* pointer to the row */
1555: vp = a->val + shift; /* pointer to the row */
1556: nrow = a->rlen[row];
1557: low = 0;
1558: high = nrow;
1560: for (l = 0; l < n; l++) { /* loop over added columns */
1561: col = in[l];
1562: if (col < 0) continue;
1563: PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Col too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1);
1564: if (a->roworiented) {
1565: value = v[l + k * n];
1566: } else {
1567: value = v[k + l * m];
1568: }
1569: if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue;
1571: /* search in this row for the specified column, i indicates the column to be set */
1572: if (col <= lastcol) low = 0;
1573: else high = nrow;
1574: lastcol = col;
1575: while (high - low > 5) {
1576: t = (low + high) / 2;
1577: if (*(cp + a->sliceheight * t) > col) high = t;
1578: else low = t;
1579: }
1580: for (i = low; i < high; i++) {
1581: if (*(cp + a->sliceheight * i) > col) break;
1582: if (*(cp + a->sliceheight * i) == col) {
1583: if (is == ADD_VALUES) *(vp + a->sliceheight * i) += value;
1584: else *(vp + a->sliceheight * i) = value;
1585: #if defined(PETSC_HAVE_CUPM)
1586: inserted = PETSC_TRUE;
1587: #endif
1588: low = i + 1;
1589: goto noinsert;
1590: }
1591: }
1592: if (value == 0.0 && a->ignorezeroentries) goto noinsert;
1593: if (nonew == 1) goto noinsert;
1594: PetscCheck(nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero (%" PetscInt_FMT ", %" PetscInt_FMT ") in the matrix", row, col);
1595: #if defined(PETSC_HAVE_CUPM)
1596: MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, a->sliceheight, row / a->sliceheight, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar, mul);
1597: #else
1598: /* If the current row length exceeds the slice width (e.g. nrow==slice_width), allocate a new space, otherwise do nothing */
1599: MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, a->sliceheight, row / a->sliceheight, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar, 1);
1600: #endif
1601: /* add the new nonzero to the high position, shift the remaining elements in current row to the right by one slot */
1602: for (ii = nrow - 1; ii >= i; ii--) {
1603: *(cp + a->sliceheight * (ii + 1)) = *(cp + a->sliceheight * ii);
1604: *(vp + a->sliceheight * (ii + 1)) = *(vp + a->sliceheight * ii);
1605: }
1606: a->rlen[row]++;
1607: *(cp + a->sliceheight * i) = col;
1608: *(vp + a->sliceheight * i) = value;
1609: a->nz++;
1610: #if defined(PETSC_HAVE_CUPM)
1611: inserted = PETSC_TRUE;
1612: #endif
1613: low = i + 1;
1614: high++;
1615: nrow++;
1616: noinsert:;
1617: }
1618: a->rlen[row] = nrow;
1619: }
1620: #if defined(PETSC_HAVE_CUPM)
1621: if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED && inserted) A->offloadmask = PETSC_OFFLOAD_CPU;
1622: #endif
1623: PetscFunctionReturn(PETSC_SUCCESS);
1624: }
1626: PetscErrorCode MatCopy_SeqSELL(Mat A, Mat B, MatStructure str)
1627: {
1628: PetscFunctionBegin;
1629: /* If the two matrices have the same copy implementation, use fast copy. */
1630: if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) {
1631: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1632: Mat_SeqSELL *b = (Mat_SeqSELL *)B->data;
1634: PetscCheck(a->sliidx[a->totalslices] == b->sliidx[b->totalslices], PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Number of nonzeros in two matrices are different");
1635: PetscCall(PetscArraycpy(b->val, a->val, a->sliidx[a->totalslices]));
1636: } else {
1637: PetscCall(MatCopy_Basic(A, B, str));
1638: }
1639: PetscFunctionReturn(PETSC_SUCCESS);
1640: }
/*
  MatSetUp_SeqSELL - Sets up a SeqSELL matrix for which no preallocation was given.

  It simply requests the default preallocation (PETSC_DEFAULT row length, no per-row lengths).
*/
PetscErrorCode MatSetUp_SeqSELL(Mat A)
{
  PetscFunctionBegin;
  PetscCall(MatSeqSELLSetPreallocation(A, PETSC_DEFAULT, NULL));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1649: PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A, PetscScalar *array[])
1650: {
1651: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1653: PetscFunctionBegin;
1654: *array = a->val;
1655: PetscFunctionReturn(PETSC_SUCCESS);
1656: }
1658: PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A, PetscScalar *array[])
1659: {
1660: PetscFunctionBegin;
1661: PetscFunctionReturn(PETSC_SUCCESS);
1662: }
1664: PetscErrorCode MatScale_SeqSELL(Mat inA, PetscScalar alpha)
1665: {
1666: Mat_SeqSELL *a = (Mat_SeqSELL *)inA->data;
1667: MatScalar *aval = a->val;
1668: PetscScalar oalpha = alpha;
1669: PetscBLASInt one = 1, size;
1671: PetscFunctionBegin;
1672: PetscCall(PetscBLASIntCast(a->sliidx[a->totalslices], &size));
1673: PetscCallBLAS("BLASscal", BLASscal_(&size, &oalpha, aval, &one));
1674: PetscCall(PetscLogFlops(a->nz));
1675: PetscCall(MatSeqSELLInvalidateDiagonal(inA));
1676: #if defined(PETSC_HAVE_CUPM)
1677: if (inA->offloadmask != PETSC_OFFLOAD_UNALLOCATED) inA->offloadmask = PETSC_OFFLOAD_CPU;
1678: #endif
1679: PetscFunctionReturn(PETSC_SUCCESS);
1680: }
1682: PetscErrorCode MatShift_SeqSELL(Mat Y, PetscScalar a)
1683: {
1684: Mat_SeqSELL *y = (Mat_SeqSELL *)Y->data;
1686: PetscFunctionBegin;
1687: if (!Y->preallocated || !y->nz) PetscCall(MatSeqSELLSetPreallocation(Y, 1, NULL));
1688: PetscCall(MatShift_Basic(Y, a));
1689: PetscFunctionReturn(PETSC_SUCCESS);
1690: }
1692: PetscErrorCode MatSOR_SeqSELL(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx)
1693: {
1694: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1695: PetscScalar *x, sum, *t;
1696: const MatScalar *idiag = NULL, *mdiag;
1697: const PetscScalar *b, *xb;
1698: PetscInt n, m = A->rmap->n, i, j, shift;
1699: const PetscInt *diag;
1701: PetscFunctionBegin;
1702: its = its * lits;
1704: if (fshift != a->fshift || omega != a->omega) a->idiagvalid = PETSC_FALSE; /* must recompute idiag[] */
1705: if (!a->idiagvalid) PetscCall(MatInvertDiagonal_SeqSELL(A, omega, fshift));
1706: a->fshift = fshift;
1707: a->omega = omega;
1709: diag = a->diag;
1710: t = a->ssor_work;
1711: idiag = a->idiag;
1712: mdiag = a->mdiag;
1714: PetscCall(VecGetArray(xx, &x));
1715: PetscCall(VecGetArrayRead(bb, &b));
1716: /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
1717: PetscCheck(flag != SOR_APPLY_UPPER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_UPPER is not implemented");
1718: PetscCheck(flag != SOR_APPLY_LOWER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_LOWER is not implemented");
1719: PetscCheck(!(flag & SOR_EISENSTAT), PETSC_COMM_SELF, PETSC_ERR_SUP, "No support yet for Eisenstat");
1721: if (flag & SOR_ZERO_INITIAL_GUESS) {
1722: if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1723: for (i = 0; i < m; i++) {
1724: shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1725: sum = b[i];
1726: n = (diag[i] - shift) / a->sliceheight;
1727: for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]];
1728: t[i] = sum;
1729: x[i] = sum * idiag[i];
1730: }
1731: xb = t;
1732: PetscCall(PetscLogFlops(a->nz));
1733: } else xb = b;
1734: if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1735: for (i = m - 1; i >= 0; i--) {
1736: shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1737: sum = xb[i];
1738: n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1;
1739: for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]];
1740: if (xb == b) {
1741: x[i] = sum * idiag[i];
1742: } else {
1743: x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1744: }
1745: }
1746: PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */
1747: }
1748: its--;
1749: }
1750: while (its--) {
1751: if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1752: for (i = 0; i < m; i++) {
1753: /* lower */
1754: shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1755: sum = b[i];
1756: n = (diag[i] - shift) / a->sliceheight;
1757: for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]];
1758: t[i] = sum; /* save application of the lower-triangular part */
1759: /* upper */
1760: n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1;
1761: for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]];
1762: x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1763: }
1764: xb = t;
1765: PetscCall(PetscLogFlops(2.0 * a->nz));
1766: } else xb = b;
1767: if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1768: for (i = m - 1; i >= 0; i--) {
1769: shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1770: sum = xb[i];
1771: if (xb == b) {
1772: /* whole matrix (no checkpointing available) */
1773: n = a->rlen[i];
1774: for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]];
1775: x[i] = (1. - omega) * x[i] + (sum + mdiag[i] * x[i]) * idiag[i];
1776: } else { /* lower-triangular part has been saved, so only apply upper-triangular */
1777: n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1;
1778: for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]];
1779: x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1780: }
1781: }
1782: if (xb == b) {
1783: PetscCall(PetscLogFlops(2.0 * a->nz));
1784: } else {
1785: PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */
1786: }
1787: }
1788: }
1789: PetscCall(VecRestoreArray(xx, &x));
1790: PetscCall(VecRestoreArrayRead(bb, &b));
1791: PetscFunctionReturn(PETSC_SUCCESS);
1792: }
/*
  Operation table of the SeqSELL matrix type.

  The initializer is positional: each entry fills the next function-pointer slot of
  struct _MatOps, and NULL marks a slot for which this table supplies no routine.
  The numeric comments are position markers that make it easier to locate a slot.
  MatCreate_SeqSELL() copies this table into the ops of every new matrix.
*/
static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL,
                                       MatGetRow_SeqSELL,
                                       MatRestoreRow_SeqSELL,
                                       MatMult_SeqSELL,
                                       /* 4*/ MatMultAdd_SeqSELL,
                                       MatMultTranspose_SeqSELL,
                                       MatMultTransposeAdd_SeqSELL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /* 10*/ NULL,
                                       NULL,
                                       NULL,
                                       MatSOR_SeqSELL,
                                       NULL,
                                       /* 15*/ MatGetInfo_SeqSELL,
                                       MatEqual_SeqSELL,
                                       MatGetDiagonal_SeqSELL,
                                       MatDiagonalScale_SeqSELL,
                                       NULL,
                                       /* 20*/ NULL,
                                       MatAssemblyEnd_SeqSELL,
                                       MatSetOption_SeqSELL,
                                       MatZeroEntries_SeqSELL,
                                       /* 24*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /* 29*/ MatSetUp_SeqSELL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /* 34*/ MatDuplicate_SeqSELL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /* 39*/ NULL,
                                       NULL,
                                       NULL,
                                       MatGetValues_SeqSELL,
                                       MatCopy_SeqSELL,
                                       /* 44*/ NULL,
                                       MatScale_SeqSELL,
                                       MatShift_SeqSELL,
                                       NULL,
                                       NULL,
                                       /* 49*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /* 54*/ MatFDColoringCreate_SeqXAIJ,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /* 59*/ NULL,
                                       MatDestroy_SeqSELL,
                                       MatView_SeqSELL,
                                       NULL,
                                       NULL,
                                       /* 64*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /* 69*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /* 74*/ NULL,
                                       MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */
                                       NULL,
                                       NULL,
                                       NULL,
                                       /* 79*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /* 84*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /* 89*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /* 94*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /* 99*/ NULL,
                                       NULL,
                                       NULL,
                                       MatConjugate_SeqSELL,
                                       NULL,
                                       /*104*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /*109*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       MatMissingDiagonal_SeqSELL,
                                       /*114*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /*119*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /*124*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /*129*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /*134*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /*139*/ NULL,
                                       NULL,
                                       NULL,
                                       MatFDColoringSetUp_SeqXAIJ,
                                       NULL,
                                       /*144*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /*150*/ NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       NULL,
                                       /*155*/ NULL,
                                       NULL};
1952: static PetscErrorCode MatStoreValues_SeqSELL(Mat mat)
1953: {
1954: Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1956: PetscFunctionBegin;
1957: PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1959: /* allocate space for values if not already there */
1960: if (!a->saved_values) PetscCall(PetscMalloc1(a->sliidx[a->totalslices] + 1, &a->saved_values));
1962: /* copy values over */
1963: PetscCall(PetscArraycpy(a->saved_values, a->val, a->sliidx[a->totalslices]));
1964: PetscFunctionReturn(PETSC_SUCCESS);
1965: }
1967: static PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat)
1968: {
1969: Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1971: PetscFunctionBegin;
1972: PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1973: PetscCheck(a->saved_values, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatStoreValues(A);first");
1974: PetscCall(PetscArraycpy(a->val, a->saved_values, a->sliidx[a->totalslices]));
1975: PetscFunctionReturn(PETSC_SUCCESS);
1976: }
1978: static PetscErrorCode MatSeqSELLGetFillRatio_SeqSELL(Mat mat, PetscReal *ratio)
1979: {
1980: Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1982: PetscFunctionBegin;
1983: if (a->totalslices && a->sliidx[a->totalslices]) {
1984: *ratio = (PetscReal)(a->sliidx[a->totalslices] - a->nz) / a->sliidx[a->totalslices];
1985: } else {
1986: *ratio = 0.0;
1987: }
1988: PetscFunctionReturn(PETSC_SUCCESS);
1989: }
1991: static PetscErrorCode MatSeqSELLGetMaxSliceWidth_SeqSELL(Mat mat, PetscInt *slicewidth)
1992: {
1993: Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1994: PetscInt i, current_slicewidth;
1996: PetscFunctionBegin;
1997: *slicewidth = 0;
1998: for (i = 0; i < a->totalslices; i++) {
1999: current_slicewidth = (a->sliidx[i + 1] - a->sliidx[i]) / a->sliceheight;
2000: if (current_slicewidth > *slicewidth) *slicewidth = current_slicewidth;
2001: }
2002: PetscFunctionReturn(PETSC_SUCCESS);
2003: }
2005: static PetscErrorCode MatSeqSELLGetAvgSliceWidth_SeqSELL(Mat mat, PetscReal *slicewidth)
2006: {
2007: Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
2009: PetscFunctionBegin;
2010: *slicewidth = 0;
2011: if (a->totalslices) { *slicewidth = (PetscReal)a->sliidx[a->totalslices] / a->sliceheight / a->totalslices; }
2012: PetscFunctionReturn(PETSC_SUCCESS);
2013: }
2015: static PetscErrorCode MatSeqSELLGetVarSliceSize_SeqSELL(Mat mat, PetscReal *variance)
2016: {
2017: Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
2018: PetscReal mean;
2019: PetscInt i, totalslices = a->totalslices, *sliidx = a->sliidx;
2021: PetscFunctionBegin;
2022: *variance = 0;
2023: if (totalslices) {
2024: mean = (PetscReal)sliidx[totalslices] / totalslices;
2025: for (i = 1; i <= totalslices; i++) { *variance += ((PetscReal)(sliidx[i] - sliidx[i - 1]) - mean) * ((PetscReal)(sliidx[i] - sliidx[i - 1]) - mean) / totalslices; }
2026: }
2027: PetscFunctionReturn(PETSC_SUCCESS);
2028: }
2030: static PetscErrorCode MatSeqSELLSetSliceHeight_SeqSELL(Mat A, PetscInt sliceheight)
2031: {
2032: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
2034: PetscFunctionBegin;
2035: if (A->preallocated) PetscFunctionReturn(PETSC_SUCCESS);
2036: PetscCheck(a->sliceheight <= 0 || a->sliceheight == sliceheight, PETSC_COMM_SELF, PETSC_ERR_SUP, "Cannot change slice height %" PetscInt_FMT " to %" PetscInt_FMT, a->sliceheight, sliceheight);
2037: a->sliceheight = sliceheight;
2038: #if defined(PETSC_HAVE_CUPM)
2039: PetscCheck(PetscMax(DEVICE_MEM_ALIGN, sliceheight) % PetscMin(DEVICE_MEM_ALIGN, sliceheight) == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The slice height is not compatible with DEVICE_MEM_ALIGN (one must be divisible by the other) %" PetscInt_FMT, sliceheight);
2040: #endif
2041: PetscFunctionReturn(PETSC_SUCCESS);
2042: }
/*@
  MatSeqSELLGetFillRatio - returns a ratio that indicates the irregularity of the matrix.

  Not Collective

  Input Parameter:
. A - a `MATSEQSELL` matrix

  Output Parameter:
. ratio - ratio of the number of padded zeros to the number of allocated elements

  Level: intermediate

  Note:
  The call is dispatched to the "MatSeqSELLGetFillRatio_C" method composed with `A`.

.seealso: `MATSEQSELL`, `MatSeqSELLGetAvgSliceWidth()`
@*/
PetscErrorCode MatSeqSELLGetFillRatio(Mat A, PetscReal *ratio)
{
  PetscFunctionBegin;
  PetscUseMethod(A, "MatSeqSELLGetFillRatio_C", (Mat, PetscReal *), (A, ratio));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*@
  MatSeqSELLGetMaxSliceWidth - returns the maximum slice width.

  Not Collective

  Input Parameter:
. A - a `MATSEQSELL` matrix

  Output Parameter:
. slicewidth - maximum slice width

  Level: intermediate

  Note:
  The call is dispatched to the "MatSeqSELLGetMaxSliceWidth_C" method composed with `A`.

.seealso: `MATSEQSELL`, `MatSeqSELLGetAvgSliceWidth()`
@*/
PetscErrorCode MatSeqSELLGetMaxSliceWidth(Mat A, PetscInt *slicewidth)
{
  PetscFunctionBegin;
  PetscUseMethod(A, "MatSeqSELLGetMaxSliceWidth_C", (Mat, PetscInt *), (A, slicewidth));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*@
  MatSeqSELLGetAvgSliceWidth - returns the average slice width.

  Not Collective

  Input Parameter:
. A - a `MATSEQSELL` matrix

  Output Parameter:
. slicewidth - average slice width

  Level: intermediate

  Note:
  The call is dispatched to the "MatSeqSELLGetAvgSliceWidth_C" method composed with `A`.

.seealso: `MATSEQSELL`, `MatSeqSELLGetMaxSliceWidth()`
@*/
PetscErrorCode MatSeqSELLGetAvgSliceWidth(Mat A, PetscReal *slicewidth)
{
  PetscFunctionBegin;
  PetscUseMethod(A, "MatSeqSELLGetAvgSliceWidth_C", (Mat, PetscReal *), (A, slicewidth));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*@
  MatSeqSELLSetSliceHeight - sets the slice height.

  Not Collective

  Input Parameters:
+ A           - a `MATSEQSELL` matrix
- sliceheight - slice height

  Notes:
  You cannot change the slice height once it has been set.

  The slice height must be set before `MatSetUp()` or MatXXXSetPreallocation() is called.

  Level: intermediate

.seealso: `MATSEQSELL`, `MatSeqSELLGetVarSliceSize()`
@*/
PetscErrorCode MatSeqSELLSetSliceHeight(Mat A, PetscInt sliceheight)
{
  PetscFunctionBegin;
  PetscUseMethod(A, "MatSeqSELLSetSliceHeight_C", (Mat, PetscInt), (A, sliceheight));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*@
  MatSeqSELLGetVarSliceSize - returns the variance of the slice size.

  Not Collective

  Input Parameter:
. A - a `MATSEQSELL` matrix

  Output Parameter:
. variance - variance of the slice size

  Level: intermediate

  Note:
  The call is dispatched to the "MatSeqSELLGetVarSliceSize_C" method composed with `A`.

.seealso: `MATSEQSELL`, `MatSeqSELLSetSliceHeight()`
@*/
PetscErrorCode MatSeqSELLGetVarSliceSize(Mat A, PetscReal *variance)
{
  PetscFunctionBegin;
  PetscUseMethod(A, "MatSeqSELLGetVarSliceSize_C", (Mat, PetscReal *), (A, variance));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2157: #if defined(PETSC_HAVE_CUDA)
2158: PETSC_EXTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat);
2159: #endif
2160: #if defined(PETSC_HAVE_HIP)
2161: PETSC_EXTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLHIP(Mat);
2162: #endif
2164: PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B)
2165: {
2166: Mat_SeqSELL *b;
2167: PetscMPIInt size;
2169: PetscFunctionBegin;
2170: PetscCall(PetscCitationsRegister(citation, &cited));
2171: PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
2172: PetscCheck(size <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Comm must be of size 1");
2174: PetscCall(PetscNew(&b));
2176: B->data = (void *)b;
2177: B->ops[0] = MatOps_Values;
2179: b->row = NULL;
2180: b->col = NULL;
2181: b->icol = NULL;
2182: b->reallocs = 0;
2183: b->ignorezeroentries = PETSC_FALSE;
2184: b->roworiented = PETSC_TRUE;
2185: b->nonew = 0;
2186: b->diag = NULL;
2187: b->solve_work = NULL;
2188: B->spptr = NULL;
2189: b->saved_values = NULL;
2190: b->idiag = NULL;
2191: b->mdiag = NULL;
2192: b->ssor_work = NULL;
2193: b->omega = 1.0;
2194: b->fshift = 0.0;
2195: b->idiagvalid = PETSC_FALSE;
2196: b->keepnonzeropattern = PETSC_FALSE;
2197: b->sliceheight = 0;
2199: PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELL));
2200: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetArray_C", MatSeqSELLGetArray_SeqSELL));
2201: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLRestoreArray_C", MatSeqSELLRestoreArray_SeqSELL));
2202: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatStoreValues_C", MatStoreValues_SeqSELL));
2203: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatRetrieveValues_C", MatRetrieveValues_SeqSELL));
2204: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetPreallocation_C", MatSeqSELLSetPreallocation_SeqSELL));
2205: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqaij_C", MatConvert_SeqSELL_SeqAIJ));
2206: #if defined(PETSC_HAVE_CUDA)
2207: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqsellcuda_C", MatConvert_SeqSELL_SeqSELLCUDA));
2208: #endif
2209: #if defined(PETSC_HAVE_HIP)
2210: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqsellhip_C", MatConvert_SeqSELL_SeqSELLHIP));
2211: #endif
2212: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetFillRatio_C", MatSeqSELLGetFillRatio_SeqSELL));
2213: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetMaxSliceWidth_C", MatSeqSELLGetMaxSliceWidth_SeqSELL));
2214: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetAvgSliceWidth_C", MatSeqSELLGetAvgSliceWidth_SeqSELL));
2215: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetVarSliceSize_C", MatSeqSELLGetVarSliceSize_SeqSELL));
2216: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetSliceHeight_C", MatSeqSELLSetSliceHeight_SeqSELL));
2218: PetscObjectOptionsBegin((PetscObject)B);
2219: {
2220: PetscInt newsh = -1;
2221: PetscBool flg;
2222: #if defined(PETSC_HAVE_CUPM)
2223: PetscInt chunksize = 0;
2224: #endif
2226: PetscCall(PetscOptionsInt("-mat_sell_slice_height", "Set the slice height used to store SELL matrix", "MatSELLSetSliceHeight", newsh, &newsh, &flg));
2227: if (flg) { PetscCall(MatSeqSELLSetSliceHeight(B, newsh)); }
2228: #if defined(PETSC_HAVE_CUPM)
2229: PetscCall(PetscOptionsInt("-mat_sell_chunk_size", "Set the chunksize for load-balanced CUDA/HIP kernels. Choices include 64,128,256,512,1024", NULL, chunksize, &chunksize, &flg));
2230: if (flg) {
2231: PetscCheck(chunksize >= 64 && chunksize <= 1024 && chunksize % 64 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "chunksize must be a number in {64,128,256,512,1024}: value %" PetscInt_FMT, chunksize);
2232: b->chunksize = chunksize;
2233: }
2234: #endif
2235: }
2236: PetscOptionsEnd();
2237: PetscFunctionReturn(PETSC_SUCCESS);
2238: }
2240: /*
2241: Given a matrix generated with MatGetFactor() duplicates all the information in A into B
2242: */
2243: static PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C, Mat A, MatDuplicateOption cpvalues, PetscBool mallocmatspace)
2244: {
2245: Mat_SeqSELL *c = (Mat_SeqSELL *)C->data, *a = (Mat_SeqSELL *)A->data;
2246: PetscInt i, m = A->rmap->n;
2247: PetscInt totalslices = a->totalslices;
2249: PetscFunctionBegin;
2250: C->factortype = A->factortype;
2251: c->row = NULL;
2252: c->col = NULL;
2253: c->icol = NULL;
2254: c->reallocs = 0;
2255: C->assembled = PETSC_TRUE;
2257: PetscCall(PetscLayoutReference(A->rmap, &C->rmap));
2258: PetscCall(PetscLayoutReference(A->cmap, &C->cmap));
2260: c->sliceheight = a->sliceheight;
2261: PetscCall(PetscMalloc1(c->sliceheight * totalslices, &c->rlen));
2262: PetscCall(PetscMalloc1(totalslices + 1, &c->sliidx));
2264: for (i = 0; i < m; i++) c->rlen[i] = a->rlen[i];
2265: for (i = 0; i < totalslices + 1; i++) c->sliidx[i] = a->sliidx[i];
2267: /* allocate the matrix space */
2268: if (mallocmatspace) {
2269: PetscCall(PetscMalloc2(a->maxallocmat, &c->val, a->maxallocmat, &c->colidx));
2271: c->singlemalloc = PETSC_TRUE;
2273: if (m > 0) {
2274: PetscCall(PetscArraycpy(c->colidx, a->colidx, a->maxallocmat));
2275: if (cpvalues == MAT_COPY_VALUES) {
2276: PetscCall(PetscArraycpy(c->val, a->val, a->maxallocmat));
2277: } else {
2278: PetscCall(PetscArrayzero(c->val, a->maxallocmat));
2279: }
2280: }
2281: }
2283: c->ignorezeroentries = a->ignorezeroentries;
2284: c->roworiented = a->roworiented;
2285: c->nonew = a->nonew;
2286: if (a->diag) {
2287: PetscCall(PetscMalloc1(m, &c->diag));
2288: for (i = 0; i < m; i++) c->diag[i] = a->diag[i];
2289: } else c->diag = NULL;
2291: c->solve_work = NULL;
2292: c->saved_values = NULL;
2293: c->idiag = NULL;
2294: c->ssor_work = NULL;
2295: c->keepnonzeropattern = a->keepnonzeropattern;
2296: c->free_val = PETSC_TRUE;
2297: c->free_colidx = PETSC_TRUE;
2299: c->maxallocmat = a->maxallocmat;
2300: c->maxallocrow = a->maxallocrow;
2301: c->rlenmax = a->rlenmax;
2302: c->nz = a->nz;
2303: C->preallocated = PETSC_TRUE;
2305: c->nonzerorowcnt = a->nonzerorowcnt;
2306: C->nonzerostate = A->nonzerostate;
2308: PetscCall(PetscFunctionListDuplicate(((PetscObject)A)->qlist, &((PetscObject)C)->qlist));
2309: PetscFunctionReturn(PETSC_SUCCESS);
2310: }
2312: PetscErrorCode MatDuplicate_SeqSELL(Mat A, MatDuplicateOption cpvalues, Mat *B)
2313: {
2314: PetscFunctionBegin;
2315: PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
2316: PetscCall(MatSetSizes(*B, A->rmap->n, A->cmap->n, A->rmap->n, A->cmap->n));
2317: if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) PetscCall(MatSetBlockSizesFromMats(*B, A, A));
2318: PetscCall(MatSetType(*B, ((PetscObject)A)->type_name));
2319: PetscCall(MatDuplicateNoCreate_SeqSELL(*B, A, cpvalues, PETSC_TRUE));
2320: PetscFunctionReturn(PETSC_SUCCESS);
2321: }
2323: /*MC
2324: MATSEQSELL - MATSEQSELL = "seqsell" - A matrix type to be used for sequential sparse matrices,
2325: based on the sliced Ellpack format, {cite}`zhangellpack2018`
2327: Options Database Key:
. -mat_type seqsell - sets the matrix type to `MATSEQSELL` during a call to `MatSetFromOptions()`
2330: Level: beginner
2332: .seealso: `Mat`, `MatCreateSeqSELL()`, `MATSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATAIJ`, `MATMPIAIJ`
2333: M*/
2335: /*MC
2336: MATSELL - MATSELL = "sell" - A matrix type to be used for sparse matrices, {cite}`zhangellpack2018`
2338: This matrix type is identical to `MATSEQSELL` when constructed with a single process communicator,
2339: and `MATMPISELL` otherwise. As a result, for single process communicators,
2340: `MatSeqSELLSetPreallocation()` is supported, and similarly `MatMPISELLSetPreallocation()` is supported
2341: for communicators controlling multiple processes. It is recommended that you call both of
2342: the above preallocation routines for simplicity.
2344: Options Database Key:
2345: . -mat_type sell - sets the matrix type to "sell" during a call to MatSetFromOptions()
2347: Level: beginner
2349: Notes:
2350: This format is only supported for real scalars, double precision, and 32-bit indices (the defaults).
  It can provide better performance on Intel and AMD processors with AVX2 or AVX512 support for matrices that have a similar number of
2353: non-zeros in contiguous groups of rows. However if the computation is memory bandwidth limited it may not provide much improvement.
2355: Developer Notes:
2356: On Intel (and AMD) systems some of the matrix operations use SIMD (AVX) instructions to achieve higher performance.
  The sparse matrix format is as follows. For simplicity we assume a slice size of 2; it is actually 8.
2359: .vb
                            (2 0  3 4)
   Consider the matrix A =  (5 0  6 0)
                            (0 0  7 8)
                            (0 0  9 9)

   symbolically the Ellpack format can be written as

        (2 3 4 |)           (0 2 3 |)
   v =  (5 6 0 |)  colidx = (0 2 2 |)
        --------            ---------
        (7 8 |)             (2 3 |)
        (9 9 |)             (2 3 |)
2373: The data for 2 contiguous rows of the matrix are stored together (in column-major format) (with any left-over rows handled as a special case).
   Any rows in a slice that have fewer columns than the rest of the slice (row 1 above) are padded with a previous valid column in their "extra" colidx[] locations and
2375: zeros in their "extra" v locations so that the matrix operations do not need special code to handle different length rows within the 2 rows in a slice.
2377: The one-dimensional representation of v used in the code is (2 5 3 6 4 0 7 9 8 9) and for colidx is (0 0 2 2 3 2 2 2 3 3)
2379: .ve
2381: See `MatMult_SeqSELL()` for how this format is used with the SIMD operations to achieve high performance.
2383: .seealso: `Mat`, `MatCreateSeqSELL()`, `MatCreateSeqAIJ()`, `MatCreateSELL()`, `MATSEQSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATMPIAIJ`, `MATAIJ`
2384: M*/
/*@
  MatCreateSeqSELL - Creates a sparse matrix in `MATSEQSELL` format.

  Collective

  Input Parameters:
+ comm    - MPI communicator, set to `PETSC_COMM_SELF`
. m       - number of rows
. n       - number of columns
. rlenmax - maximum number of nonzeros in a row, ignored if `rlen` is provided
- rlen    - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`

  Output Parameter:
. A - the matrix

  Level: intermediate

  Notes:
  It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
  MatXXXXSetPreallocation() paradigm instead of this routine directly.
  [MatXXXXSetPreallocation() is, for example, `MatSeqSELLSetPreallocation()`]

  Specify the preallocated storage with either `rlenmax` or `rlen` (not both).
  Set `rlenmax` = `PETSC_DEFAULT` and `rlen` = `NULL` for PETSc to control dynamic memory
  allocation.

.seealso: `Mat`, `MATSEQSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatSeqSELLSetPreallocation()`, `MATSELL`, `MATMPISELL`
@*/
PetscErrorCode MatCreateSeqSELL(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt rlenmax, const PetscInt rlen[], Mat *A)
{
  PetscFunctionBegin;
  PetscCall(MatCreate(comm, A));
  /* sequential matrix: local and global sizes coincide */
  PetscCall(MatSetSizes(*A, m, n, m, n));
  PetscCall(MatSetType(*A, MATSEQSELL));
  PetscCall(MatSeqSELLSetPreallocation_SeqSELL(*A, rlenmax, rlen));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2424: PetscErrorCode MatEqual_SeqSELL(Mat A, Mat B, PetscBool *flg)
2425: {
2426: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data, *b = (Mat_SeqSELL *)B->data;
2427: PetscInt totalslices = a->totalslices;
2429: PetscFunctionBegin;
2430: /* If the matrix dimensions are not equal,or no of nonzeros */
2431: if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) || (a->nz != b->nz) || (a->rlenmax != b->rlenmax)) {
2432: *flg = PETSC_FALSE;
2433: PetscFunctionReturn(PETSC_SUCCESS);
2434: }
2435: /* if the a->colidx are the same */
2436: PetscCall(PetscArraycmp(a->colidx, b->colidx, a->sliidx[totalslices], flg));
2437: if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);
2438: /* if a->val are the same */
2439: PetscCall(PetscArraycmp(a->val, b->val, a->sliidx[totalslices], flg));
2440: PetscFunctionReturn(PETSC_SUCCESS);
2441: }
2443: PetscErrorCode MatSeqSELLInvalidateDiagonal(Mat A)
2444: {
2445: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
2447: PetscFunctionBegin;
2448: a->idiagvalid = PETSC_FALSE;
2449: PetscFunctionReturn(PETSC_SUCCESS);
2450: }
2452: PetscErrorCode MatConjugate_SeqSELL(Mat A)
2453: {
2454: #if defined(PETSC_USE_COMPLEX)
2455: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
2456: PetscInt i;
2457: PetscScalar *val = a->val;
2459: PetscFunctionBegin;
2460: for (i = 0; i < a->sliidx[a->totalslices]; i++) { val[i] = PetscConj(val[i]); }
2461: #if defined(PETSC_HAVE_CUPM)
2462: if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU;
2463: #endif
2464: #else
2465: PetscFunctionBegin;
2466: #endif
2467: PetscFunctionReturn(PETSC_SUCCESS);
2468: }