Actual source code: sell.c
1: /*
2: Defines the basic matrix operations for the SELL matrix storage format.
3: */
4: #include <../src/mat/impls/sell/seq/sell.h>
5: #include <petscblaslapack.h>
6: #include <petsc/private/kernels/blocktranspose.h>
8: static PetscBool cited = PETSC_FALSE;
9: static const char citation[] = "@inproceedings{ZhangELLPACK2018,\n"
10: " author = {Hong Zhang and Richard T. Mills and Karl Rupp and Barry F. Smith},\n"
11: " title = {Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}},\n"
12: " booktitle = {Proceedings of the 47th International Conference on Parallel Processing},\n"
13: " year = 2018\n"
14: "}\n";
16: #if defined(PETSC_HAVE_IMMINTRIN_H) && (defined(__AVX512F__) || (defined(__AVX2__) && defined(__FMA__)) || defined(__AVX__)) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
18: #include <immintrin.h>
20: #if !defined(_MM_SCALE_8)
21: #define _MM_SCALE_8 8
22: #endif
24: #if defined(__AVX512F__)
25: /* these do not work
26: vec_idx = _mm512_loadunpackhi_epi32(vec_idx,acolidx);
27: vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval);
28: */
29: #define AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \
30: /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \
31: vec_idx = _mm256_loadu_si256((__m256i const *)acolidx); \
32: vec_vals = _mm512_loadu_pd(aval); \
33: vec_x = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8); \
34: vec_y = _mm512_fmadd_pd(vec_x, vec_vals, vec_y)
35: #elif defined(__AVX2__) && defined(__FMA__)
36: #define AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \
37: vec_vals = _mm256_loadu_pd(aval); \
38: vec_idx = _mm_loadu_si128((__m128i const *)acolidx); /* SSE2 */ \
39: vec_x = _mm256_i32gather_pd(x, vec_idx, _MM_SCALE_8); \
40: vec_y = _mm256_fmadd_pd(vec_x, vec_vals, vec_y)
41: #endif
42: #endif /* PETSC_HAVE_IMMINTRIN_H */
44: /*@
45: MatSeqSELLSetPreallocation - For good matrix assembly performance
46: the user should preallocate the matrix storage by setting the parameter `nz`
47: (or the array `nnz`).
49: Collective
51: Input Parameters:
52: + B - The `MATSEQSELL` matrix
53: . rlenmax - number of nonzeros per row (same for all rows), ignored if `rlen` is provided
54: - rlen - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
56: Level: intermediate
58: Notes:
59: Specify the preallocated storage with either `rlenmax` or `rlen` (not both).
60: Set `rlenmax` = `PETSC_DEFAULT` and `rlen` = `NULL` for PETSc to control dynamic memory
61: allocation.
63: You can call `MatGetInfo()` to get information on how effective the preallocation was;
64: for example the fields mallocs,nz_allocated,nz_used,nz_unneeded;
65: You can also run with the option `-info` and look for messages with the string
66: malloc in them to see if additional memory allocation was needed.
68: Developer Notes:
69: Use `rlenmax` of `MAT_SKIP_ALLOCATION` to not allocate any space for the matrix
70: entries or columns indices.
72: The maximum number of nonzeos in any row should be as accurate as possible.
73: If it is underestimated, you will get bad performance due to reallocation
74: (`MatSeqXSELLReallocateSELL()`).
76: .seealso: `Mat`, `MATSEQSELL`, `MATSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatGetInfo()`
77: @*/
78: PetscErrorCode MatSeqSELLSetPreallocation(Mat B, PetscInt rlenmax, const PetscInt rlen[])
79: {
80: PetscFunctionBegin;
83: PetscTryMethod(B, "MatSeqSELLSetPreallocation_C", (Mat, PetscInt, const PetscInt[]), (B, rlenmax, rlen));
84: PetscFunctionReturn(PETSC_SUCCESS);
85: }
87: PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B, PetscInt maxallocrow, const PetscInt rlen[])
88: {
89: Mat_SeqSELL *b;
90: PetscInt i, j, totalslices;
91: #if defined(PETSC_HAVE_CUPM)
92: PetscInt rlenmax = 0;
93: #endif
94: PetscBool skipallocation = PETSC_FALSE, realalloc = PETSC_FALSE;
96: PetscFunctionBegin;
97: if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE;
98: if (maxallocrow == MAT_SKIP_ALLOCATION) {
99: skipallocation = PETSC_TRUE;
100: maxallocrow = 0;
101: }
103: PetscCall(PetscLayoutSetUp(B->rmap));
104: PetscCall(PetscLayoutSetUp(B->cmap));
106: /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */
107: if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5;
108: PetscCheck(maxallocrow >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "maxallocrow cannot be less than 0: value %" PetscInt_FMT, maxallocrow);
109: if (rlen) {
110: for (i = 0; i < B->rmap->n; i++) {
111: PetscCheck(rlen[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, rlen[i]);
112: PetscCheck(rlen[i] <= B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be greater than row length: local row %" PetscInt_FMT " value %" PetscInt_FMT " rowlength %" PetscInt_FMT, i, rlen[i], B->cmap->n);
113: }
114: }
116: B->preallocated = PETSC_TRUE;
118: b = (Mat_SeqSELL *)B->data;
120: if (!b->sliceheight) { /* not set yet */
121: #if defined(PETSC_HAVE_CUPM)
122: b->sliceheight = 16;
123: #else
124: b->sliceheight = 8;
125: #endif
126: }
127: totalslices = PetscCeilInt(B->rmap->n, b->sliceheight);
128: b->totalslices = totalslices;
129: if (!skipallocation) {
130: if (B->rmap->n % b->sliceheight) PetscCall(PetscInfo(B, "Padding rows to the SEQSELL matrix because the number of rows is not the multiple of the slice height (value %" PetscInt_FMT ")\n", B->rmap->n));
132: if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */
133: PetscCall(PetscMalloc1(totalslices + 1, &b->sliidx));
134: }
135: if (!rlen) { /* if rlen is not provided, allocate same space for all the slices */
136: if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10;
137: else if (maxallocrow < 0) maxallocrow = 1;
138: #if defined(PETSC_HAVE_CUPM)
139: rlenmax = maxallocrow;
140: /* Pad the slice to DEVICE_MEM_ALIGN */
141: while (b->sliceheight * maxallocrow % DEVICE_MEM_ALIGN) maxallocrow++;
142: #endif
143: for (i = 0; i <= totalslices; i++) b->sliidx[i] = b->sliceheight * i * maxallocrow;
144: } else {
145: #if defined(PETSC_HAVE_CUPM)
146: PetscInt mul = DEVICE_MEM_ALIGN / b->sliceheight;
147: #endif
148: maxallocrow = 0;
149: b->sliidx[0] = 0;
150: for (i = 1; i < totalslices; i++) {
151: b->sliidx[i] = 0;
152: for (j = 0; j < b->sliceheight; j++) b->sliidx[i] = PetscMax(b->sliidx[i], rlen[b->sliceheight * (i - 1) + j]);
153: #if defined(PETSC_HAVE_CUPM)
154: if (mul != 0) { /* Pad the slice to DEVICE_MEM_ALIGN if sliceheight < DEVICE_MEM_ALIGN */
155: rlenmax = PetscMax(b->sliidx[i], rlenmax);
156: b->sliidx[i] = ((b->sliidx[i] - 1) / mul + 1) * mul;
157: }
158: #endif
159: maxallocrow = PetscMax(b->sliidx[i], maxallocrow);
160: PetscCall(PetscIntSumError(b->sliidx[i - 1], b->sliceheight * b->sliidx[i], &b->sliidx[i]));
161: }
162: /* last slice */
163: b->sliidx[totalslices] = 0;
164: for (j = b->sliceheight * (totalslices - 1); j < B->rmap->n; j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices], rlen[j]);
165: #if defined(PETSC_HAVE_CUPM)
166: if (mul != 0) {
167: rlenmax = PetscMax(b->sliidx[i], rlenmax);
168: b->sliidx[totalslices] = ((b->sliidx[totalslices] - 1) / mul + 1) * mul;
169: }
170: #endif
171: maxallocrow = PetscMax(b->sliidx[totalslices], maxallocrow);
172: b->sliidx[totalslices] = b->sliidx[totalslices - 1] + b->sliceheight * b->sliidx[totalslices];
173: }
175: /* allocate space for val, colidx, rlen */
176: /* FIXME: should B's old memory be unlogged? */
177: PetscCall(MatSeqXSELLFreeSELL(B, &b->val, &b->colidx));
178: /* FIXME: assuming an element of the bit array takes 8 bits */
179: PetscCall(PetscMalloc2(b->sliidx[totalslices], &b->val, b->sliidx[totalslices], &b->colidx));
180: /* b->rlen will count nonzeros in each row so far. We dont copy rlen to b->rlen because the matrix has not been set. */
181: PetscCall(PetscCalloc1(b->sliceheight * totalslices, &b->rlen));
183: b->singlemalloc = PETSC_TRUE;
184: b->free_val = PETSC_TRUE;
185: b->free_colidx = PETSC_TRUE;
186: } else {
187: b->free_val = PETSC_FALSE;
188: b->free_colidx = PETSC_FALSE;
189: }
191: b->nz = 0;
192: b->maxallocrow = maxallocrow;
193: #if defined(PETSC_HAVE_CUPM)
194: b->rlenmax = rlenmax;
195: #else
196: b->rlenmax = maxallocrow;
197: #endif
198: b->maxallocmat = b->sliidx[totalslices];
199: B->info.nz_unneeded = (double)b->maxallocmat;
200: if (realalloc) PetscCall(MatSetOption(B, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE));
201: PetscFunctionReturn(PETSC_SUCCESS);
202: }
204: static PetscErrorCode MatGetRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
205: {
206: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
207: PetscInt shift;
209: PetscFunctionBegin;
210: PetscCheck(row >= 0 && row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row %" PetscInt_FMT " out of range", row);
211: if (nz) *nz = a->rlen[row];
212: shift = a->sliidx[row / a->sliceheight] + (row % a->sliceheight);
213: if (!a->getrowcols) PetscCall(PetscMalloc2(a->rlenmax, &a->getrowcols, a->rlenmax, &a->getrowvals));
214: if (idx) {
215: PetscInt j;
216: for (j = 0; j < a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift + a->sliceheight * j];
217: *idx = a->getrowcols;
218: }
219: if (v) {
220: for (PetscInt j = 0; j < a->rlen[row]; j++) a->getrowvals[j] = a->val[shift + a->sliceheight * j];
221: *v = a->getrowvals;
222: }
223: PetscFunctionReturn(PETSC_SUCCESS);
224: }
226: static PetscErrorCode MatRestoreRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
227: {
228: PetscFunctionBegin;
229: PetscFunctionReturn(PETSC_SUCCESS);
230: }
232: PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype, MatReuse reuse, Mat *newmat)
233: {
234: Mat B;
235: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
236: PetscInt i;
238: PetscFunctionBegin;
239: if (reuse == MAT_REUSE_MATRIX) {
240: B = *newmat;
241: PetscCall(MatZeroEntries(B));
242: } else {
243: PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B));
244: PetscCall(MatSetSizes(B, A->rmap->n, A->cmap->n, A->rmap->N, A->cmap->N));
245: PetscCall(MatSetType(B, MATSEQAIJ));
246: PetscCall(MatSeqAIJSetPreallocation(B, 0, a->rlen));
247: }
249: for (i = 0; i < A->rmap->n; i++) {
250: PetscInt nz = 0, *cols = NULL;
251: PetscScalar *vals = NULL;
253: PetscCall(MatGetRow_SeqSELL(A, i, &nz, &cols, &vals));
254: PetscCall(MatSetValues(B, 1, &i, nz, cols, vals, INSERT_VALUES));
255: PetscCall(MatRestoreRow_SeqSELL(A, i, &nz, &cols, &vals));
256: }
258: PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
259: PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
260: B->rmap->bs = A->rmap->bs;
262: if (reuse == MAT_INPLACE_MATRIX) {
263: PetscCall(MatHeaderReplace(A, &B));
264: } else {
265: *newmat = B;
266: }
267: PetscFunctionReturn(PETSC_SUCCESS);
268: }
270: #include <../src/mat/impls/aij/seq/aij.h>
272: PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A, MatType newtype, MatReuse reuse, Mat *newmat)
273: {
274: Mat B;
275: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
276: PetscInt *ai = a->i, m = A->rmap->N, n = A->cmap->N, i, *rowlengths, row, ncols;
277: const PetscInt *cols;
278: const PetscScalar *vals;
280: PetscFunctionBegin;
281: if (reuse == MAT_REUSE_MATRIX) {
282: B = *newmat;
283: } else {
284: if (PetscDefined(USE_DEBUG) || !a->ilen) {
285: PetscCall(PetscMalloc1(m, &rowlengths));
286: for (i = 0; i < m; i++) rowlengths[i] = ai[i + 1] - ai[i];
287: }
288: if (PetscDefined(USE_DEBUG) && a->ilen) {
289: PetscBool eq;
290: PetscCall(PetscArraycmp(rowlengths, a->ilen, m, &eq));
291: PetscCheck(eq, PETSC_COMM_SELF, PETSC_ERR_PLIB, "SeqAIJ ilen array incorrect");
292: PetscCall(PetscFree(rowlengths));
293: rowlengths = a->ilen;
294: } else if (a->ilen) rowlengths = a->ilen;
295: PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B));
296: PetscCall(MatSetSizes(B, m, n, m, n));
297: PetscCall(MatSetType(B, MATSEQSELL));
298: PetscCall(MatSeqSELLSetPreallocation(B, 0, rowlengths));
299: if (rowlengths != a->ilen) PetscCall(PetscFree(rowlengths));
300: }
302: for (row = 0; row < m; row++) {
303: PetscCall(MatGetRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals));
304: PetscCall(MatSetValues_SeqSELL(B, 1, &row, ncols, cols, vals, INSERT_VALUES));
305: PetscCall(MatRestoreRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals));
306: }
307: PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
308: PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
309: B->rmap->bs = A->rmap->bs;
311: if (reuse == MAT_INPLACE_MATRIX) {
312: PetscCall(MatHeaderReplace(A, &B));
313: } else {
314: *newmat = B;
315: }
316: PetscFunctionReturn(PETSC_SUCCESS);
317: }
319: PetscErrorCode MatMult_SeqSELL(Mat A, Vec xx, Vec yy)
320: {
321: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
322: PetscScalar *y;
323: const PetscScalar *x;
324: const MatScalar *aval = a->val;
325: PetscInt totalslices = a->totalslices;
326: const PetscInt *acolidx = a->colidx;
327: PetscInt i, j;
328: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
329: __m512d vec_x, vec_y, vec_vals;
330: __m256i vec_idx;
331: __mmask8 mask;
332: __m512d vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4;
333: __m256i vec_idx2, vec_idx3, vec_idx4;
334: #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
335: __m128i vec_idx;
336: __m256d vec_x, vec_y, vec_y2, vec_vals;
337: MatScalar yval;
338: PetscInt r, rows_left, row, nnz_in_row;
339: #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
340: __m128d vec_x_tmp;
341: __m256d vec_x, vec_y, vec_y2, vec_vals;
342: MatScalar yval;
343: PetscInt r, rows_left, row, nnz_in_row;
344: #else
345: PetscInt k, sliceheight = a->sliceheight;
346: PetscScalar *sum;
347: #endif
349: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
350: #pragma disjoint(*x, *y, *aval)
351: #endif
353: PetscFunctionBegin;
354: PetscCall(VecGetArrayRead(xx, &x));
355: PetscCall(VecGetArray(yy, &y));
356: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
357: PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
358: for (i = 0; i < totalslices; i++) { /* loop over slices */
359: PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
360: PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
362: vec_y = _mm512_setzero_pd();
363: vec_y2 = _mm512_setzero_pd();
364: vec_y3 = _mm512_setzero_pd();
365: vec_y4 = _mm512_setzero_pd();
367: j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */
368: switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) {
369: case 3:
370: AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
371: acolidx += 8;
372: aval += 8;
373: AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
374: acolidx += 8;
375: aval += 8;
376: AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
377: acolidx += 8;
378: aval += 8;
379: j += 3;
380: break;
381: case 2:
382: AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
383: acolidx += 8;
384: aval += 8;
385: AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
386: acolidx += 8;
387: aval += 8;
388: j += 2;
389: break;
390: case 1:
391: AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
392: acolidx += 8;
393: aval += 8;
394: j += 1;
395: break;
396: }
397: #pragma novector
398: for (; j < (a->sliidx[i + 1] >> 3); j += 4) {
399: AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
400: acolidx += 8;
401: aval += 8;
402: AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
403: acolidx += 8;
404: aval += 8;
405: AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
406: acolidx += 8;
407: aval += 8;
408: AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4);
409: acolidx += 8;
410: aval += 8;
411: }
413: vec_y = _mm512_add_pd(vec_y, vec_y2);
414: vec_y = _mm512_add_pd(vec_y, vec_y3);
415: vec_y = _mm512_add_pd(vec_y, vec_y4);
416: if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
417: mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07)));
418: _mm512_mask_storeu_pd(&y[8 * i], mask, vec_y);
419: } else {
420: _mm512_storeu_pd(&y[8 * i], vec_y);
421: }
422: }
423: #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
424: PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
425: for (i = 0; i < totalslices; i++) { /* loop over full slices */
426: PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
427: PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
429: /* last slice may have padding rows. Don't use vectorization. */
430: if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
431: rows_left = A->rmap->n - 8 * i;
432: for (r = 0; r < rows_left; ++r) {
433: yval = (MatScalar)0;
434: row = 8 * i + r;
435: nnz_in_row = a->rlen[row];
436: for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
437: y[row] = yval;
438: }
439: break;
440: }
442: vec_y = _mm256_setzero_pd();
443: vec_y2 = _mm256_setzero_pd();
445: /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
446: #pragma novector
447: #pragma unroll(2)
448: for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
449: AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
450: aval += 4;
451: acolidx += 4;
452: AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y2);
453: aval += 4;
454: acolidx += 4;
455: }
457: _mm256_storeu_pd(y + i * 8, vec_y);
458: _mm256_storeu_pd(y + i * 8 + 4, vec_y2);
459: }
460: #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
461: PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
462: for (i = 0; i < totalslices; i++) { /* loop over full slices */
463: PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
464: PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
466: vec_y = _mm256_setzero_pd();
467: vec_y2 = _mm256_setzero_pd();
469: /* last slice may have padding rows. Don't use vectorization. */
470: if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
471: rows_left = A->rmap->n - 8 * i;
472: for (r = 0; r < rows_left; ++r) {
473: yval = (MatScalar)0;
474: row = 8 * i + r;
475: nnz_in_row = a->rlen[row];
476: for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
477: y[row] = yval;
478: }
479: break;
480: }
482: /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
483: #pragma novector
484: #pragma unroll(2)
485: for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
486: vec_vals = _mm256_loadu_pd(aval);
487: vec_x_tmp = _mm_setzero_pd();
488: vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
489: vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
490: vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
491: vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
492: vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
493: vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
494: vec_y = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y);
495: aval += 4;
497: vec_vals = _mm256_loadu_pd(aval);
498: vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
499: vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
500: vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
501: vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
502: vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
503: vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
504: vec_y2 = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2);
505: aval += 4;
506: }
508: _mm256_storeu_pd(y + i * 8, vec_y);
509: _mm256_storeu_pd(y + i * 8 + 4, vec_y2);
510: }
511: #else
512: PetscCall(PetscMalloc1(sliceheight, &sum));
513: for (i = 0; i < totalslices; i++) { /* loop over slices */
514: for (j = 0; j < sliceheight; j++) {
515: sum[j] = 0.0;
516: for (k = a->sliidx[i] + j; k < a->sliidx[i + 1]; k += sliceheight) sum[j] += aval[k] * x[acolidx[k]];
517: }
518: if (i == totalslices - 1 && (A->rmap->n % sliceheight)) { /* if last slice has padding rows */
519: for (j = 0; j < (A->rmap->n % sliceheight); j++) y[sliceheight * i + j] = sum[j];
520: } else {
521: for (j = 0; j < sliceheight; j++) y[sliceheight * i + j] = sum[j];
522: }
523: }
524: PetscCall(PetscFree(sum));
525: #endif
527: PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt)); /* theoretical minimal FLOPs */
528: PetscCall(VecRestoreArrayRead(xx, &x));
529: PetscCall(VecRestoreArray(yy, &y));
530: PetscFunctionReturn(PETSC_SUCCESS);
531: }
533: #include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h>
534: PetscErrorCode MatMultAdd_SeqSELL(Mat A, Vec xx, Vec yy, Vec zz)
535: {
536: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
537: PetscScalar *y, *z;
538: const PetscScalar *x;
539: const MatScalar *aval = a->val;
540: PetscInt totalslices = a->totalslices;
541: const PetscInt *acolidx = a->colidx;
542: PetscInt i, j;
543: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
544: __m512d vec_x, vec_y, vec_vals;
545: __m256i vec_idx;
546: __mmask8 mask = 0;
547: __m512d vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4;
548: __m256i vec_idx2, vec_idx3, vec_idx4;
549: #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
550: __m128d vec_x_tmp;
551: __m256d vec_x, vec_y, vec_y2, vec_vals;
552: MatScalar yval;
553: PetscInt r, row, nnz_in_row;
554: #else
555: PetscInt k, sliceheight = a->sliceheight;
556: PetscScalar *sum;
557: #endif
559: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
560: #pragma disjoint(*x, *y, *aval)
561: #endif
563: PetscFunctionBegin;
564: if (!a->nz) {
565: PetscCall(VecCopy(yy, zz));
566: PetscFunctionReturn(PETSC_SUCCESS);
567: }
568: PetscCall(VecGetArrayRead(xx, &x));
569: PetscCall(VecGetArrayPair(yy, zz, &y, &z));
570: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
571: PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
572: for (i = 0; i < totalslices; i++) { /* loop over slices */
573: PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
574: PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
576: if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
577: mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07)));
578: vec_y = _mm512_mask_loadu_pd(vec_y, mask, &y[8 * i]);
579: } else {
580: vec_y = _mm512_loadu_pd(&y[8 * i]);
581: }
582: vec_y2 = _mm512_setzero_pd();
583: vec_y3 = _mm512_setzero_pd();
584: vec_y4 = _mm512_setzero_pd();
586: j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */
587: switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) {
588: case 3:
589: AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
590: acolidx += 8;
591: aval += 8;
592: AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
593: acolidx += 8;
594: aval += 8;
595: AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
596: acolidx += 8;
597: aval += 8;
598: j += 3;
599: break;
600: case 2:
601: AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
602: acolidx += 8;
603: aval += 8;
604: AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
605: acolidx += 8;
606: aval += 8;
607: j += 2;
608: break;
609: case 1:
610: AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
611: acolidx += 8;
612: aval += 8;
613: j += 1;
614: break;
615: }
616: #pragma novector
617: for (; j < (a->sliidx[i + 1] >> 3); j += 4) {
618: AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
619: acolidx += 8;
620: aval += 8;
621: AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
622: acolidx += 8;
623: aval += 8;
624: AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
625: acolidx += 8;
626: aval += 8;
627: AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4);
628: acolidx += 8;
629: aval += 8;
630: }
632: vec_y = _mm512_add_pd(vec_y, vec_y2);
633: vec_y = _mm512_add_pd(vec_y, vec_y3);
634: vec_y = _mm512_add_pd(vec_y, vec_y4);
635: if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
636: _mm512_mask_storeu_pd(&z[8 * i], mask, vec_y);
637: } else {
638: _mm512_storeu_pd(&z[8 * i], vec_y);
639: }
640: }
641: #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
642: PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
643: for (i = 0; i < totalslices; i++) { /* loop over full slices */
644: PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
645: PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
647: /* last slice may have padding rows. Don't use vectorization. */
648: if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
649: for (r = 0; r < (A->rmap->n & 0x07); ++r) {
650: row = 8 * i + r;
651: yval = (MatScalar)0.0;
652: nnz_in_row = a->rlen[row];
653: for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
654: z[row] = y[row] + yval;
655: }
656: break;
657: }
659: vec_y = _mm256_loadu_pd(y + 8 * i);
660: vec_y2 = _mm256_loadu_pd(y + 8 * i + 4);
662: /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
663: for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
664: vec_vals = _mm256_loadu_pd(aval);
665: vec_x_tmp = _mm_setzero_pd();
666: vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
667: vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
668: vec_x = _mm256_setzero_pd();
669: vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
670: vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
671: vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
672: vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
673: vec_y = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y);
674: aval += 4;
676: vec_vals = _mm256_loadu_pd(aval);
677: vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
678: vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
679: vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
680: vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
681: vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
682: vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
683: vec_y2 = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2);
684: aval += 4;
685: }
687: _mm256_storeu_pd(z + i * 8, vec_y);
688: _mm256_storeu_pd(z + i * 8 + 4, vec_y2);
689: }
690: #else
691: PetscCall(PetscMalloc1(sliceheight, &sum));
692: for (i = 0; i < totalslices; i++) { /* loop over slices */
693: for (j = 0; j < sliceheight; j++) {
694: sum[j] = 0.0;
695: for (k = a->sliidx[i] + j; k < a->sliidx[i + 1]; k += sliceheight) sum[j] += aval[k] * x[acolidx[k]];
696: }
697: if (i == totalslices - 1 && (A->rmap->n % sliceheight)) {
698: for (j = 0; j < (A->rmap->n % sliceheight); j++) z[sliceheight * i + j] = y[sliceheight * i + j] + sum[j];
699: } else {
700: for (j = 0; j < sliceheight; j++) z[sliceheight * i + j] = y[sliceheight * i + j] + sum[j];
701: }
702: }
703: PetscCall(PetscFree(sum));
704: #endif
706: PetscCall(PetscLogFlops(2.0 * a->nz));
707: PetscCall(VecRestoreArrayRead(xx, &x));
708: PetscCall(VecRestoreArrayPair(yy, zz, &y, &z));
709: PetscFunctionReturn(PETSC_SUCCESS);
710: }
712: PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A, Vec xx, Vec zz, Vec yy)
713: {
714: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
715: PetscScalar *y;
716: const PetscScalar *x;
717: const MatScalar *aval = a->val;
718: const PetscInt *acolidx = a->colidx;
719: PetscInt i, j, r, row, nnz_in_row, totalslices = a->totalslices, sliceheight = a->sliceheight;
721: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
722: #pragma disjoint(*x, *y, *aval)
723: #endif
725: PetscFunctionBegin;
726: if (A->symmetric == PETSC_BOOL3_TRUE) {
727: PetscCall(MatMultAdd_SeqSELL(A, xx, zz, yy));
728: PetscFunctionReturn(PETSC_SUCCESS);
729: }
730: if (zz != yy) PetscCall(VecCopy(zz, yy));
732: if (a->nz) {
733: PetscCall(VecGetArrayRead(xx, &x));
734: PetscCall(VecGetArray(yy, &y));
735: for (i = 0; i < a->totalslices; i++) { /* loop over slices */
736: if (i == totalslices - 1 && (A->rmap->n % sliceheight)) {
737: for (r = 0; r < (A->rmap->n % sliceheight); ++r) {
738: row = sliceheight * i + r;
739: nnz_in_row = a->rlen[row];
740: for (j = 0; j < nnz_in_row; ++j) y[acolidx[sliceheight * j + r]] += aval[sliceheight * j + r] * x[row];
741: }
742: break;
743: }
744: for (r = 0; r < sliceheight; ++r)
745: for (j = a->sliidx[i] + r; j < a->sliidx[i + 1]; j += sliceheight) y[acolidx[j]] += aval[j] * x[sliceheight * i + r];
746: }
747: PetscCall(PetscLogFlops(2.0 * a->nz));
748: PetscCall(VecRestoreArrayRead(xx, &x));
749: PetscCall(VecRestoreArray(yy, &y));
750: }
751: PetscFunctionReturn(PETSC_SUCCESS);
752: }
754: PetscErrorCode MatMultTranspose_SeqSELL(Mat A, Vec xx, Vec yy)
755: {
756: PetscFunctionBegin;
757: if (A->symmetric == PETSC_BOOL3_TRUE) {
758: PetscCall(MatMult_SeqSELL(A, xx, yy));
759: } else {
760: PetscCall(VecSet(yy, 0.0));
761: PetscCall(MatMultTransposeAdd_SeqSELL(A, xx, yy, yy));
762: }
763: PetscFunctionReturn(PETSC_SUCCESS);
764: }
766: static PetscErrorCode MatGetDiagonalMarkers_SeqSELL(Mat A, const PetscInt **diag, PetscBool *diagDense)
767: {
768: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
770: PetscFunctionBegin;
771: if (A->factortype != MAT_FACTOR_NONE) {
772: PetscAssertPointer(diag, 2);
773: PetscCheck(!diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot check for dense diagonal with factored matrices");
774: *diag = a->diag;
775: PetscFunctionReturn(PETSC_SUCCESS);
776: }
777: PetscCheck(diag || diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "At least one of diag or diagDense must be requested");
778: if (a->diagNonzeroState != A->nonzerostate || (diag && !a->diag)) {
779: const PetscInt m = A->rmap->n;
780: PetscInt shift;
782: if (!diag && !a->diag) {
783: a->diagDense = PETSC_TRUE;
784: for (PetscInt i = 0; i < m; i++) {
785: PetscBool found = PETSC_FALSE;
787: shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
788: for (PetscInt j = 0; j < a->rlen[i]; j++) {
789: if (a->colidx[shift + a->sliceheight * j] == i) {
790: a->diag[i] = shift + a->sliceheight * j;
791: found = PETSC_TRUE;
792: break;
793: }
794: }
795: if (!found) {
796: a->diagDense = PETSC_FALSE;
797: *diagDense = a->diagDense;
798: a->diagNonzeroState = A->nonzerostate;
799: PetscFunctionReturn(PETSC_SUCCESS);
800: }
801: }
802: } else {
803: if (!a->diag) PetscCall(PetscMalloc1(m, &a->diag));
804: a->diagDense = PETSC_TRUE;
805: for (PetscInt i = 0; i < m; i++) {
806: PetscBool found = PETSC_FALSE;
808: shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
809: a->diag[i] = -1;
810: for (PetscInt j = 0; j < a->rlen[i]; j++) {
811: if (a->colidx[shift + a->sliceheight * j] == i) {
812: a->diag[i] = shift + a->sliceheight * j;
813: found = PETSC_TRUE;
814: break;
815: }
816: }
817: if (!found) a->diagDense = PETSC_FALSE;
818: }
819: }
820: a->diagNonzeroState = A->nonzerostate;
821: }
822: if (diag) *diag = a->diag;
823: if (diagDense) *diagDense = a->diagDense;
824: PetscFunctionReturn(PETSC_SUCCESS);
825: }
827: /*
828: Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways
829: */
830: static PetscErrorCode MatInvertDiagonalForSOR_SeqSELL(Mat A, PetscScalar omega, PetscScalar fshift)
831: {
832: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
833: PetscInt i, m = A->rmap->n;
834: MatScalar *val = a->val;
835: PetscScalar *idiag, *mdiag;
836: const PetscInt *diag;
837: PetscBool diagDense;
839: PetscFunctionBegin;
840: if (a->idiagState == ((PetscObject)A)->state && a->omega == omega && a->fshift == fshift) PetscFunctionReturn(PETSC_SUCCESS);
841: PetscCall(MatGetDiagonalMarkers_SeqSELL(A, &diag, &diagDense));
842: PetscCheck(diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix must have all diagonal locations to invert them");
844: if (!a->idiag) {
845: PetscCall(PetscMalloc3(m, &a->idiag, m, &a->mdiag, m, &a->ssor_work));
846: val = a->val;
847: }
848: mdiag = a->mdiag;
849: idiag = a->idiag;
851: if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) {
852: for (i = 0; i < m; i++) {
853: mdiag[i] = val[diag[i]];
854: if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */
855: PetscCheck(PetscRealPart(fshift), PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Zero diagonal on row %" PetscInt_FMT, i);
856: PetscCall(PetscInfo(A, "Zero diagonal on row %" PetscInt_FMT "\n", i));
857: A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
858: A->factorerror_zeropivot_value = 0.0;
859: A->factorerror_zeropivot_row = i;
860: }
861: idiag[i] = 1.0 / val[diag[i]];
862: }
863: PetscCall(PetscLogFlops(m));
864: } else {
865: for (i = 0; i < m; i++) {
866: mdiag[i] = val[diag[i]];
867: idiag[i] = omega / (fshift + val[diag[i]]);
868: }
869: PetscCall(PetscLogFlops(2.0 * m));
870: }
871: a->idiagState = ((PetscObject)A)->state;
872: a->omega = omega;
873: a->fshift = fshift;
874: PetscFunctionReturn(PETSC_SUCCESS);
875: }
877: PetscErrorCode MatZeroEntries_SeqSELL(Mat A)
878: {
879: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
881: PetscFunctionBegin;
882: PetscCall(PetscArrayzero(a->val, a->sliidx[a->totalslices]));
883: PetscFunctionReturn(PETSC_SUCCESS);
884: }
886: PetscErrorCode MatDestroy_SeqSELL(Mat A)
887: {
888: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
890: PetscFunctionBegin;
891: PetscCall(PetscLogObjectState((PetscObject)A, "Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT ", NZ=%" PetscInt_FMT, A->rmap->n, A->cmap->n, a->nz));
892: PetscCall(MatSeqXSELLFreeSELL(A, &a->val, &a->colidx));
893: PetscCall(ISDestroy(&a->row));
894: PetscCall(ISDestroy(&a->col));
895: PetscCall(PetscFree(a->diag));
896: PetscCall(PetscFree(a->rlen));
897: PetscCall(PetscFree(a->sliidx));
898: PetscCall(PetscFree3(a->idiag, a->mdiag, a->ssor_work));
899: PetscCall(PetscFree(a->solve_work));
900: PetscCall(ISDestroy(&a->icol));
901: PetscCall(PetscFree(a->saved_values));
902: PetscCall(PetscFree2(a->getrowcols, a->getrowvals));
903: PetscCall(PetscFree(A->data));
904: #if defined(PETSC_HAVE_CUPM)
905: PetscCall(PetscFree(a->chunk_slice_map));
906: #endif
908: PetscCall(PetscObjectChangeTypeName((PetscObject)A, NULL));
909: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatStoreValues_C", NULL));
910: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatRetrieveValues_C", NULL));
911: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetPreallocation_C", NULL));
912: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetArray_C", NULL));
913: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLRestoreArray_C", NULL));
914: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqaij_C", NULL));
915: #if defined(PETSC_HAVE_CUDA)
916: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqsellcuda_C", NULL));
917: #endif
918: #if defined(PETSC_HAVE_HIP)
919: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqsellhip_C", NULL));
920: #endif
921: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetFillRatio_C", NULL));
922: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetMaxSliceWidth_C", NULL));
923: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetAvgSliceWidth_C", NULL));
924: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetVarSliceSize_C", NULL));
925: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetSliceHeight_C", NULL));
926: PetscFunctionReturn(PETSC_SUCCESS);
927: }
929: PetscErrorCode MatSetOption_SeqSELL(Mat A, MatOption op, PetscBool flg)
930: {
931: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
933: PetscFunctionBegin;
934: switch (op) {
935: case MAT_ROW_ORIENTED:
936: a->roworiented = flg;
937: break;
938: case MAT_KEEP_NONZERO_PATTERN:
939: a->keepnonzeropattern = flg;
940: break;
941: case MAT_NEW_NONZERO_LOCATIONS:
942: a->nonew = (flg ? 0 : 1);
943: break;
944: case MAT_NEW_NONZERO_LOCATION_ERR:
945: a->nonew = (flg ? -1 : 0);
946: break;
947: case MAT_NEW_NONZERO_ALLOCATION_ERR:
948: a->nonew = (flg ? -2 : 0);
949: break;
950: case MAT_UNUSED_NONZERO_LOCATION_ERR:
951: a->nounused = (flg ? -1 : 0);
952: break;
953: default:
954: break;
955: }
956: PetscFunctionReturn(PETSC_SUCCESS);
957: }
959: PetscErrorCode MatGetDiagonal_SeqSELL(Mat A, Vec v)
960: {
961: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
962: PetscInt i, j, n, shift;
963: PetscScalar *x, zero = 0.0;
965: PetscFunctionBegin;
966: PetscCall(VecGetLocalSize(v, &n));
967: PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
969: if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) {
970: const PetscInt *diag;
972: PetscCall(MatGetDiagonalMarkers_SeqSELL(A, &diag, NULL));
973: PetscCall(VecGetArrayWrite(v, &x));
974: for (i = 0; i < n; i++) x[i] = 1.0 / a->val[diag[i]];
975: PetscCall(VecRestoreArrayWrite(v, &x));
976: PetscFunctionReturn(PETSC_SUCCESS);
977: }
979: PetscCall(VecSet(v, zero));
980: PetscCall(VecGetArray(v, &x));
981: for (i = 0; i < n; i++) { /* loop over rows */
982: shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
983: x[i] = 0;
984: for (j = 0; j < a->rlen[i]; j++) {
985: if (a->colidx[shift + a->sliceheight * j] == i) {
986: x[i] = a->val[shift + a->sliceheight * j];
987: break;
988: }
989: }
990: }
991: PetscCall(VecRestoreArray(v, &x));
992: PetscFunctionReturn(PETSC_SUCCESS);
993: }
995: PetscErrorCode MatDiagonalScale_SeqSELL(Mat A, Vec ll, Vec rr)
996: {
997: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
998: const PetscScalar *l, *r;
999: PetscInt i, j, m, n, row;
1001: PetscFunctionBegin;
1002: if (ll) {
1003: /* The local size is used so that VecMPI can be passed to this routine
1004: by MatDiagonalScale_MPISELL */
1005: PetscCall(VecGetLocalSize(ll, &m));
1006: PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
1007: PetscCall(VecGetArrayRead(ll, &l));
1008: for (i = 0; i < a->totalslices; i++) { /* loop over slices */
1009: if (i == a->totalslices - 1 && (A->rmap->n % a->sliceheight)) { /* if last slice has padding rows */
1010: for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) {
1011: if (row < (A->rmap->n % a->sliceheight)) a->val[j] *= l[a->sliceheight * i + row];
1012: }
1013: } else {
1014: for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) a->val[j] *= l[a->sliceheight * i + row];
1015: }
1016: }
1017: PetscCall(VecRestoreArrayRead(ll, &l));
1018: PetscCall(PetscLogFlops(a->nz));
1019: }
1020: if (rr) {
1021: PetscCall(VecGetLocalSize(rr, &n));
1022: PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
1023: PetscCall(VecGetArrayRead(rr, &r));
1024: for (i = 0; i < a->totalslices; i++) { /* loop over slices */
1025: if (i == a->totalslices - 1 && (A->rmap->n % a->sliceheight)) { /* if last slice has padding rows */
1026: for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) % a->sliceheight)) {
1027: if (row < (A->rmap->n % a->sliceheight)) a->val[j] *= r[a->colidx[j]];
1028: }
1029: } else {
1030: for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j++) a->val[j] *= r[a->colidx[j]];
1031: }
1032: }
1033: PetscCall(VecRestoreArrayRead(rr, &r));
1034: PetscCall(PetscLogFlops(a->nz));
1035: }
1036: #if defined(PETSC_HAVE_CUPM)
1037: if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU;
1038: #endif
1039: PetscFunctionReturn(PETSC_SUCCESS);
1040: }
1042: PetscErrorCode MatGetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], PetscScalar v[])
1043: {
1044: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1045: PetscInt *cp, i, k, low, high, t, row, col, l;
1046: PetscInt shift;
1047: MatScalar *vp;
1049: PetscFunctionBegin;
1050: for (k = 0; k < m; k++) { /* loop over requested rows */
1051: row = im[k];
1052: if (row < 0) continue;
1053: PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1);
1054: shift = a->sliidx[row / a->sliceheight] + (row % a->sliceheight); /* starting index of the row */
1055: cp = a->colidx + shift; /* pointer to the row */
1056: vp = a->val + shift; /* pointer to the row */
1057: for (l = 0; l < n; l++) { /* loop over requested columns */
1058: col = in[l];
1059: if (col < 0) continue;
1060: PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1);
1061: high = a->rlen[row];
1062: low = 0; /* assume unsorted */
1063: while (high - low > 5) {
1064: t = (low + high) / 2;
1065: if (*(cp + a->sliceheight * t) > col) high = t;
1066: else low = t;
1067: }
1068: for (i = low; i < high; i++) {
1069: if (*(cp + a->sliceheight * i) > col) break;
1070: if (*(cp + a->sliceheight * i) == col) {
1071: *v++ = *(vp + a->sliceheight * i);
1072: goto finished;
1073: }
1074: }
1075: *v++ = 0.0;
1076: finished:;
1077: }
1078: }
1079: PetscFunctionReturn(PETSC_SUCCESS);
1080: }
1082: static PetscErrorCode MatView_SeqSELL_ASCII(Mat A, PetscViewer viewer)
1083: {
1084: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1085: PetscInt i, j, m = A->rmap->n, shift;
1086: const char *name;
1087: PetscViewerFormat format;
1089: PetscFunctionBegin;
1090: PetscCall(PetscViewerGetFormat(viewer, &format));
1091: if (format == PETSC_VIEWER_ASCII_MATLAB) {
1092: PetscInt nofinalvalue = 0;
1093: /*
1094: if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) nofinalvalue = 1;
1095: */
1096: PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1097: PetscCall(PetscViewerASCIIPrintf(viewer, "%% Size = %" PetscInt_FMT " %" PetscInt_FMT " \n", m, A->cmap->n));
1098: PetscCall(PetscViewerASCIIPrintf(viewer, "%% Nonzeros = %" PetscInt_FMT " \n", a->nz));
1099: #if defined(PETSC_USE_COMPLEX)
1100: PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",4);\n", a->nz + nofinalvalue));
1101: #else
1102: PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",3);\n", a->nz + nofinalvalue));
1103: #endif
1104: PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = [\n"));
1106: for (i = 0; i < m; i++) {
1107: shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1108: for (j = 0; j < a->rlen[i]; j++) {
1109: #if defined(PETSC_USE_COMPLEX)
1110: PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %18.16e %18.16e\n", i + 1, a->colidx[shift + a->sliceheight * j] + 1, (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1111: #else
1112: PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %18.16e\n", i + 1, a->colidx[shift + a->sliceheight * j] + 1, (double)a->val[shift + a->sliceheight * j]));
1113: #endif
1114: }
1115: }
1116: /*
1117: if (nofinalvalue) {
1118: #if defined(PETSC_USE_COMPLEX)
1119: PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e %18.16e\n",m,A->cmap->n,0.,0.));
1120: #else
1121: PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e\n",m,A->cmap->n,0.0));
1122: #endif
1123: }
1124: */
1125: PetscCall(PetscObjectGetName((PetscObject)A, &name));
1126: PetscCall(PetscViewerASCIIPrintf(viewer, "];\n %s = spconvert(zzz);\n", name));
1127: PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1128: } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) {
1129: PetscFunctionReturn(PETSC_SUCCESS);
1130: } else if (format == PETSC_VIEWER_ASCII_COMMON) {
1131: PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1132: for (i = 0; i < m; i++) {
1133: PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1134: shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1135: for (j = 0; j < a->rlen[i]; j++) {
1136: #if defined(PETSC_USE_COMPLEX)
1137: if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) > 0.0 && PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) {
1138: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1139: } else if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) < 0.0 && PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) {
1140: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)-PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1141: } else if (PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) {
1142: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j])));
1143: }
1144: #else
1145: if (a->val[shift + a->sliceheight * j] != 0.0) PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)a->val[shift + a->sliceheight * j]));
1146: #endif
1147: }
1148: PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1149: }
1150: PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1151: } else if (format == PETSC_VIEWER_ASCII_DENSE) {
1152: PetscInt cnt = 0, jcnt;
1153: PetscScalar value;
1154: #if defined(PETSC_USE_COMPLEX)
1155: PetscBool realonly = PETSC_TRUE;
1156: for (i = 0; i < a->sliidx[a->totalslices]; i++) {
1157: if (PetscImaginaryPart(a->val[i]) != 0.0) {
1158: realonly = PETSC_FALSE;
1159: break;
1160: }
1161: }
1162: #endif
1164: PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1165: for (i = 0; i < m; i++) {
1166: jcnt = 0;
1167: shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1168: for (j = 0; j < A->cmap->n; j++) {
1169: if (jcnt < a->rlen[i] && j == a->colidx[shift + a->sliceheight * j]) {
1170: value = a->val[cnt++];
1171: jcnt++;
1172: } else {
1173: value = 0.0;
1174: }
1175: #if defined(PETSC_USE_COMPLEX)
1176: if (realonly) {
1177: PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)PetscRealPart(value)));
1178: } else {
1179: PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e+%7.5e i ", (double)PetscRealPart(value), (double)PetscImaginaryPart(value)));
1180: }
1181: #else
1182: PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)value));
1183: #endif
1184: }
1185: PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1186: }
1187: PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1188: } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) {
1189: PetscInt fshift = 1;
1190: PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1191: #if defined(PETSC_USE_COMPLEX)
1192: PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate complex general\n"));
1193: #else
1194: PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate real general\n"));
1195: #endif
1196: PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT "\n", m, A->cmap->n, a->nz));
1197: for (i = 0; i < m; i++) {
1198: shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1199: for (j = 0; j < a->rlen[i]; j++) {
1200: #if defined(PETSC_USE_COMPLEX)
1201: PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g %g\n", i + fshift, a->colidx[shift + a->sliceheight * j] + fshift, (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1202: #else
1203: PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g\n", i + fshift, a->colidx[shift + a->sliceheight * j] + fshift, (double)a->val[shift + a->sliceheight * j]));
1204: #endif
1205: }
1206: }
1207: PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1208: } else if (format == PETSC_VIEWER_NATIVE) {
1209: for (i = 0; i < a->totalslices; i++) { /* loop over slices */
1210: PetscInt row;
1211: PetscCall(PetscViewerASCIIPrintf(viewer, "slice %" PetscInt_FMT ": %" PetscInt_FMT " %" PetscInt_FMT "\n", i, a->sliidx[i], a->sliidx[i + 1]));
1212: for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) {
1213: #if defined(PETSC_USE_COMPLEX)
1214: if (PetscImaginaryPart(a->val[j]) > 0.0) {
1215: PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g + %g i\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1216: } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1217: PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g - %g i\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), -(double)PetscImaginaryPart(a->val[j])));
1218: } else {
1219: PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j])));
1220: }
1221: #else
1222: PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g\n", a->sliceheight * i + row, a->colidx[j], (double)a->val[j]));
1223: #endif
1224: }
1225: }
1226: } else {
1227: PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1228: if (A->factortype) {
1229: for (i = 0; i < m; i++) {
1230: shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1231: PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1232: /* L part */
1233: for (j = shift; j < a->diag[i]; j += a->sliceheight) {
1234: #if defined(PETSC_USE_COMPLEX)
1235: if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) > 0.0) {
1236: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1237: } else if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) < 0.0) {
1238: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j]))));
1239: } else {
1240: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j])));
1241: }
1242: #else
1243: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j]));
1244: #endif
1245: }
1246: /* diagonal */
1247: j = a->diag[i];
1248: #if defined(PETSC_USE_COMPLEX)
1249: if (PetscImaginaryPart(a->val[j]) > 0.0) {
1250: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)PetscImaginaryPart(1.0 / a->val[j])));
1251: } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1252: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)(-PetscImaginaryPart(1.0 / a->val[j]))));
1253: } else {
1254: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j])));
1255: }
1256: #else
1257: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)(1 / a->val[j])));
1258: #endif
1260: /* U part */
1261: for (j = a->diag[i] + 1; j < shift + a->sliceheight * a->rlen[i]; j += a->sliceheight) {
1262: #if defined(PETSC_USE_COMPLEX)
1263: if (PetscImaginaryPart(a->val[j]) > 0.0) {
1264: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1265: } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1266: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j]))));
1267: } else {
1268: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j])));
1269: }
1270: #else
1271: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j]));
1272: #endif
1273: }
1274: PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1275: }
1276: } else {
1277: for (i = 0; i < m; i++) {
1278: shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1279: PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1280: for (j = 0; j < a->rlen[i]; j++) {
1281: #if defined(PETSC_USE_COMPLEX)
1282: if (PetscImaginaryPart(a->val[j]) > 0.0) {
1283: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1284: } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1285: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)-PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1286: } else {
1287: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j])));
1288: }
1289: #else
1290: PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)a->val[shift + a->sliceheight * j]));
1291: #endif
1292: }
1293: PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1294: }
1295: }
1296: PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1297: }
1298: PetscCall(PetscViewerFlush(viewer));
1299: PetscFunctionReturn(PETSC_SUCCESS);
1300: }
1302: #include <petscdraw.h>
1303: static PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw, void *Aa)
1304: {
1305: Mat A = (Mat)Aa;
1306: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1307: PetscInt i, j, m = A->rmap->n, shift;
1308: int color;
1309: PetscReal xl, yl, xr, yr, x_l, x_r, y_l, y_r;
1310: PetscViewer viewer;
1311: PetscViewerFormat format;
1313: PetscFunctionBegin;
1314: PetscCall(PetscObjectQuery((PetscObject)A, "Zoomviewer", (PetscObject *)&viewer));
1315: PetscCall(PetscViewerGetFormat(viewer, &format));
1316: PetscCall(PetscDrawGetCoordinates(draw, &xl, &yl, &xr, &yr));
1318: /* loop over matrix elements drawing boxes */
1320: if (format != PETSC_VIEWER_DRAW_CONTOUR) {
1321: PetscDrawCollectiveBegin(draw);
1322: /* Blue for negative, Cyan for zero and Red for positive */
1323: color = PETSC_DRAW_BLUE;
1324: for (i = 0; i < m; i++) {
1325: shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1326: y_l = m - i - 1.0;
1327: y_r = y_l + 1.0;
1328: for (j = 0; j < a->rlen[i]; j++) {
1329: x_l = a->colidx[shift + a->sliceheight * j];
1330: x_r = x_l + 1.0;
1331: if (PetscRealPart(a->val[shift + a->sliceheight * j]) >= 0.) continue;
1332: PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1333: }
1334: }
1335: color = PETSC_DRAW_CYAN;
1336: for (i = 0; i < m; i++) {
1337: shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1338: y_l = m - i - 1.0;
1339: y_r = y_l + 1.0;
1340: for (j = 0; j < a->rlen[i]; j++) {
1341: x_l = a->colidx[shift + a->sliceheight * j];
1342: x_r = x_l + 1.0;
1343: if (a->val[shift + a->sliceheight * j] != 0.) continue;
1344: PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1345: }
1346: }
1347: color = PETSC_DRAW_RED;
1348: for (i = 0; i < m; i++) {
1349: shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1350: y_l = m - i - 1.0;
1351: y_r = y_l + 1.0;
1352: for (j = 0; j < a->rlen[i]; j++) {
1353: x_l = a->colidx[shift + a->sliceheight * j];
1354: x_r = x_l + 1.0;
1355: if (PetscRealPart(a->val[shift + a->sliceheight * j]) <= 0.) continue;
1356: PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1357: }
1358: }
1359: PetscDrawCollectiveEnd(draw);
1360: } else {
1361: /* use contour shading to indicate magnitude of values */
1362: /* first determine max of all nonzero values */
1363: PetscReal minv = 0.0, maxv = 0.0;
1364: PetscInt count = 0;
1365: PetscDraw popup;
1366: for (i = 0; i < a->sliidx[a->totalslices]; i++) {
1367: if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]);
1368: }
1369: if (minv >= maxv) maxv = minv + PETSC_SMALL;
1370: PetscCall(PetscDrawGetPopup(draw, &popup));
1371: PetscCall(PetscDrawScalePopup(popup, minv, maxv));
1373: PetscDrawCollectiveBegin(draw);
1374: for (i = 0; i < m; i++) {
1375: shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1376: y_l = m - i - 1.0;
1377: y_r = y_l + 1.0;
1378: for (j = 0; j < a->rlen[i]; j++) {
1379: x_l = a->colidx[shift + a->sliceheight * j];
1380: x_r = x_l + 1.0;
1381: color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]), minv, maxv);
1382: PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1383: count++;
1384: }
1385: }
1386: PetscDrawCollectiveEnd(draw);
1387: }
1388: PetscFunctionReturn(PETSC_SUCCESS);
1389: }
1391: #include <petscdraw.h>
1392: static PetscErrorCode MatView_SeqSELL_Draw(Mat A, PetscViewer viewer)
1393: {
1394: PetscDraw draw;
1395: PetscReal xr, yr, xl, yl, h, w;
1396: PetscBool isnull;
1398: PetscFunctionBegin;
1399: PetscCall(PetscViewerDrawGetDraw(viewer, 0, &draw));
1400: PetscCall(PetscDrawIsNull(draw, &isnull));
1401: if (isnull) PetscFunctionReturn(PETSC_SUCCESS);
1403: xr = A->cmap->n;
1404: yr = A->rmap->n;
1405: h = yr / 10.0;
1406: w = xr / 10.0;
1407: xr += w;
1408: yr += h;
1409: xl = -w;
1410: yl = -h;
1411: PetscCall(PetscDrawSetCoordinates(draw, xl, yl, xr, yr));
1412: PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", (PetscObject)viewer));
1413: PetscCall(PetscDrawZoom(draw, MatView_SeqSELL_Draw_Zoom, A));
1414: PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", NULL));
1415: PetscCall(PetscDrawSave(draw));
1416: PetscFunctionReturn(PETSC_SUCCESS);
1417: }
1419: PetscErrorCode MatView_SeqSELL(Mat A, PetscViewer viewer)
1420: {
1421: PetscBool isascii, isbinary, isdraw;
1423: PetscFunctionBegin;
1424: PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &isascii));
1425: PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary));
1426: PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw));
1427: if (isascii) {
1428: PetscCall(MatView_SeqSELL_ASCII(A, viewer));
1429: } else if (isbinary) {
1430: /* PetscCall(MatView_SeqSELL_Binary(A,viewer)); */
1431: } else if (isdraw) PetscCall(MatView_SeqSELL_Draw(A, viewer));
1432: PetscFunctionReturn(PETSC_SUCCESS);
1433: }
1435: PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A, MatAssemblyType mode)
1436: {
1437: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1438: PetscInt i, shift, row_in_slice, row, nrow, *cp, lastcol, j, k;
1439: MatScalar *vp;
1440: #if defined(PETSC_HAVE_CUPM)
1441: PetscInt totalchunks = 0;
1442: #endif
1444: PetscFunctionBegin;
1445: if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
1446: /* To do: compress out the unused elements */
1447: PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: %" PetscInt_FMT " allocated %" PetscInt_FMT " used (%" PetscInt_FMT " nonzeros+%" PetscInt_FMT " paddedzeros)\n", A->rmap->n, A->cmap->n, a->maxallocmat, a->sliidx[a->totalslices], a->nz, a->sliidx[a->totalslices] - a->nz));
1448: PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is %" PetscInt_FMT "\n", a->reallocs));
1449: PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rlenmax));
1450: a->nonzerorowcnt = 0;
1451: /* Set unused slots for column indices to last valid column index. Set unused slots for values to zero. This allows for a use of unmasked intrinsics -> higher performance */
1452: for (i = 0; i < a->totalslices; ++i) {
1453: shift = a->sliidx[i]; /* starting index of the slice */
1454: cp = PetscSafePointerPlusOffset(a->colidx, shift); /* pointer to the column indices of the slice */
1455: vp = PetscSafePointerPlusOffset(a->val, shift); /* pointer to the nonzero values of the slice */
1456: for (row_in_slice = 0; row_in_slice < a->sliceheight; ++row_in_slice) { /* loop over rows in the slice */
1457: row = a->sliceheight * i + row_in_slice;
1458: nrow = a->rlen[row]; /* number of nonzeros in row */
1459: /*
1460: Search for the nearest nonzero. Normally setting the index to zero may cause extra communication.
1461: But if the entire slice are empty, it is fine to use 0 since the index will not be loaded.
1462: */
1463: lastcol = 0;
1464: if (nrow > 0) { /* nonempty row */
1465: a->nonzerorowcnt++;
1466: lastcol = cp[a->sliceheight * (nrow - 1) + row_in_slice]; /* use the index from the last nonzero at current row */
1467: } else if (!row_in_slice) { /* first row of the correct slice is empty */
1468: for (j = 1; j < a->sliceheight; j++) {
1469: if (a->rlen[a->sliceheight * i + j]) {
1470: lastcol = cp[j];
1471: break;
1472: }
1473: }
1474: } else {
1475: if (a->sliidx[i + 1] != shift) lastcol = cp[row_in_slice - 1]; /* use the index from the previous row */
1476: }
1478: for (k = nrow; k < (a->sliidx[i + 1] - shift) / a->sliceheight; ++k) {
1479: cp[a->sliceheight * k + row_in_slice] = lastcol;
1480: vp[a->sliceheight * k + row_in_slice] = (MatScalar)0;
1481: }
1482: }
1483: }
1485: A->info.mallocs += a->reallocs;
1486: a->reallocs = 0;
1488: #if defined(PETSC_HAVE_CUPM)
1489: if (!a->chunksize && a->totalslices) {
1490: a->chunksize = 64;
1491: while (a->chunksize < 1024 && 2 * a->chunksize <= a->sliidx[a->totalslices] / a->totalslices) a->chunksize *= 2;
1492: totalchunks = 1 + (a->sliidx[a->totalslices] - 1) / a->chunksize;
1493: }
1494: if (totalchunks != a->totalchunks) {
1495: PetscCall(PetscFree(a->chunk_slice_map));
1496: PetscCall(PetscMalloc1(totalchunks, &a->chunk_slice_map));
1497: a->totalchunks = totalchunks;
1498: }
1499: j = 0;
1500: for (i = 0; i < totalchunks; i++) {
1501: while (a->sliidx[j + 1] <= i * a->chunksize && j < a->totalslices) j++;
1502: a->chunk_slice_map[i] = j;
1503: }
1504: #endif
1505: PetscFunctionReturn(PETSC_SUCCESS);
1506: }
1508: PetscErrorCode MatGetInfo_SeqSELL(Mat A, MatInfoType flag, MatInfo *info)
1509: {
1510: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1512: PetscFunctionBegin;
1513: info->block_size = 1.0;
1514: info->nz_allocated = a->maxallocmat;
1515: info->nz_used = a->sliidx[a->totalslices]; /* include padding zeros */
1516: info->nz_unneeded = (a->maxallocmat - a->sliidx[a->totalslices]);
1517: info->assemblies = A->num_ass;
1518: info->mallocs = A->info.mallocs;
1519: info->memory = 0; /* REVIEW ME */
1520: if (A->factortype) {
1521: info->fill_ratio_given = A->info.fill_ratio_given;
1522: info->fill_ratio_needed = A->info.fill_ratio_needed;
1523: info->factor_mallocs = A->info.factor_mallocs;
1524: } else {
1525: info->fill_ratio_given = 0;
1526: info->fill_ratio_needed = 0;
1527: info->factor_mallocs = 0;
1528: }
1529: PetscFunctionReturn(PETSC_SUCCESS);
1530: }
1532: PetscErrorCode MatSetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], const PetscScalar v[], InsertMode is)
1533: {
1534: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1535: PetscInt shift, i, k, l, low, high, t, ii, row, col, nrow;
1536: PetscInt *cp, nonew = a->nonew, lastcol = -1;
1537: MatScalar *vp, value;
1538: #if defined(PETSC_HAVE_CUPM)
1539: PetscBool inserted = PETSC_FALSE;
1540: PetscInt mul = DEVICE_MEM_ALIGN / a->sliceheight;
1541: #endif
1543: PetscFunctionBegin;
1544: for (k = 0; k < m; k++) { /* loop over added rows */
1545: row = im[k];
1546: if (row < 0) continue;
1547: PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1);
1548: shift = a->sliidx[row / a->sliceheight] + row % a->sliceheight; /* starting index of the row */
1549: cp = a->colidx + shift; /* pointer to the row */
1550: vp = a->val + shift; /* pointer to the row */
1551: nrow = a->rlen[row];
1552: low = 0;
1553: high = nrow;
1555: for (l = 0; l < n; l++) { /* loop over added columns */
1556: col = in[l];
1557: if (col < 0) continue;
1558: PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Col too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1);
1559: if (a->roworiented) {
1560: value = v[l + k * n];
1561: } else {
1562: value = v[k + l * m];
1563: }
1564: if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue;
1566: /* search in this row for the specified column, i indicates the column to be set */
1567: if (col <= lastcol) low = 0;
1568: else high = nrow;
1569: lastcol = col;
1570: while (high - low > 5) {
1571: t = (low + high) / 2;
1572: if (*(cp + a->sliceheight * t) > col) high = t;
1573: else low = t;
1574: }
1575: for (i = low; i < high; i++) {
1576: if (*(cp + a->sliceheight * i) > col) break;
1577: if (*(cp + a->sliceheight * i) == col) {
1578: if (is == ADD_VALUES) *(vp + a->sliceheight * i) += value;
1579: else *(vp + a->sliceheight * i) = value;
1580: #if defined(PETSC_HAVE_CUPM)
1581: inserted = PETSC_TRUE;
1582: #endif
1583: low = i + 1;
1584: goto noinsert;
1585: }
1586: }
1587: if (value == 0.0 && a->ignorezeroentries) goto noinsert;
1588: if (nonew == 1) goto noinsert;
1589: PetscCheck(nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero (%" PetscInt_FMT ", %" PetscInt_FMT ") in the matrix", row, col);
1590: #if defined(PETSC_HAVE_CUPM)
1591: MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, a->sliceheight, row / a->sliceheight, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar, mul);
1592: #else
1593: /* If the current row length exceeds the slice width (e.g. nrow==slice_width), allocate a new space, otherwise do nothing */
1594: MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, a->sliceheight, row / a->sliceheight, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar, 1);
1595: #endif
1596: /* add the new nonzero to the high position, shift the remaining elements in current row to the right by one slot */
1597: for (ii = nrow - 1; ii >= i; ii--) {
1598: *(cp + a->sliceheight * (ii + 1)) = *(cp + a->sliceheight * ii);
1599: *(vp + a->sliceheight * (ii + 1)) = *(vp + a->sliceheight * ii);
1600: }
1601: a->rlen[row]++;
1602: *(cp + a->sliceheight * i) = col;
1603: *(vp + a->sliceheight * i) = value;
1604: a->nz++;
1605: #if defined(PETSC_HAVE_CUPM)
1606: inserted = PETSC_TRUE;
1607: #endif
1608: low = i + 1;
1609: high++;
1610: nrow++;
1611: noinsert:;
1612: }
1613: a->rlen[row] = nrow;
1614: }
1615: #if defined(PETSC_HAVE_CUPM)
1616: if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED && inserted) A->offloadmask = PETSC_OFFLOAD_CPU;
1617: #endif
1618: PetscFunctionReturn(PETSC_SUCCESS);
1619: }
1621: PetscErrorCode MatCopy_SeqSELL(Mat A, Mat B, MatStructure str)
1622: {
1623: PetscFunctionBegin;
1624: /* If the two matrices have the same copy implementation, use fast copy. */
1625: if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) {
1626: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1627: Mat_SeqSELL *b = (Mat_SeqSELL *)B->data;
1629: PetscCheck(a->sliidx[a->totalslices] == b->sliidx[b->totalslices], PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Number of nonzeros in two matrices are different");
1630: PetscCall(PetscArraycpy(b->val, a->val, a->sliidx[a->totalslices]));
1631: } else {
1632: PetscCall(MatCopy_Basic(A, B, str));
1633: }
1634: PetscFunctionReturn(PETSC_SUCCESS);
1635: }
1637: PetscErrorCode MatSetUp_SeqSELL(Mat A)
1638: {
1639: PetscFunctionBegin;
1640: PetscCall(MatSeqSELLSetPreallocation(A, PETSC_DEFAULT, NULL));
1641: PetscFunctionReturn(PETSC_SUCCESS);
1642: }
1644: PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A, PetscScalar *array[])
1645: {
1646: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1648: PetscFunctionBegin;
1649: *array = a->val;
1650: PetscFunctionReturn(PETSC_SUCCESS);
1651: }
1653: PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A, PetscScalar *array[])
1654: {
1655: PetscFunctionBegin;
1656: PetscFunctionReturn(PETSC_SUCCESS);
1657: }
1659: PetscErrorCode MatScale_SeqSELL(Mat inA, PetscScalar alpha)
1660: {
1661: Mat_SeqSELL *a = (Mat_SeqSELL *)inA->data;
1662: MatScalar *aval = a->val;
1663: PetscScalar oalpha = alpha;
1664: PetscBLASInt one = 1, size;
1666: PetscFunctionBegin;
1667: PetscCall(PetscBLASIntCast(a->sliidx[a->totalslices], &size));
1668: PetscCallBLAS("BLASscal", BLASscal_(&size, &oalpha, aval, &one));
1669: PetscCall(PetscLogFlops(a->nz));
1670: #if defined(PETSC_HAVE_CUPM)
1671: if (inA->offloadmask != PETSC_OFFLOAD_UNALLOCATED) inA->offloadmask = PETSC_OFFLOAD_CPU;
1672: #endif
1673: PetscFunctionReturn(PETSC_SUCCESS);
1674: }
1676: PetscErrorCode MatShift_SeqSELL(Mat Y, PetscScalar a)
1677: {
1678: Mat_SeqSELL *y = (Mat_SeqSELL *)Y->data;
1680: PetscFunctionBegin;
1681: if (!Y->preallocated || !y->nz) PetscCall(MatSeqSELLSetPreallocation(Y, 1, NULL));
1682: PetscCall(MatShift_Basic(Y, a));
1683: PetscFunctionReturn(PETSC_SUCCESS);
1684: }
1686: PetscErrorCode MatSOR_SeqSELL(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx)
1687: {
1688: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1689: PetscScalar *x, sum, *t;
1690: const MatScalar *idiag = NULL, *mdiag;
1691: const PetscScalar *b, *xb;
1692: PetscInt n, m = A->rmap->n, i, j, shift;
1693: const PetscInt *diag;
1695: PetscFunctionBegin;
1696: its = its * lits;
1698: PetscCall(MatInvertDiagonalForSOR_SeqSELL(A, omega, fshift));
1699: diag = a->diag;
1700: t = a->ssor_work;
1701: idiag = a->idiag;
1702: mdiag = a->mdiag;
1704: PetscCall(VecGetArray(xx, &x));
1705: PetscCall(VecGetArrayRead(bb, &b));
1706: /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
1707: PetscCheck(flag != SOR_APPLY_UPPER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_UPPER is not implemented");
1708: PetscCheck(flag != SOR_APPLY_LOWER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_LOWER is not implemented");
1709: PetscCheck(!(flag & SOR_EISENSTAT), PETSC_COMM_SELF, PETSC_ERR_SUP, "No support yet for Eisenstat");
1711: if (flag & SOR_ZERO_INITIAL_GUESS) {
1712: if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1713: for (i = 0; i < m; i++) {
1714: shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1715: sum = b[i];
1716: n = (diag[i] - shift) / a->sliceheight;
1717: for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]];
1718: t[i] = sum;
1719: x[i] = sum * idiag[i];
1720: }
1721: xb = t;
1722: PetscCall(PetscLogFlops(a->nz));
1723: } else xb = b;
1724: if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1725: for (i = m - 1; i >= 0; i--) {
1726: shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1727: sum = xb[i];
1728: n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1;
1729: for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]];
1730: if (xb == b) {
1731: x[i] = sum * idiag[i];
1732: } else {
1733: x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1734: }
1735: }
1736: PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */
1737: }
1738: its--;
1739: }
1740: while (its--) {
1741: if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1742: for (i = 0; i < m; i++) {
1743: /* lower */
1744: shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1745: sum = b[i];
1746: n = (diag[i] - shift) / a->sliceheight;
1747: for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]];
1748: t[i] = sum; /* save application of the lower-triangular part */
1749: /* upper */
1750: n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1;
1751: for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]];
1752: x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1753: }
1754: xb = t;
1755: PetscCall(PetscLogFlops(2.0 * a->nz));
1756: } else xb = b;
1757: if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1758: for (i = m - 1; i >= 0; i--) {
1759: shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1760: sum = xb[i];
1761: if (xb == b) {
1762: /* whole matrix (no checkpointing available) */
1763: n = a->rlen[i];
1764: for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]];
1765: x[i] = (1. - omega) * x[i] + (sum + mdiag[i] * x[i]) * idiag[i];
1766: } else { /* lower-triangular part has been saved, so only apply upper-triangular */
1767: n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1;
1768: for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]];
1769: x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1770: }
1771: }
1772: if (xb == b) PetscCall(PetscLogFlops(2.0 * a->nz));
1773: else PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */
1774: }
1775: }
1776: PetscCall(VecRestoreArray(xx, &x));
1777: PetscCall(VecRestoreArrayRead(bb, &b));
1778: PetscFunctionReturn(PETSC_SUCCESS);
1779: }
1781: static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL,
1782: MatGetRow_SeqSELL,
1783: MatRestoreRow_SeqSELL,
1784: MatMult_SeqSELL,
1785: /* 4*/ MatMultAdd_SeqSELL,
1786: MatMultTranspose_SeqSELL,
1787: MatMultTransposeAdd_SeqSELL,
1788: NULL,
1789: NULL,
1790: NULL,
1791: /* 10*/ NULL,
1792: NULL,
1793: NULL,
1794: MatSOR_SeqSELL,
1795: NULL,
1796: /* 15*/ MatGetInfo_SeqSELL,
1797: MatEqual_SeqSELL,
1798: MatGetDiagonal_SeqSELL,
1799: MatDiagonalScale_SeqSELL,
1800: NULL,
1801: /* 20*/ NULL,
1802: MatAssemblyEnd_SeqSELL,
1803: MatSetOption_SeqSELL,
1804: MatZeroEntries_SeqSELL,
1805: /* 24*/ NULL,
1806: NULL,
1807: NULL,
1808: NULL,
1809: NULL,
1810: /* 29*/ MatSetUp_SeqSELL,
1811: NULL,
1812: NULL,
1813: NULL,
1814: NULL,
1815: /* 34*/ MatDuplicate_SeqSELL,
1816: NULL,
1817: NULL,
1818: NULL,
1819: NULL,
1820: /* 39*/ NULL,
1821: NULL,
1822: NULL,
1823: MatGetValues_SeqSELL,
1824: MatCopy_SeqSELL,
1825: /* 44*/ NULL,
1826: MatScale_SeqSELL,
1827: MatShift_SeqSELL,
1828: NULL,
1829: NULL,
1830: /* 49*/ NULL,
1831: NULL,
1832: NULL,
1833: NULL,
1834: NULL,
1835: /* 54*/ MatFDColoringCreate_SeqXAIJ,
1836: NULL,
1837: NULL,
1838: NULL,
1839: NULL,
1840: /* 59*/ NULL,
1841: MatDestroy_SeqSELL,
1842: MatView_SeqSELL,
1843: NULL,
1844: NULL,
1845: /* 64*/ NULL,
1846: NULL,
1847: NULL,
1848: NULL,
1849: NULL,
1850: /* 69*/ NULL,
1851: NULL,
1852: NULL,
1853: MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */
1854: NULL,
1855: /* 74*/ NULL,
1856: NULL,
1857: NULL,
1858: NULL,
1859: NULL,
1860: /* 79*/ NULL,
1861: NULL,
1862: NULL,
1863: NULL,
1864: NULL,
1865: /* 84*/ NULL,
1866: NULL,
1867: NULL,
1868: NULL,
1869: NULL,
1870: /* 89*/ NULL,
1871: NULL,
1872: NULL,
1873: NULL,
1874: MatConjugate_SeqSELL,
1875: /* 94*/ NULL,
1876: NULL,
1877: NULL,
1878: NULL,
1879: NULL,
1880: /* 99*/ NULL,
1881: NULL,
1882: NULL,
1883: NULL,
1884: NULL,
1885: /*104*/ NULL,
1886: NULL,
1887: NULL,
1888: NULL,
1889: NULL,
1890: /*109*/ NULL,
1891: NULL,
1892: NULL,
1893: NULL,
1894: NULL,
1895: /*114*/ NULL,
1896: NULL,
1897: NULL,
1898: NULL,
1899: NULL,
1900: /*119*/ NULL,
1901: NULL,
1902: NULL,
1903: NULL,
1904: NULL,
1905: /*124*/ NULL,
1906: NULL,
1907: NULL,
1908: MatFDColoringSetUp_SeqXAIJ,
1909: NULL,
1910: /*129*/ NULL,
1911: NULL,
1912: NULL,
1913: NULL,
1914: NULL,
1915: /*134*/ NULL,
1916: NULL,
1917: NULL,
1918: NULL,
1919: NULL,
1920: /*139*/ NULL,
1921: NULL,
1922: NULL,
1923: NULL,
1924: MatADot_Default,
1925: /*144*/ MatANorm_Default,
1926: NULL,
1927: NULL};
1929: static PetscErrorCode MatStoreValues_SeqSELL(Mat mat)
1930: {
1931: Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1933: PetscFunctionBegin;
1934: PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1936: /* allocate space for values if not already there */
1937: if (!a->saved_values) PetscCall(PetscMalloc1(a->sliidx[a->totalslices] + 1, &a->saved_values));
1939: /* copy values over */
1940: PetscCall(PetscArraycpy(a->saved_values, a->val, a->sliidx[a->totalslices]));
1941: PetscFunctionReturn(PETSC_SUCCESS);
1942: }
1944: static PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat)
1945: {
1946: Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1948: PetscFunctionBegin;
1949: PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1950: PetscCheck(a->saved_values, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatStoreValues(A);first");
1951: PetscCall(PetscArraycpy(a->val, a->saved_values, a->sliidx[a->totalslices]));
1952: PetscFunctionReturn(PETSC_SUCCESS);
1953: }
1955: static PetscErrorCode MatSeqSELLGetFillRatio_SeqSELL(Mat mat, PetscReal *ratio)
1956: {
1957: Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1959: PetscFunctionBegin;
1960: if (a->totalslices && a->sliidx[a->totalslices]) {
1961: *ratio = (PetscReal)(a->sliidx[a->totalslices] - a->nz) / a->sliidx[a->totalslices];
1962: } else {
1963: *ratio = 0.0;
1964: }
1965: PetscFunctionReturn(PETSC_SUCCESS);
1966: }
1968: static PetscErrorCode MatSeqSELLGetMaxSliceWidth_SeqSELL(Mat mat, PetscInt *slicewidth)
1969: {
1970: Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1971: PetscInt i, current_slicewidth;
1973: PetscFunctionBegin;
1974: *slicewidth = 0;
1975: for (i = 0; i < a->totalslices; i++) {
1976: current_slicewidth = (a->sliidx[i + 1] - a->sliidx[i]) / a->sliceheight;
1977: if (current_slicewidth > *slicewidth) *slicewidth = current_slicewidth;
1978: }
1979: PetscFunctionReturn(PETSC_SUCCESS);
1980: }
1982: static PetscErrorCode MatSeqSELLGetAvgSliceWidth_SeqSELL(Mat mat, PetscReal *slicewidth)
1983: {
1984: Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1986: PetscFunctionBegin;
1987: *slicewidth = 0;
1988: if (a->totalslices) *slicewidth = (PetscReal)a->sliidx[a->totalslices] / a->sliceheight / a->totalslices;
1989: PetscFunctionReturn(PETSC_SUCCESS);
1990: }
1992: static PetscErrorCode MatSeqSELLGetVarSliceSize_SeqSELL(Mat mat, PetscReal *variance)
1993: {
1994: Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1995: PetscReal mean;
1996: PetscInt i, totalslices = a->totalslices, *sliidx = a->sliidx;
1998: PetscFunctionBegin;
1999: *variance = 0;
2000: if (totalslices) {
2001: mean = (PetscReal)sliidx[totalslices] / totalslices;
2002: for (i = 1; i <= totalslices; i++) *variance += ((PetscReal)(sliidx[i] - sliidx[i - 1]) - mean) * ((PetscReal)(sliidx[i] - sliidx[i - 1]) - mean) / totalslices;
2003: }
2004: PetscFunctionReturn(PETSC_SUCCESS);
2005: }
2007: static PetscErrorCode MatSeqSELLSetSliceHeight_SeqSELL(Mat A, PetscInt sliceheight)
2008: {
2009: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
2011: PetscFunctionBegin;
2012: if (A->preallocated) PetscFunctionReturn(PETSC_SUCCESS);
2013: PetscCheck(a->sliceheight <= 0 || a->sliceheight == sliceheight, PETSC_COMM_SELF, PETSC_ERR_SUP, "Cannot change slice height %" PetscInt_FMT " to %" PetscInt_FMT, a->sliceheight, sliceheight);
2014: a->sliceheight = sliceheight;
2015: #if defined(PETSC_HAVE_CUPM)
2016: PetscCheck(PetscMax(DEVICE_MEM_ALIGN, sliceheight) % PetscMin(DEVICE_MEM_ALIGN, sliceheight) == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The slice height is not compatible with DEVICE_MEM_ALIGN (one must be divisible by the other) %" PetscInt_FMT, sliceheight);
2017: #endif
2018: PetscFunctionReturn(PETSC_SUCCESS);
2019: }
2021: /*@
2022: MatSeqSELLGetFillRatio - returns a ratio that indicates the irregularity of the matrix.
2024: Not Collective
2026: Input Parameter:
2027: . A - a MATSEQSELL matrix
2029: Output Parameter:
2030: . ratio - ratio of number of padded zeros to number of allocated elements
2032: Level: intermediate
2034: .seealso: `MATSEQSELL`, `MatSeqSELLGetAvgSliceWidth()`
2035: @*/
2036: PetscErrorCode MatSeqSELLGetFillRatio(Mat A, PetscReal *ratio)
2037: {
2038: PetscFunctionBegin;
2039: PetscUseMethod(A, "MatSeqSELLGetFillRatio_C", (Mat, PetscReal *), (A, ratio));
2040: PetscFunctionReturn(PETSC_SUCCESS);
2041: }
2043: /*@
2044: MatSeqSELLGetMaxSliceWidth - returns the maximum slice width.
2046: Not Collective
2048: Input Parameter:
2049: . A - a MATSEQSELL matrix
2051: Output Parameter:
2052: . slicewidth - maximum slice width
2054: Level: intermediate
2056: .seealso: `MATSEQSELL`, `MatSeqSELLGetAvgSliceWidth()`
2057: @*/
2058: PetscErrorCode MatSeqSELLGetMaxSliceWidth(Mat A, PetscInt *slicewidth)
2059: {
2060: PetscFunctionBegin;
2061: PetscUseMethod(A, "MatSeqSELLGetMaxSliceWidth_C", (Mat, PetscInt *), (A, slicewidth));
2062: PetscFunctionReturn(PETSC_SUCCESS);
2063: }
2065: /*@
2066: MatSeqSELLGetAvgSliceWidth - returns the average slice width.
2068: Not Collective
2070: Input Parameter:
2071: . A - a MATSEQSELL matrix
2073: Output Parameter:
2074: . slicewidth - average slice width
2076: Level: intermediate
2078: .seealso: `MATSEQSELL`, `MatSeqSELLGetMaxSliceWidth()`
2079: @*/
2080: PetscErrorCode MatSeqSELLGetAvgSliceWidth(Mat A, PetscReal *slicewidth)
2081: {
2082: PetscFunctionBegin;
2083: PetscUseMethod(A, "MatSeqSELLGetAvgSliceWidth_C", (Mat, PetscReal *), (A, slicewidth));
2084: PetscFunctionReturn(PETSC_SUCCESS);
2085: }
2087: /*@
2088: MatSeqSELLSetSliceHeight - sets the slice height.
2090: Not Collective
2092: Input Parameters:
2093: + A - a MATSEQSELL matrix
2094: - sliceheight - slice height
2096: Notes:
2097: You cannot change the slice height once it have been set.
2099: The slice height must be set before MatSetUp() or MatXXXSetPreallocation() is called.
2101: Level: intermediate
2103: .seealso: `MATSEQSELL`, `MatSeqSELLGetVarSliceSize()`
2104: @*/
2105: PetscErrorCode MatSeqSELLSetSliceHeight(Mat A, PetscInt sliceheight)
2106: {
2107: PetscFunctionBegin;
2108: PetscUseMethod(A, "MatSeqSELLSetSliceHeight_C", (Mat, PetscInt), (A, sliceheight));
2109: PetscFunctionReturn(PETSC_SUCCESS);
2110: }
2112: /*@
2113: MatSeqSELLGetVarSliceSize - returns the variance of the slice size.
2115: Not Collective
2117: Input Parameter:
2118: . A - a MATSEQSELL matrix
2120: Output Parameter:
2121: . variance - variance of the slice size
2123: Level: intermediate
2125: .seealso: `MATSEQSELL`, `MatSeqSELLSetSliceHeight()`
2126: @*/
2127: PetscErrorCode MatSeqSELLGetVarSliceSize(Mat A, PetscReal *variance)
2128: {
2129: PetscFunctionBegin;
2130: PetscUseMethod(A, "MatSeqSELLGetVarSliceSize_C", (Mat, PetscReal *), (A, variance));
2131: PetscFunctionReturn(PETSC_SUCCESS);
2132: }
2134: #if defined(PETSC_HAVE_CUDA)
2135: PETSC_EXTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat);
2136: #endif
2137: #if defined(PETSC_HAVE_HIP)
2138: PETSC_EXTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLHIP(Mat);
2139: #endif
2141: PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B)
2142: {
2143: Mat_SeqSELL *b;
2144: PetscMPIInt size;
2146: PetscFunctionBegin;
2147: PetscCall(PetscCitationsRegister(citation, &cited));
2148: PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
2149: PetscCheck(size <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Comm must be of size 1");
2151: PetscCall(PetscNew(&b));
2153: B->data = (void *)b;
2154: B->ops[0] = MatOps_Values;
2156: b->row = NULL;
2157: b->col = NULL;
2158: b->icol = NULL;
2159: b->reallocs = 0;
2160: b->ignorezeroentries = PETSC_FALSE;
2161: b->roworiented = PETSC_TRUE;
2162: b->nonew = 0;
2163: b->diag = NULL;
2164: b->solve_work = NULL;
2165: B->spptr = NULL;
2166: b->saved_values = NULL;
2167: b->idiag = NULL;
2168: b->mdiag = NULL;
2169: b->ssor_work = NULL;
2170: b->omega = 1.0;
2171: b->fshift = 0.0;
2172: b->keepnonzeropattern = PETSC_FALSE;
2173: b->sliceheight = 0;
2175: PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELL));
2176: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetArray_C", MatSeqSELLGetArray_SeqSELL));
2177: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLRestoreArray_C", MatSeqSELLRestoreArray_SeqSELL));
2178: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatStoreValues_C", MatStoreValues_SeqSELL));
2179: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatRetrieveValues_C", MatRetrieveValues_SeqSELL));
2180: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetPreallocation_C", MatSeqSELLSetPreallocation_SeqSELL));
2181: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqaij_C", MatConvert_SeqSELL_SeqAIJ));
2182: #if defined(PETSC_HAVE_CUDA)
2183: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqsellcuda_C", MatConvert_SeqSELL_SeqSELLCUDA));
2184: #endif
2185: #if defined(PETSC_HAVE_HIP)
2186: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqsellhip_C", MatConvert_SeqSELL_SeqSELLHIP));
2187: #endif
2188: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetFillRatio_C", MatSeqSELLGetFillRatio_SeqSELL));
2189: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetMaxSliceWidth_C", MatSeqSELLGetMaxSliceWidth_SeqSELL));
2190: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetAvgSliceWidth_C", MatSeqSELLGetAvgSliceWidth_SeqSELL));
2191: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetVarSliceSize_C", MatSeqSELLGetVarSliceSize_SeqSELL));
2192: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetSliceHeight_C", MatSeqSELLSetSliceHeight_SeqSELL));
2194: PetscObjectOptionsBegin((PetscObject)B);
2195: {
2196: PetscInt newsh = -1;
2197: PetscBool flg;
2198: #if defined(PETSC_HAVE_CUPM)
2199: PetscInt chunksize = 0;
2200: #endif
2202: PetscCall(PetscOptionsInt("-mat_sell_slice_height", "Set the slice height used to store SELL matrix", "MatSELLSetSliceHeight", newsh, &newsh, &flg));
2203: if (flg) PetscCall(MatSeqSELLSetSliceHeight(B, newsh));
2204: #if defined(PETSC_HAVE_CUPM)
2205: PetscCall(PetscOptionsInt("-mat_sell_chunk_size", "Set the chunksize for load-balanced CUDA/HIP kernels. Choices include 64,128,256,512,1024", NULL, chunksize, &chunksize, &flg));
2206: if (flg) {
2207: PetscCheck(chunksize >= 64 && chunksize <= 1024 && chunksize % 64 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "chunksize must be a number in {64,128,256,512,1024}: value %" PetscInt_FMT, chunksize);
2208: b->chunksize = chunksize;
2209: }
2210: #endif
2211: }
2212: PetscOptionsEnd();
2213: PetscFunctionReturn(PETSC_SUCCESS);
2214: }
2216: /*
2217: Given a matrix generated with MatGetFactor() duplicates all the information in A into B
2218: */
2219: static PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C, Mat A, MatDuplicateOption cpvalues, PetscBool mallocmatspace)
2220: {
2221: Mat_SeqSELL *c = (Mat_SeqSELL *)C->data, *a = (Mat_SeqSELL *)A->data;
2222: PetscInt i, m = A->rmap->n;
2223: PetscInt totalslices = a->totalslices;
2225: PetscFunctionBegin;
2226: C->factortype = A->factortype;
2227: c->row = NULL;
2228: c->col = NULL;
2229: c->icol = NULL;
2230: c->reallocs = 0;
2231: C->assembled = PETSC_TRUE;
2233: PetscCall(PetscLayoutReference(A->rmap, &C->rmap));
2234: PetscCall(PetscLayoutReference(A->cmap, &C->cmap));
2236: c->sliceheight = a->sliceheight;
2237: PetscCall(PetscMalloc1(c->sliceheight * totalslices, &c->rlen));
2238: PetscCall(PetscMalloc1(totalslices + 1, &c->sliidx));
2240: for (i = 0; i < m; i++) c->rlen[i] = a->rlen[i];
2241: for (i = 0; i < totalslices + 1; i++) c->sliidx[i] = a->sliidx[i];
2243: /* allocate the matrix space */
2244: if (mallocmatspace) {
2245: PetscCall(PetscMalloc2(a->maxallocmat, &c->val, a->maxallocmat, &c->colidx));
2247: c->singlemalloc = PETSC_TRUE;
2249: if (m > 0) {
2250: PetscCall(PetscArraycpy(c->colidx, a->colidx, a->maxallocmat));
2251: if (cpvalues == MAT_COPY_VALUES) {
2252: PetscCall(PetscArraycpy(c->val, a->val, a->maxallocmat));
2253: } else {
2254: PetscCall(PetscArrayzero(c->val, a->maxallocmat));
2255: }
2256: }
2257: }
2259: c->ignorezeroentries = a->ignorezeroentries;
2260: c->roworiented = a->roworiented;
2261: c->nonew = a->nonew;
2262: c->solve_work = NULL;
2263: c->saved_values = NULL;
2264: c->idiag = NULL;
2265: c->ssor_work = NULL;
2266: c->keepnonzeropattern = a->keepnonzeropattern;
2267: c->free_val = PETSC_TRUE;
2268: c->free_colidx = PETSC_TRUE;
2270: c->maxallocmat = a->maxallocmat;
2271: c->maxallocrow = a->maxallocrow;
2272: c->rlenmax = a->rlenmax;
2273: c->nz = a->nz;
2274: C->preallocated = PETSC_TRUE;
2276: c->nonzerorowcnt = a->nonzerorowcnt;
2277: C->nonzerostate = A->nonzerostate;
2279: PetscCall(PetscFunctionListDuplicate(((PetscObject)A)->qlist, &((PetscObject)C)->qlist));
2280: PetscFunctionReturn(PETSC_SUCCESS);
2281: }
2283: PetscErrorCode MatDuplicate_SeqSELL(Mat A, MatDuplicateOption cpvalues, Mat *B)
2284: {
2285: PetscFunctionBegin;
2286: PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
2287: PetscCall(MatSetSizes(*B, A->rmap->n, A->cmap->n, A->rmap->n, A->cmap->n));
2288: if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) PetscCall(MatSetBlockSizesFromMats(*B, A, A));
2289: PetscCall(MatSetType(*B, ((PetscObject)A)->type_name));
2290: PetscCall(MatDuplicateNoCreate_SeqSELL(*B, A, cpvalues, PETSC_TRUE));
2291: PetscFunctionReturn(PETSC_SUCCESS);
2292: }
2294: /*MC
2295: MATSEQSELL - MATSEQSELL = "seqsell" - A matrix type to be used for sequential sparse matrices,
2296: based on the sliced Ellpack format, {cite}`zhangellpack2018`
2298: Options Database Key:
2299: . -mat_type seqsell - sets the matrix type to "`MATSEQELL` during a call to `MatSetFromOptions()`
2301: Level: beginner
2303: .seealso: `Mat`, `MatCreateSeqSELL()`, `MATSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATAIJ`, `MATMPIAIJ`
2304: M*/
2306: /*MC
2307: MATSELL - MATSELL = "sell" - A matrix type to be used for sparse matrices, {cite}`zhangellpack2018`
2309: This matrix type is identical to `MATSEQSELL` when constructed with a single process communicator,
2310: and `MATMPISELL` otherwise. As a result, for single process communicators,
2311: `MatSeqSELLSetPreallocation()` is supported, and similarly `MatMPISELLSetPreallocation()` is supported
2312: for communicators controlling multiple processes. It is recommended that you call both of
2313: the above preallocation routines for simplicity.
2315: Options Database Key:
2316: . -mat_type sell - sets the matrix type to "sell" during a call to MatSetFromOptions()
2318: Level: beginner
2320: Notes:
2321: This format is only supported for real scalars, double precision, and 32-bit indices (the defaults).
2323: It can provide better performance on Intel and AMD processes with AVX2 or AVX512 support for matrices that have a similar number of
2324: non-zeros in contiguous groups of rows. However if the computation is memory bandwidth limited it may not provide much improvement.
2326: Developer Notes:
2327: On Intel (and AMD) systems some of the matrix operations use SIMD (AVX) instructions to achieve higher performance.
2329: The sparse matrix format is as follows. For simplicity we assume a slice size of 2, it is actually 8
2330: .vb
2331: (2 0 3 4)
2332: Consider the matrix A = (5 0 6 0)
2333: (0 0 7 8)
2334: (0 0 9 9)
2336: symbolically the Ellpack format can be written as
2338: (2 3 4 |) (0 2 3 |)
2339: v = (5 6 0 |) colidx = (0 2 2 |)
2340: -------- ---------
2341: (7 8 |) (2 3 |)
2342: (9 9 |) (2 3 |)
2344: The data for 2 contiguous rows of the matrix are stored together (in column-major format) (with any left-over rows handled as a special case).
2345: Any of the rows in a slice fewer columns than the rest of the slice (row 1 above) are padded with a previous valid column in their "extra" colidx[] locations and
2346: zeros in their "extra" v locations so that the matrix operations do not need special code to handle different length rows within the 2 rows in a slice.
2348: The one-dimensional representation of v used in the code is (2 5 3 6 4 0 7 9 8 9) and for colidx is (0 0 2 2 3 2 2 2 3 3)
2350: .ve
2352: See `MatMult_SeqSELL()` for how this format is used with the SIMD operations to achieve high performance.
2354: .seealso: `Mat`, `MatCreateSeqSELL()`, `MatCreateSeqAIJ()`, `MatCreateSELL()`, `MATSEQSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATMPIAIJ`, `MATAIJ`
2355: M*/
2357: /*@
2358: MatCreateSeqSELL - Creates a sparse matrix in `MATSEQSELL` format.
2360: Collective
2362: Input Parameters:
2363: + comm - MPI communicator, set to `PETSC_COMM_SELF`
2364: . m - number of rows
2365: . n - number of columns
2366: . rlenmax - maximum number of nonzeros in a row, ignored if `rlen` is provided
2367: - rlen - array containing the number of nonzeros in the various rows (possibly different for each row) or NULL
2369: Output Parameter:
2370: . A - the matrix
2372: Level: intermediate
2374: Notes:
2375: It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
2376: MatXXXXSetPreallocation() paradigm instead of this routine directly.
2377: [MatXXXXSetPreallocation() is, for example, `MatSeqSELLSetPreallocation()`]
2379: Specify the preallocated storage with either `rlenmax` or `rlen` (not both).
2380: Set `rlenmax` = `PETSC_DEFAULT` and `rlen` = `NULL` for PETSc to control dynamic memory
2381: allocation.
2383: .seealso: `Mat`, `MATSEQSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatSeqSELLSetPreallocation()`, `MATSELL`, `MATMPISELL`
2384: @*/
2385: PetscErrorCode MatCreateSeqSELL(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt rlenmax, const PetscInt rlen[], Mat *A)
2386: {
2387: PetscFunctionBegin;
2388: PetscCall(MatCreate(comm, A));
2389: PetscCall(MatSetSizes(*A, m, n, m, n));
2390: PetscCall(MatSetType(*A, MATSEQSELL));
2391: PetscCall(MatSeqSELLSetPreallocation_SeqSELL(*A, rlenmax, rlen));
2392: PetscFunctionReturn(PETSC_SUCCESS);
2393: }
2395: PetscErrorCode MatEqual_SeqSELL(Mat A, Mat B, PetscBool *flg)
2396: {
2397: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data, *b = (Mat_SeqSELL *)B->data;
2398: PetscInt totalslices = a->totalslices;
2400: PetscFunctionBegin;
2401: /* If the matrix dimensions are not equal,or no of nonzeros */
2402: if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) || (a->nz != b->nz) || (a->rlenmax != b->rlenmax)) {
2403: *flg = PETSC_FALSE;
2404: PetscFunctionReturn(PETSC_SUCCESS);
2405: }
2406: /* if the a->colidx are the same */
2407: PetscCall(PetscArraycmp(a->colidx, b->colidx, a->sliidx[totalslices], flg));
2408: if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);
2409: /* if a->val are the same */
2410: PetscCall(PetscArraycmp(a->val, b->val, a->sliidx[totalslices], flg));
2411: PetscFunctionReturn(PETSC_SUCCESS);
2412: }
2414: PetscErrorCode MatConjugate_SeqSELL(Mat A)
2415: {
2416: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
2417: PetscScalar *val = a->val;
2419: PetscFunctionBegin;
2420: for (PetscInt i = 0; i < a->sliidx[a->totalslices]; i++) val[i] = PetscConj(val[i]);
2421: #if defined(PETSC_HAVE_CUPM)
2422: if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU;
2423: #endif
2424: PetscFunctionReturn(PETSC_SUCCESS);
2425: }