Actual source code: sell.c
2: /*
3: Defines the basic matrix operations for the SELL matrix storage format.
4: */
5: #include <../src/mat/impls/sell/seq/sell.h>
6: #include <petscblaslapack.h>
7: #include <petsc/private/kernels/blocktranspose.h>
9: static PetscBool cited = PETSC_FALSE;
10: static const char citation[] = "@inproceedings{ZhangELLPACK2018,\n"
11: " author = {Hong Zhang and Richard T. Mills and Karl Rupp and Barry F. Smith},\n"
12: " title = {Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}},\n"
13: " booktitle = {Proceedings of the 47th International Conference on Parallel Processing},\n"
14: " year = 2018\n"
15: "}\n";
17: #if defined(PETSC_HAVE_IMMINTRIN_H) && (defined(__AVX512F__) || (defined(__AVX2__) && defined(__FMA__)) || defined(__AVX__)) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
19: #include <immintrin.h>
21: #if !defined(_MM_SCALE_8)
22: #define _MM_SCALE_8 8
23: #endif
25: #if defined(__AVX512F__)
26: /* these do not work
27: vec_idx = _mm512_loadunpackhi_epi32(vec_idx,acolidx);
28: vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval);
29: */
30: #define AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \
31: /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \
32: vec_idx = _mm256_loadu_si256((__m256i const *)acolidx); \
33: vec_vals = _mm512_loadu_pd(aval); \
34: vec_x = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8); \
35: vec_y = _mm512_fmadd_pd(vec_x, vec_vals, vec_y)
36: #elif defined(__AVX2__) && defined(__FMA__)
37: #define AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \
38: vec_vals = _mm256_loadu_pd(aval); \
39: vec_idx = _mm_loadu_si128((__m128i const *)acolidx); /* SSE2 */ \
40: vec_x = _mm256_i32gather_pd(x, vec_idx, _MM_SCALE_8); \
41: vec_y = _mm256_fmadd_pd(vec_x, vec_vals, vec_y)
42: #endif
43: #endif /* PETSC_HAVE_IMMINTRIN_H */
45: /*@C
46: MatSeqSELLSetPreallocation - For good matrix assembly performance
47: the user should preallocate the matrix storage by setting the parameter nz
48: (or the array nnz). By setting these parameters accurately, performance
49: during matrix assembly can be increased significantly.
51: Collective
53: Input Parameters:
54: + B - The `MATSEQSELL` matrix
55: . nz - number of nonzeros per row (same for all rows)
56: - nnz - array containing the number of nonzeros in the various rows
57: (possibly different for each row) or NULL
59: Notes:
60: If nnz is given then nz is ignored.
62: Specify the preallocated storage with either nz or nnz (not both).
63: Set nz = `PETSC_DEFAULT` and nnz = NULL for PETSc to control dynamic memory
64: allocation. For large problems you MUST preallocate memory or you
65: will get TERRIBLE performance, see the users' manual chapter on matrices.
67: You can call `MatGetInfo()` to get information on how effective the preallocation was;
68: for example the fields mallocs,nz_allocated,nz_used,nz_unneeded;
69: You can also run with the option -info and look for messages with the string
70: malloc in them to see if additional memory allocation was needed.
72: Developers: Use nz of `MAT_SKIP_ALLOCATION` to not allocate any space for the matrix
73: entries or columns indices.
75: The maximum number of nonzeos in any row should be as accurate as possible.
76: If it is underestimated, you will get bad performance due to reallocation
77: (MatSeqXSELLReallocateSELL).
79: Level: intermediate
81: .seealso: `MATSEQSELL`, `MATSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatGetInfo()`
82: @*/
83: PetscErrorCode MatSeqSELLSetPreallocation(Mat B, PetscInt rlenmax, const PetscInt rlen[])
84: {
87: PetscTryMethod(B, "MatSeqSELLSetPreallocation_C", (Mat, PetscInt, const PetscInt[]), (B, rlenmax, rlen));
88: return 0;
89: }
91: PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B, PetscInt maxallocrow, const PetscInt rlen[])
92: {
93: Mat_SeqSELL *b;
94: PetscInt i, j, totalslices;
95: PetscBool skipallocation = PETSC_FALSE, realalloc = PETSC_FALSE;
97: if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE;
98: if (maxallocrow == MAT_SKIP_ALLOCATION) {
99: skipallocation = PETSC_TRUE;
100: maxallocrow = 0;
101: }
103: PetscLayoutSetUp(B->rmap);
104: PetscLayoutSetUp(B->cmap);
106: /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */
107: if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5;
109: if (rlen) {
110: for (i = 0; i < B->rmap->n; i++) {
113: }
114: }
116: B->preallocated = PETSC_TRUE;
118: b = (Mat_SeqSELL *)B->data;
120: totalslices = PetscCeilInt(B->rmap->n, 8);
121: b->totalslices = totalslices;
122: if (!skipallocation) {
123: if (B->rmap->n & 0x07) PetscInfo(B, "Padding rows to the SEQSELL matrix because the number of rows is not the multiple of 8 (value %" PetscInt_FMT ")\n", B->rmap->n);
125: if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */
126: PetscMalloc1(totalslices + 1, &b->sliidx);
127: }
128: if (!rlen) { /* if rlen is not provided, allocate same space for all the slices */
129: if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10;
130: else if (maxallocrow < 0) maxallocrow = 1;
131: for (i = 0; i <= totalslices; i++) b->sliidx[i] = i * 8 * maxallocrow;
132: } else {
133: maxallocrow = 0;
134: b->sliidx[0] = 0;
135: for (i = 1; i < totalslices; i++) {
136: b->sliidx[i] = 0;
137: for (j = 0; j < 8; j++) b->sliidx[i] = PetscMax(b->sliidx[i], rlen[8 * (i - 1) + j]);
138: maxallocrow = PetscMax(b->sliidx[i], maxallocrow);
139: PetscIntSumError(b->sliidx[i - 1], 8 * b->sliidx[i], &b->sliidx[i]);
140: }
141: /* last slice */
142: b->sliidx[totalslices] = 0;
143: for (j = (totalslices - 1) * 8; j < B->rmap->n; j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices], rlen[j]);
144: maxallocrow = PetscMax(b->sliidx[totalslices], maxallocrow);
145: b->sliidx[totalslices] = b->sliidx[totalslices - 1] + 8 * b->sliidx[totalslices];
146: }
148: /* allocate space for val, colidx, rlen */
149: /* FIXME: should B's old memory be unlogged? */
150: MatSeqXSELLFreeSELL(B, &b->val, &b->colidx);
151: /* FIXME: assuming an element of the bit array takes 8 bits */
152: PetscMalloc2(b->sliidx[totalslices], &b->val, b->sliidx[totalslices], &b->colidx);
153: /* b->rlen will count nonzeros in each row so far. We dont copy rlen to b->rlen because the matrix has not been set. */
154: PetscCalloc1(8 * totalslices, &b->rlen);
156: b->singlemalloc = PETSC_TRUE;
157: b->free_val = PETSC_TRUE;
158: b->free_colidx = PETSC_TRUE;
159: } else {
160: b->free_val = PETSC_FALSE;
161: b->free_colidx = PETSC_FALSE;
162: }
164: b->nz = 0;
165: b->maxallocrow = maxallocrow;
166: b->rlenmax = maxallocrow;
167: b->maxallocmat = b->sliidx[totalslices];
168: B->info.nz_unneeded = (double)b->maxallocmat;
169: if (realalloc) MatSetOption(B, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE);
170: return 0;
171: }
173: PetscErrorCode MatGetRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
174: {
175: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
176: PetscInt shift;
179: if (nz) *nz = a->rlen[row];
180: shift = a->sliidx[row >> 3] + (row & 0x07);
181: if (!a->getrowcols) PetscMalloc2(a->rlenmax, &a->getrowcols, a->rlenmax, &a->getrowvals);
182: if (idx) {
183: PetscInt j;
184: for (j = 0; j < a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift + 8 * j];
185: *idx = a->getrowcols;
186: }
187: if (v) {
188: PetscInt j;
189: for (j = 0; j < a->rlen[row]; j++) a->getrowvals[j] = a->val[shift + 8 * j];
190: *v = a->getrowvals;
191: }
192: return 0;
193: }
195: PetscErrorCode MatRestoreRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
196: {
197: return 0;
198: }
200: PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype, MatReuse reuse, Mat *newmat)
201: {
202: Mat B;
203: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
204: PetscInt i;
206: if (reuse == MAT_REUSE_MATRIX) {
207: B = *newmat;
208: MatZeroEntries(B);
209: } else {
210: MatCreate(PetscObjectComm((PetscObject)A), &B);
211: MatSetSizes(B, A->rmap->n, A->cmap->n, A->rmap->N, A->cmap->N);
212: MatSetType(B, MATSEQAIJ);
213: MatSeqAIJSetPreallocation(B, 0, a->rlen);
214: }
216: for (i = 0; i < A->rmap->n; i++) {
217: PetscInt nz = 0, *cols = NULL;
218: PetscScalar *vals = NULL;
220: MatGetRow_SeqSELL(A, i, &nz, &cols, &vals);
221: MatSetValues(B, 1, &i, nz, cols, vals, INSERT_VALUES);
222: MatRestoreRow_SeqSELL(A, i, &nz, &cols, &vals);
223: }
225: MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY);
226: MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY);
227: B->rmap->bs = A->rmap->bs;
229: if (reuse == MAT_INPLACE_MATRIX) {
230: MatHeaderReplace(A, &B);
231: } else {
232: *newmat = B;
233: }
234: return 0;
235: }
237: #include <../src/mat/impls/aij/seq/aij.h>
239: PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A, MatType newtype, MatReuse reuse, Mat *newmat)
240: {
241: Mat B;
242: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
243: PetscInt *ai = a->i, m = A->rmap->N, n = A->cmap->N, i, *rowlengths, row, ncols;
244: const PetscInt *cols;
245: const PetscScalar *vals;
248: if (reuse == MAT_REUSE_MATRIX) {
249: B = *newmat;
250: } else {
251: if (PetscDefined(USE_DEBUG) || !a->ilen) {
252: PetscMalloc1(m, &rowlengths);
253: for (i = 0; i < m; i++) rowlengths[i] = ai[i + 1] - ai[i];
254: }
255: if (PetscDefined(USE_DEBUG) && a->ilen) {
256: PetscBool eq;
257: PetscMemcmp(rowlengths, a->ilen, m * sizeof(PetscInt), &eq);
259: PetscFree(rowlengths);
260: rowlengths = a->ilen;
261: } else if (a->ilen) rowlengths = a->ilen;
262: MatCreate(PetscObjectComm((PetscObject)A), &B);
263: MatSetSizes(B, m, n, m, n);
264: MatSetType(B, MATSEQSELL);
265: MatSeqSELLSetPreallocation(B, 0, rowlengths);
266: if (rowlengths != a->ilen) PetscFree(rowlengths);
267: }
269: for (row = 0; row < m; row++) {
270: MatGetRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals);
271: MatSetValues_SeqSELL(B, 1, &row, ncols, cols, vals, INSERT_VALUES);
272: MatRestoreRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals);
273: }
274: MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY);
275: MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY);
276: B->rmap->bs = A->rmap->bs;
278: if (reuse == MAT_INPLACE_MATRIX) {
279: MatHeaderReplace(A, &B);
280: } else {
281: *newmat = B;
282: }
283: return 0;
284: }
286: PetscErrorCode MatMult_SeqSELL(Mat A, Vec xx, Vec yy)
287: {
288: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
289: PetscScalar *y;
290: const PetscScalar *x;
291: const MatScalar *aval = a->val;
292: PetscInt totalslices = a->totalslices;
293: const PetscInt *acolidx = a->colidx;
294: PetscInt i, j;
295: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
296: __m512d vec_x, vec_y, vec_vals;
297: __m256i vec_idx;
298: __mmask8 mask;
299: __m512d vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4;
300: __m256i vec_idx2, vec_idx3, vec_idx4;
301: #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
302: __m128i vec_idx;
303: __m256d vec_x, vec_y, vec_y2, vec_vals;
304: MatScalar yval;
305: PetscInt r, rows_left, row, nnz_in_row;
306: #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
307: __m128d vec_x_tmp;
308: __m256d vec_x, vec_y, vec_y2, vec_vals;
309: MatScalar yval;
310: PetscInt r, rows_left, row, nnz_in_row;
311: #else
312: PetscScalar sum[8];
313: #endif
315: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
316: #pragma disjoint(*x, *y, *aval)
317: #endif
319: VecGetArrayRead(xx, &x);
320: VecGetArray(yy, &y);
321: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
322: for (i = 0; i < totalslices; i++) { /* loop over slices */
323: PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
324: PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
326: vec_y = _mm512_setzero_pd();
327: vec_y2 = _mm512_setzero_pd();
328: vec_y3 = _mm512_setzero_pd();
329: vec_y4 = _mm512_setzero_pd();
331: j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */
332: switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) {
333: case 3:
334: AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
335: acolidx += 8;
336: aval += 8;
337: AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
338: acolidx += 8;
339: aval += 8;
340: AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
341: acolidx += 8;
342: aval += 8;
343: j += 3;
344: break;
345: case 2:
346: AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
347: acolidx += 8;
348: aval += 8;
349: AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
350: acolidx += 8;
351: aval += 8;
352: j += 2;
353: break;
354: case 1:
355: AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
356: acolidx += 8;
357: aval += 8;
358: j += 1;
359: break;
360: }
361: #pragma novector
362: for (; j < (a->sliidx[i + 1] >> 3); j += 4) {
363: AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
364: acolidx += 8;
365: aval += 8;
366: AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
367: acolidx += 8;
368: aval += 8;
369: AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
370: acolidx += 8;
371: aval += 8;
372: AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4);
373: acolidx += 8;
374: aval += 8;
375: }
377: vec_y = _mm512_add_pd(vec_y, vec_y2);
378: vec_y = _mm512_add_pd(vec_y, vec_y3);
379: vec_y = _mm512_add_pd(vec_y, vec_y4);
380: if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
381: mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07)));
382: _mm512_mask_storeu_pd(&y[8 * i], mask, vec_y);
383: } else {
384: _mm512_storeu_pd(&y[8 * i], vec_y);
385: }
386: }
387: #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
388: for (i = 0; i < totalslices; i++) { /* loop over full slices */
389: PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
390: PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
392: /* last slice may have padding rows. Don't use vectorization. */
393: if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
394: rows_left = A->rmap->n - 8 * i;
395: for (r = 0; r < rows_left; ++r) {
396: yval = (MatScalar)0;
397: row = 8 * i + r;
398: nnz_in_row = a->rlen[row];
399: for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
400: y[row] = yval;
401: }
402: break;
403: }
405: vec_y = _mm256_setzero_pd();
406: vec_y2 = _mm256_setzero_pd();
408: /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
409: #pragma novector
410: #pragma unroll(2)
411: for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
412: AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
413: aval += 4;
414: acolidx += 4;
415: AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y2);
416: aval += 4;
417: acolidx += 4;
418: }
420: _mm256_storeu_pd(y + i * 8, vec_y);
421: _mm256_storeu_pd(y + i * 8 + 4, vec_y2);
422: }
423: #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
424: for (i = 0; i < totalslices; i++) { /* loop over full slices */
425: PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
426: PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
428: vec_y = _mm256_setzero_pd();
429: vec_y2 = _mm256_setzero_pd();
431: /* last slice may have padding rows. Don't use vectorization. */
432: if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
433: rows_left = A->rmap->n - 8 * i;
434: for (r = 0; r < rows_left; ++r) {
435: yval = (MatScalar)0;
436: row = 8 * i + r;
437: nnz_in_row = a->rlen[row];
438: for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
439: y[row] = yval;
440: }
441: break;
442: }
444: /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
445: #pragma novector
446: #pragma unroll(2)
447: for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
448: vec_vals = _mm256_loadu_pd(aval);
449: vec_x_tmp = _mm_setzero_pd();
450: vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
451: vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
452: vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
453: vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
454: vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
455: vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
456: vec_y = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y);
457: aval += 4;
459: vec_vals = _mm256_loadu_pd(aval);
460: vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
461: vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
462: vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
463: vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
464: vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
465: vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
466: vec_y2 = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2);
467: aval += 4;
468: }
470: _mm256_storeu_pd(y + i * 8, vec_y);
471: _mm256_storeu_pd(y + i * 8 + 4, vec_y2);
472: }
473: #else
474: for (i = 0; i < totalslices; i++) { /* loop over slices */
475: for (j = 0; j < 8; j++) sum[j] = 0.0;
476: for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
477: sum[0] += aval[j] * x[acolidx[j]];
478: sum[1] += aval[j + 1] * x[acolidx[j + 1]];
479: sum[2] += aval[j + 2] * x[acolidx[j + 2]];
480: sum[3] += aval[j + 3] * x[acolidx[j + 3]];
481: sum[4] += aval[j + 4] * x[acolidx[j + 4]];
482: sum[5] += aval[j + 5] * x[acolidx[j + 5]];
483: sum[6] += aval[j + 6] * x[acolidx[j + 6]];
484: sum[7] += aval[j + 7] * x[acolidx[j + 7]];
485: }
486: if (i == totalslices - 1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
487: for (j = 0; j < (A->rmap->n & 0x07); j++) y[8 * i + j] = sum[j];
488: } else {
489: for (j = 0; j < 8; j++) y[8 * i + j] = sum[j];
490: }
491: }
492: #endif
494: PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt); /* theoretical minimal FLOPs */
495: VecRestoreArrayRead(xx, &x);
496: VecRestoreArray(yy, &y);
497: return 0;
498: }
500: #include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h>
501: PetscErrorCode MatMultAdd_SeqSELL(Mat A, Vec xx, Vec yy, Vec zz)
502: {
503: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
504: PetscScalar *y, *z;
505: const PetscScalar *x;
506: const MatScalar *aval = a->val;
507: PetscInt totalslices = a->totalslices;
508: const PetscInt *acolidx = a->colidx;
509: PetscInt i, j;
510: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
511: __m512d vec_x, vec_y, vec_vals;
512: __m256i vec_idx;
513: __mmask8 mask;
514: __m512d vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4;
515: __m256i vec_idx2, vec_idx3, vec_idx4;
516: #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
517: __m128d vec_x_tmp;
518: __m256d vec_x, vec_y, vec_y2, vec_vals;
519: MatScalar yval;
520: PetscInt r, row, nnz_in_row;
521: #else
522: PetscScalar sum[8];
523: #endif
525: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
526: #pragma disjoint(*x, *y, *aval)
527: #endif
529: VecGetArrayRead(xx, &x);
530: VecGetArrayPair(yy, zz, &y, &z);
531: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
532: for (i = 0; i < totalslices; i++) { /* loop over slices */
533: PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
534: PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
536: if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
537: mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07)));
538: vec_y = _mm512_mask_loadu_pd(vec_y, mask, &y[8 * i]);
539: } else {
540: vec_y = _mm512_loadu_pd(&y[8 * i]);
541: }
542: vec_y2 = _mm512_setzero_pd();
543: vec_y3 = _mm512_setzero_pd();
544: vec_y4 = _mm512_setzero_pd();
546: j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */
547: switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) {
548: case 3:
549: AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
550: acolidx += 8;
551: aval += 8;
552: AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
553: acolidx += 8;
554: aval += 8;
555: AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
556: acolidx += 8;
557: aval += 8;
558: j += 3;
559: break;
560: case 2:
561: AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
562: acolidx += 8;
563: aval += 8;
564: AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
565: acolidx += 8;
566: aval += 8;
567: j += 2;
568: break;
569: case 1:
570: AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
571: acolidx += 8;
572: aval += 8;
573: j += 1;
574: break;
575: }
576: #pragma novector
577: for (; j < (a->sliidx[i + 1] >> 3); j += 4) {
578: AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
579: acolidx += 8;
580: aval += 8;
581: AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
582: acolidx += 8;
583: aval += 8;
584: AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
585: acolidx += 8;
586: aval += 8;
587: AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4);
588: acolidx += 8;
589: aval += 8;
590: }
592: vec_y = _mm512_add_pd(vec_y, vec_y2);
593: vec_y = _mm512_add_pd(vec_y, vec_y3);
594: vec_y = _mm512_add_pd(vec_y, vec_y4);
595: if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
596: _mm512_mask_storeu_pd(&z[8 * i], mask, vec_y);
597: } else {
598: _mm512_storeu_pd(&z[8 * i], vec_y);
599: }
600: }
601: #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
602: for (i = 0; i < totalslices; i++) { /* loop over full slices */
603: PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
604: PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
606: /* last slice may have padding rows. Don't use vectorization. */
607: if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
608: for (r = 0; r < (A->rmap->n & 0x07); ++r) {
609: row = 8 * i + r;
610: yval = (MatScalar)0.0;
611: nnz_in_row = a->rlen[row];
612: for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
613: z[row] = y[row] + yval;
614: }
615: break;
616: }
618: vec_y = _mm256_loadu_pd(y + 8 * i);
619: vec_y2 = _mm256_loadu_pd(y + 8 * i + 4);
621: /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
622: for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
623: vec_vals = _mm256_loadu_pd(aval);
624: vec_x_tmp = _mm_setzero_pd();
625: vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
626: vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
627: vec_x = _mm256_setzero_pd();
628: vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
629: vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
630: vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
631: vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
632: vec_y = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y);
633: aval += 4;
635: vec_vals = _mm256_loadu_pd(aval);
636: vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
637: vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
638: vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
639: vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
640: vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
641: vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
642: vec_y2 = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2);
643: aval += 4;
644: }
646: _mm256_storeu_pd(z + i * 8, vec_y);
647: _mm256_storeu_pd(z + i * 8 + 4, vec_y2);
648: }
649: #else
650: for (i = 0; i < totalslices; i++) { /* loop over slices */
651: for (j = 0; j < 8; j++) sum[j] = 0.0;
652: for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
653: sum[0] += aval[j] * x[acolidx[j]];
654: sum[1] += aval[j + 1] * x[acolidx[j + 1]];
655: sum[2] += aval[j + 2] * x[acolidx[j + 2]];
656: sum[3] += aval[j + 3] * x[acolidx[j + 3]];
657: sum[4] += aval[j + 4] * x[acolidx[j + 4]];
658: sum[5] += aval[j + 5] * x[acolidx[j + 5]];
659: sum[6] += aval[j + 6] * x[acolidx[j + 6]];
660: sum[7] += aval[j + 7] * x[acolidx[j + 7]];
661: }
662: if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
663: for (j = 0; j < (A->rmap->n & 0x07); j++) z[8 * i + j] = y[8 * i + j] + sum[j];
664: } else {
665: for (j = 0; j < 8; j++) z[8 * i + j] = y[8 * i + j] + sum[j];
666: }
667: }
668: #endif
670: PetscLogFlops(2.0 * a->nz);
671: VecRestoreArrayRead(xx, &x);
672: VecRestoreArrayPair(yy, zz, &y, &z);
673: return 0;
674: }
676: PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A, Vec xx, Vec zz, Vec yy)
677: {
678: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
679: PetscScalar *y;
680: const PetscScalar *x;
681: const MatScalar *aval = a->val;
682: const PetscInt *acolidx = a->colidx;
683: PetscInt i, j, r, row, nnz_in_row, totalslices = a->totalslices;
685: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
686: #pragma disjoint(*x, *y, *aval)
687: #endif
689: if (A->symmetric == PETSC_BOOL3_TRUE) {
690: MatMultAdd_SeqSELL(A, xx, zz, yy);
691: return 0;
692: }
693: if (zz != yy) VecCopy(zz, yy);
694: VecGetArrayRead(xx, &x);
695: VecGetArray(yy, &y);
696: for (i = 0; i < a->totalslices; i++) { /* loop over slices */
697: if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
698: for (r = 0; r < (A->rmap->n & 0x07); ++r) {
699: row = 8 * i + r;
700: nnz_in_row = a->rlen[row];
701: for (j = 0; j < nnz_in_row; ++j) y[acolidx[8 * j + r]] += aval[8 * j + r] * x[row];
702: }
703: break;
704: }
705: for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
706: y[acolidx[j]] += aval[j] * x[8 * i];
707: y[acolidx[j + 1]] += aval[j + 1] * x[8 * i + 1];
708: y[acolidx[j + 2]] += aval[j + 2] * x[8 * i + 2];
709: y[acolidx[j + 3]] += aval[j + 3] * x[8 * i + 3];
710: y[acolidx[j + 4]] += aval[j + 4] * x[8 * i + 4];
711: y[acolidx[j + 5]] += aval[j + 5] * x[8 * i + 5];
712: y[acolidx[j + 6]] += aval[j + 6] * x[8 * i + 6];
713: y[acolidx[j + 7]] += aval[j + 7] * x[8 * i + 7];
714: }
715: }
716: PetscLogFlops(2.0 * a->sliidx[a->totalslices]);
717: VecRestoreArrayRead(xx, &x);
718: VecRestoreArray(yy, &y);
719: return 0;
720: }
722: PetscErrorCode MatMultTranspose_SeqSELL(Mat A, Vec xx, Vec yy)
723: {
724: if (A->symmetric == PETSC_BOOL3_TRUE) {
725: MatMult_SeqSELL(A, xx, yy);
726: } else {
727: VecSet(yy, 0.0);
728: MatMultTransposeAdd_SeqSELL(A, xx, yy, yy);
729: }
730: return 0;
731: }
733: /*
734: Checks for missing diagonals
735: */
736: PetscErrorCode MatMissingDiagonal_SeqSELL(Mat A, PetscBool *missing, PetscInt *d)
737: {
738: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
739: PetscInt *diag, i;
741: *missing = PETSC_FALSE;
742: if (A->rmap->n > 0 && !(a->colidx)) {
743: *missing = PETSC_TRUE;
744: if (d) *d = 0;
745: PetscInfo(A, "Matrix has no entries therefore is missing diagonal\n");
746: } else {
747: diag = a->diag;
748: for (i = 0; i < A->rmap->n; i++) {
749: if (diag[i] == -1) {
750: *missing = PETSC_TRUE;
751: if (d) *d = i;
752: PetscInfo(A, "Matrix is missing diagonal number %" PetscInt_FMT "\n", i);
753: break;
754: }
755: }
756: }
757: return 0;
758: }
760: PetscErrorCode MatMarkDiagonal_SeqSELL(Mat A)
761: {
762: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
763: PetscInt i, j, m = A->rmap->n, shift;
765: if (!a->diag) {
766: PetscMalloc1(m, &a->diag);
767: a->free_diag = PETSC_TRUE;
768: }
769: for (i = 0; i < m; i++) { /* loop over rows */
770: shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
771: a->diag[i] = -1;
772: for (j = 0; j < a->rlen[i]; j++) {
773: if (a->colidx[shift + j * 8] == i) {
774: a->diag[i] = shift + j * 8;
775: break;
776: }
777: }
778: }
779: return 0;
780: }
782: /*
783: Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways
784: */
785: PetscErrorCode MatInvertDiagonal_SeqSELL(Mat A, PetscScalar omega, PetscScalar fshift)
786: {
787: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
788: PetscInt i, *diag, m = A->rmap->n;
789: MatScalar *val = a->val;
790: PetscScalar *idiag, *mdiag;
792: if (a->idiagvalid) return 0;
793: MatMarkDiagonal_SeqSELL(A);
794: diag = a->diag;
795: if (!a->idiag) {
796: PetscMalloc3(m, &a->idiag, m, &a->mdiag, m, &a->ssor_work);
797: val = a->val;
798: }
799: mdiag = a->mdiag;
800: idiag = a->idiag;
802: if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) {
803: for (i = 0; i < m; i++) {
804: mdiag[i] = val[diag[i]];
805: if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */
806: if (PetscRealPart(fshift)) {
807: PetscInfo(A, "Zero diagonal on row %" PetscInt_FMT "\n", i);
808: A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
809: A->factorerror_zeropivot_value = 0.0;
810: A->factorerror_zeropivot_row = i;
811: } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Zero diagonal on row %" PetscInt_FMT, i);
812: }
813: idiag[i] = 1.0 / val[diag[i]];
814: }
815: PetscLogFlops(m);
816: } else {
817: for (i = 0; i < m; i++) {
818: mdiag[i] = val[diag[i]];
819: idiag[i] = omega / (fshift + val[diag[i]]);
820: }
821: PetscLogFlops(2.0 * m);
822: }
823: a->idiagvalid = PETSC_TRUE;
824: return 0;
825: }
827: PetscErrorCode MatZeroEntries_SeqSELL(Mat A)
828: {
829: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
831: PetscArrayzero(a->val, a->sliidx[a->totalslices]);
832: MatSeqSELLInvalidateDiagonal(A);
833: return 0;
834: }
836: PetscErrorCode MatDestroy_SeqSELL(Mat A)
837: {
838: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
840: #if defined(PETSC_USE_LOG)
841: PetscLogObjectState((PetscObject)A, "Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT ", NZ=%" PetscInt_FMT, A->rmap->n, A->cmap->n, a->nz);
842: #endif
843: MatSeqXSELLFreeSELL(A, &a->val, &a->colidx);
844: ISDestroy(&a->row);
845: ISDestroy(&a->col);
846: PetscFree(a->diag);
847: PetscFree(a->rlen);
848: PetscFree(a->sliidx);
849: PetscFree3(a->idiag, a->mdiag, a->ssor_work);
850: PetscFree(a->solve_work);
851: ISDestroy(&a->icol);
852: PetscFree(a->saved_values);
853: PetscFree2(a->getrowcols, a->getrowvals);
855: PetscFree(A->data);
857: PetscObjectChangeTypeName((PetscObject)A, NULL);
858: PetscObjectComposeFunction((PetscObject)A, "MatStoreValues_C", NULL);
859: PetscObjectComposeFunction((PetscObject)A, "MatRetrieveValues_C", NULL);
860: PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetPreallocation_C", NULL);
861: PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetArray_C", NULL);
862: PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLRestoreArray_C", NULL);
863: PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqaij_C", NULL);
864: return 0;
865: }
867: PetscErrorCode MatSetOption_SeqSELL(Mat A, MatOption op, PetscBool flg)
868: {
869: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
871: switch (op) {
872: case MAT_ROW_ORIENTED:
873: a->roworiented = flg;
874: break;
875: case MAT_KEEP_NONZERO_PATTERN:
876: a->keepnonzeropattern = flg;
877: break;
878: case MAT_NEW_NONZERO_LOCATIONS:
879: a->nonew = (flg ? 0 : 1);
880: break;
881: case MAT_NEW_NONZERO_LOCATION_ERR:
882: a->nonew = (flg ? -1 : 0);
883: break;
884: case MAT_NEW_NONZERO_ALLOCATION_ERR:
885: a->nonew = (flg ? -2 : 0);
886: break;
887: case MAT_UNUSED_NONZERO_LOCATION_ERR:
888: a->nounused = (flg ? -1 : 0);
889: break;
890: case MAT_FORCE_DIAGONAL_ENTRIES:
891: case MAT_IGNORE_OFF_PROC_ENTRIES:
892: case MAT_USE_HASH_TABLE:
893: case MAT_SORTED_FULL:
894: PetscInfo(A, "Option %s ignored\n", MatOptions[op]);
895: break;
896: case MAT_SPD:
897: case MAT_SYMMETRIC:
898: case MAT_STRUCTURALLY_SYMMETRIC:
899: case MAT_HERMITIAN:
900: case MAT_SYMMETRY_ETERNAL:
901: case MAT_STRUCTURAL_SYMMETRY_ETERNAL:
902: case MAT_SPD_ETERNAL:
903: /* These options are handled directly by MatSetOption() */
904: break;
905: default:
906: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unknown option %d", op);
907: }
908: return 0;
909: }
911: PetscErrorCode MatGetDiagonal_SeqSELL(Mat A, Vec v)
912: {
913: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
914: PetscInt i, j, n, shift;
915: PetscScalar *x, zero = 0.0;
917: VecGetLocalSize(v, &n);
920: if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) {
921: PetscInt *diag = a->diag;
922: VecGetArray(v, &x);
923: for (i = 0; i < n; i++) x[i] = 1.0 / a->val[diag[i]];
924: VecRestoreArray(v, &x);
925: return 0;
926: }
928: VecSet(v, zero);
929: VecGetArray(v, &x);
930: for (i = 0; i < n; i++) { /* loop over rows */
931: shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
932: x[i] = 0;
933: for (j = 0; j < a->rlen[i]; j++) {
934: if (a->colidx[shift + j * 8] == i) {
935: x[i] = a->val[shift + j * 8];
936: break;
937: }
938: }
939: }
940: VecRestoreArray(v, &x);
941: return 0;
942: }
944: PetscErrorCode MatDiagonalScale_SeqSELL(Mat A, Vec ll, Vec rr)
945: {
946: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
947: const PetscScalar *l, *r;
948: PetscInt i, j, m, n, row;
950: if (ll) {
951: /* The local size is used so that VecMPI can be passed to this routine
952: by MatDiagonalScale_MPISELL */
953: VecGetLocalSize(ll, &m);
955: VecGetArrayRead(ll, &l);
956: for (i = 0; i < a->totalslices; i++) { /* loop over slices */
957: if (i == a->totalslices - 1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
958: for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) {
959: if (row < (A->rmap->n & 0x07)) a->val[j] *= l[8 * i + row];
960: }
961: } else {
962: for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) a->val[j] *= l[8 * i + row];
963: }
964: }
965: VecRestoreArrayRead(ll, &l);
966: PetscLogFlops(a->nz);
967: }
968: if (rr) {
969: VecGetLocalSize(rr, &n);
971: VecGetArrayRead(rr, &r);
972: for (i = 0; i < a->totalslices; i++) { /* loop over slices */
973: if (i == a->totalslices - 1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
974: for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) {
975: if (row < (A->rmap->n & 0x07)) a->val[j] *= r[a->colidx[j]];
976: }
977: } else {
978: for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j++) a->val[j] *= r[a->colidx[j]];
979: }
980: }
981: VecRestoreArrayRead(rr, &r);
982: PetscLogFlops(a->nz);
983: }
984: MatSeqSELLInvalidateDiagonal(A);
985: return 0;
986: }
988: PetscErrorCode MatGetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], PetscScalar v[])
989: {
990: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
991: PetscInt *cp, i, k, low, high, t, row, col, l;
992: PetscInt shift;
993: MatScalar *vp;
995: for (k = 0; k < m; k++) { /* loop over requested rows */
996: row = im[k];
997: if (row < 0) continue;
999: shift = a->sliidx[row >> 3] + (row & 0x07); /* starting index of the row */
1000: cp = a->colidx + shift; /* pointer to the row */
1001: vp = a->val + shift; /* pointer to the row */
1002: for (l = 0; l < n; l++) { /* loop over requested columns */
1003: col = in[l];
1004: if (col < 0) continue;
1006: high = a->rlen[row];
1007: low = 0; /* assume unsorted */
1008: while (high - low > 5) {
1009: t = (low + high) / 2;
1010: if (*(cp + t * 8) > col) high = t;
1011: else low = t;
1012: }
1013: for (i = low; i < high; i++) {
1014: if (*(cp + 8 * i) > col) break;
1015: if (*(cp + 8 * i) == col) {
1016: *v++ = *(vp + 8 * i);
1017: goto finished;
1018: }
1019: }
1020: *v++ = 0.0;
1021: finished:;
1022: }
1023: }
1024: return 0;
1025: }
1027: PetscErrorCode MatView_SeqSELL_ASCII(Mat A, PetscViewer viewer)
1028: {
1029: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1030: PetscInt i, j, m = A->rmap->n, shift;
1031: const char *name;
1032: PetscViewerFormat format;
1034: PetscViewerGetFormat(viewer, &format);
1035: if (format == PETSC_VIEWER_ASCII_MATLAB) {
1036: PetscInt nofinalvalue = 0;
1037: /*
1038: if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) {
1039: nofinalvalue = 1;
1040: }
1041: */
1042: PetscViewerASCIIUseTabs(viewer, PETSC_FALSE);
1043: PetscViewerASCIIPrintf(viewer, "%% Size = %" PetscInt_FMT " %" PetscInt_FMT " \n", m, A->cmap->n);
1044: PetscViewerASCIIPrintf(viewer, "%% Nonzeros = %" PetscInt_FMT " \n", a->nz);
1045: #if defined(PETSC_USE_COMPLEX)
1046: PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",4);\n", a->nz + nofinalvalue);
1047: #else
1048: PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",3);\n", a->nz + nofinalvalue);
1049: #endif
1050: PetscViewerASCIIPrintf(viewer, "zzz = [\n");
1052: for (i = 0; i < m; i++) {
1053: shift = a->sliidx[i >> 3] + (i & 0x07);
1054: for (j = 0; j < a->rlen[i]; j++) {
1055: #if defined(PETSC_USE_COMPLEX)
1056: PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %18.16e %18.16e\n", i + 1, a->colidx[shift + 8 * j] + 1, (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j]));
1057: #else
1058: PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %18.16e\n", i + 1, a->colidx[shift + 8 * j] + 1, (double)a->val[shift + 8 * j]);
1059: #endif
1060: }
1061: }
1062: /*
1063: if (nofinalvalue) {
1064: #if defined(PETSC_USE_COMPLEX)
1065: PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e %18.16e\n",m,A->cmap->n,0.,0.);
1066: #else
1067: PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e\n",m,A->cmap->n,0.0);
1068: #endif
1069: }
1070: */
1071: PetscObjectGetName((PetscObject)A, &name);
1072: PetscViewerASCIIPrintf(viewer, "];\n %s = spconvert(zzz);\n", name);
1073: PetscViewerASCIIUseTabs(viewer, PETSC_TRUE);
1074: } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) {
1075: return 0;
1076: } else if (format == PETSC_VIEWER_ASCII_COMMON) {
1077: PetscViewerASCIIUseTabs(viewer, PETSC_FALSE);
1078: for (i = 0; i < m; i++) {
1079: PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i);
1080: shift = a->sliidx[i >> 3] + (i & 0x07);
1081: for (j = 0; j < a->rlen[i]; j++) {
1082: #if defined(PETSC_USE_COMPLEX)
1083: if (PetscImaginaryPart(a->val[shift + 8 * j]) > 0.0 && PetscRealPart(a->val[shift + 8 * j]) != 0.0) {
1084: PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j]));
1085: } else if (PetscImaginaryPart(a->val[shift + 8 * j]) < 0.0 && PetscRealPart(a->val[shift + 8 * j]) != 0.0) {
1086: PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)-PetscImaginaryPart(a->val[shift + 8 * j]));
1087: } else if (PetscRealPart(a->val[shift + 8 * j]) != 0.0) {
1088: PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]));
1089: }
1090: #else
1091: if (a->val[shift + 8 * j] != 0.0) PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)a->val[shift + 8 * j]);
1092: #endif
1093: }
1094: PetscViewerASCIIPrintf(viewer, "\n");
1095: }
1096: PetscViewerASCIIUseTabs(viewer, PETSC_TRUE);
1097: } else if (format == PETSC_VIEWER_ASCII_DENSE) {
1098: PetscInt cnt = 0, jcnt;
1099: PetscScalar value;
1100: #if defined(PETSC_USE_COMPLEX)
1101: PetscBool realonly = PETSC_TRUE;
1102: for (i = 0; i < a->sliidx[a->totalslices]; i++) {
1103: if (PetscImaginaryPart(a->val[i]) != 0.0) {
1104: realonly = PETSC_FALSE;
1105: break;
1106: }
1107: }
1108: #endif
1110: PetscViewerASCIIUseTabs(viewer, PETSC_FALSE);
1111: for (i = 0; i < m; i++) {
1112: jcnt = 0;
1113: shift = a->sliidx[i >> 3] + (i & 0x07);
1114: for (j = 0; j < A->cmap->n; j++) {
1115: if (jcnt < a->rlen[i] && j == a->colidx[shift + 8 * j]) {
1116: value = a->val[cnt++];
1117: jcnt++;
1118: } else {
1119: value = 0.0;
1120: }
1121: #if defined(PETSC_USE_COMPLEX)
1122: if (realonly) {
1123: PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)PetscRealPart(value));
1124: } else {
1125: PetscViewerASCIIPrintf(viewer, " %7.5e+%7.5e i ", (double)PetscRealPart(value), (double)PetscImaginaryPart(value));
1126: }
1127: #else
1128: PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)value);
1129: #endif
1130: }
1131: PetscViewerASCIIPrintf(viewer, "\n");
1132: }
1133: PetscViewerASCIIUseTabs(viewer, PETSC_TRUE);
1134: } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) {
1135: PetscInt fshift = 1;
1136: PetscViewerASCIIUseTabs(viewer, PETSC_FALSE);
1137: #if defined(PETSC_USE_COMPLEX)
1138: PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate complex general\n");
1139: #else
1140: PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate real general\n");
1141: #endif
1142: PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT "\n", m, A->cmap->n, a->nz);
1143: for (i = 0; i < m; i++) {
1144: shift = a->sliidx[i >> 3] + (i & 0x07);
1145: for (j = 0; j < a->rlen[i]; j++) {
1146: #if defined(PETSC_USE_COMPLEX)
1147: PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g %g\n", i + fshift, a->colidx[shift + 8 * j] + fshift, (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j]));
1148: #else
1149: PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g\n", i + fshift, a->colidx[shift + 8 * j] + fshift, (double)a->val[shift + 8 * j]);
1150: #endif
1151: }
1152: }
1153: PetscViewerASCIIUseTabs(viewer, PETSC_TRUE);
1154: } else if (format == PETSC_VIEWER_NATIVE) {
1155: for (i = 0; i < a->totalslices; i++) { /* loop over slices */
1156: PetscInt row;
1157: PetscViewerASCIIPrintf(viewer, "slice %" PetscInt_FMT ": %" PetscInt_FMT " %" PetscInt_FMT "\n", i, a->sliidx[i], a->sliidx[i + 1]);
1158: for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) {
1159: #if defined(PETSC_USE_COMPLEX)
1160: if (PetscImaginaryPart(a->val[j]) > 0.0) {
1161: PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g + %g i\n", 8 * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j]));
1162: } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1163: PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g - %g i\n", 8 * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), -(double)PetscImaginaryPart(a->val[j]));
1164: } else {
1165: PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g\n", 8 * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]));
1166: }
1167: #else
1168: PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g\n", 8 * i + row, a->colidx[j], (double)a->val[j]);
1169: #endif
1170: }
1171: }
1172: } else {
1173: PetscViewerASCIIUseTabs(viewer, PETSC_FALSE);
1174: if (A->factortype) {
1175: for (i = 0; i < m; i++) {
1176: shift = a->sliidx[i >> 3] + (i & 0x07);
1177: PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i);
1178: /* L part */
1179: for (j = shift; j < a->diag[i]; j += 8) {
1180: #if defined(PETSC_USE_COMPLEX)
1181: if (PetscImaginaryPart(a->val[shift + 8 * j]) > 0.0) {
1182: PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j]));
1183: } else if (PetscImaginaryPart(a->val[shift + 8 * j]) < 0.0) {
1184: PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j])));
1185: } else {
1186: PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j]));
1187: }
1188: #else
1189: PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j]);
1190: #endif
1191: }
1192: /* diagonal */
1193: j = a->diag[i];
1194: #if defined(PETSC_USE_COMPLEX)
1195: if (PetscImaginaryPart(a->val[j]) > 0.0) {
1196: PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)PetscImaginaryPart(1.0 / a->val[j]));
1197: } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1198: PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)(-PetscImaginaryPart(1.0 / a->val[j])));
1199: } else {
1200: PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]));
1201: }
1202: #else
1203: PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)(1.0 / a->val[j]));
1204: #endif
1206: /* U part */
1207: for (j = a->diag[i] + 1; j < shift + 8 * a->rlen[i]; j += 8) {
1208: #if defined(PETSC_USE_COMPLEX)
1209: if (PetscImaginaryPart(a->val[j]) > 0.0) {
1210: PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j]));
1211: } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1212: PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j])));
1213: } else {
1214: PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j]));
1215: }
1216: #else
1217: PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j]);
1218: #endif
1219: }
1220: PetscViewerASCIIPrintf(viewer, "\n");
1221: }
1222: } else {
1223: for (i = 0; i < m; i++) {
1224: shift = a->sliidx[i >> 3] + (i & 0x07);
1225: PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i);
1226: for (j = 0; j < a->rlen[i]; j++) {
1227: #if defined(PETSC_USE_COMPLEX)
1228: if (PetscImaginaryPart(a->val[j]) > 0.0) {
1229: PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j]));
1230: } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1231: PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)-PetscImaginaryPart(a->val[shift + 8 * j]));
1232: } else {
1233: PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]));
1234: }
1235: #else
1236: PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)a->val[shift + 8 * j]);
1237: #endif
1238: }
1239: PetscViewerASCIIPrintf(viewer, "\n");
1240: }
1241: }
1242: PetscViewerASCIIUseTabs(viewer, PETSC_TRUE);
1243: }
1244: PetscViewerFlush(viewer);
1245: return 0;
1246: }
1248: #include <petscdraw.h>
1249: PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw, void *Aa)
1250: {
1251: Mat A = (Mat)Aa;
1252: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1253: PetscInt i, j, m = A->rmap->n, shift;
1254: int color;
1255: PetscReal xl, yl, xr, yr, x_l, x_r, y_l, y_r;
1256: PetscViewer viewer;
1257: PetscViewerFormat format;
1259: PetscObjectQuery((PetscObject)A, "Zoomviewer", (PetscObject *)&viewer);
1260: PetscViewerGetFormat(viewer, &format);
1261: PetscDrawGetCoordinates(draw, &xl, &yl, &xr, &yr);
1263: /* loop over matrix elements drawing boxes */
1265: if (format != PETSC_VIEWER_DRAW_CONTOUR) {
1266: PetscDrawCollectiveBegin(draw);
1267: /* Blue for negative, Cyan for zero and Red for positive */
1268: color = PETSC_DRAW_BLUE;
1269: for (i = 0; i < m; i++) {
1270: shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
1271: y_l = m - i - 1.0;
1272: y_r = y_l + 1.0;
1273: for (j = 0; j < a->rlen[i]; j++) {
1274: x_l = a->colidx[shift + j * 8];
1275: x_r = x_l + 1.0;
1276: if (PetscRealPart(a->val[shift + 8 * j]) >= 0.) continue;
1277: PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color);
1278: }
1279: }
1280: color = PETSC_DRAW_CYAN;
1281: for (i = 0; i < m; i++) {
1282: shift = a->sliidx[i >> 3] + (i & 0x07);
1283: y_l = m - i - 1.0;
1284: y_r = y_l + 1.0;
1285: for (j = 0; j < a->rlen[i]; j++) {
1286: x_l = a->colidx[shift + j * 8];
1287: x_r = x_l + 1.0;
1288: if (a->val[shift + 8 * j] != 0.) continue;
1289: PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color);
1290: }
1291: }
1292: color = PETSC_DRAW_RED;
1293: for (i = 0; i < m; i++) {
1294: shift = a->sliidx[i >> 3] + (i & 0x07);
1295: y_l = m - i - 1.0;
1296: y_r = y_l + 1.0;
1297: for (j = 0; j < a->rlen[i]; j++) {
1298: x_l = a->colidx[shift + j * 8];
1299: x_r = x_l + 1.0;
1300: if (PetscRealPart(a->val[shift + 8 * j]) <= 0.) continue;
1301: PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color);
1302: }
1303: }
1304: PetscDrawCollectiveEnd(draw);
1305: } else {
1306: /* use contour shading to indicate magnitude of values */
1307: /* first determine max of all nonzero values */
1308: PetscReal minv = 0.0, maxv = 0.0;
1309: PetscInt count = 0;
1310: PetscDraw popup;
1311: for (i = 0; i < a->sliidx[a->totalslices]; i++) {
1312: if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]);
1313: }
1314: if (minv >= maxv) maxv = minv + PETSC_SMALL;
1315: PetscDrawGetPopup(draw, &popup);
1316: PetscDrawScalePopup(popup, minv, maxv);
1318: PetscDrawCollectiveBegin(draw);
1319: for (i = 0; i < m; i++) {
1320: shift = a->sliidx[i >> 3] + (i & 0x07);
1321: y_l = m - i - 1.0;
1322: y_r = y_l + 1.0;
1323: for (j = 0; j < a->rlen[i]; j++) {
1324: x_l = a->colidx[shift + j * 8];
1325: x_r = x_l + 1.0;
1326: color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]), minv, maxv);
1327: PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color);
1328: count++;
1329: }
1330: }
1331: PetscDrawCollectiveEnd(draw);
1332: }
1333: return 0;
1334: }
1336: #include <petscdraw.h>
1337: PetscErrorCode MatView_SeqSELL_Draw(Mat A, PetscViewer viewer)
1338: {
1339: PetscDraw draw;
1340: PetscReal xr, yr, xl, yl, h, w;
1341: PetscBool isnull;
1343: PetscViewerDrawGetDraw(viewer, 0, &draw);
1344: PetscDrawIsNull(draw, &isnull);
1345: if (isnull) return 0;
1347: xr = A->cmap->n;
1348: yr = A->rmap->n;
1349: h = yr / 10.0;
1350: w = xr / 10.0;
1351: xr += w;
1352: yr += h;
1353: xl = -w;
1354: yl = -h;
1355: PetscDrawSetCoordinates(draw, xl, yl, xr, yr);
1356: PetscObjectCompose((PetscObject)A, "Zoomviewer", (PetscObject)viewer);
1357: PetscDrawZoom(draw, MatView_SeqSELL_Draw_Zoom, A);
1358: PetscObjectCompose((PetscObject)A, "Zoomviewer", NULL);
1359: PetscDrawSave(draw);
1360: return 0;
1361: }
1363: PetscErrorCode MatView_SeqSELL(Mat A, PetscViewer viewer)
1364: {
1365: PetscBool iascii, isbinary, isdraw;
1367: PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii);
1368: PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary);
1369: PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw);
1370: if (iascii) {
1371: MatView_SeqSELL_ASCII(A, viewer);
1372: } else if (isbinary) {
1373: /* MatView_SeqSELL_Binary(A,viewer); */
1374: } else if (isdraw) MatView_SeqSELL_Draw(A, viewer);
1375: return 0;
1376: }
1378: PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A, MatAssemblyType mode)
1379: {
1380: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1381: PetscInt i, shift, row_in_slice, row, nrow, *cp, lastcol, j, k;
1382: MatScalar *vp;
1384: if (mode == MAT_FLUSH_ASSEMBLY) return 0;
1385: /* To do: compress out the unused elements */
1386: MatMarkDiagonal_SeqSELL(A);
1387: PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: %" PetscInt_FMT " allocated %" PetscInt_FMT " used (%" PetscInt_FMT " nonzeros+%" PetscInt_FMT " paddedzeros)\n", A->rmap->n, A->cmap->n, a->maxallocmat, a->sliidx[a->totalslices], a->nz, a->sliidx[a->totalslices] - a->nz);
1388: PetscInfo(A, "Number of mallocs during MatSetValues() is %" PetscInt_FMT "\n", a->reallocs);
1389: PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rlenmax);
1390: /* Set unused slots for column indices to last valid column index. Set unused slots for values to zero. This allows for a use of unmasked intrinsics -> higher performance */
1391: for (i = 0; i < a->totalslices; ++i) {
1392: shift = a->sliidx[i]; /* starting index of the slice */
1393: cp = a->colidx + shift; /* pointer to the column indices of the slice */
1394: vp = a->val + shift; /* pointer to the nonzero values of the slice */
1395: for (row_in_slice = 0; row_in_slice < 8; ++row_in_slice) { /* loop over rows in the slice */
1396: row = 8 * i + row_in_slice;
1397: nrow = a->rlen[row]; /* number of nonzeros in row */
1398: /*
1399: Search for the nearest nonzero. Normally setting the index to zero may cause extra communication.
1400: But if the entire slice are empty, it is fine to use 0 since the index will not be loaded.
1401: */
1402: lastcol = 0;
1403: if (nrow > 0) { /* nonempty row */
1404: lastcol = cp[8 * (nrow - 1) + row_in_slice]; /* use the index from the last nonzero at current row */
1405: } else if (!row_in_slice) { /* first row of the currect slice is empty */
1406: for (j = 1; j < 8; j++) {
1407: if (a->rlen[8 * i + j]) {
1408: lastcol = cp[j];
1409: break;
1410: }
1411: }
1412: } else {
1413: if (a->sliidx[i + 1] != shift) lastcol = cp[row_in_slice - 1]; /* use the index from the previous row */
1414: }
1416: for (k = nrow; k < (a->sliidx[i + 1] - shift) / 8; ++k) {
1417: cp[8 * k + row_in_slice] = lastcol;
1418: vp[8 * k + row_in_slice] = (MatScalar)0;
1419: }
1420: }
1421: }
1423: A->info.mallocs += a->reallocs;
1424: a->reallocs = 0;
1426: MatSeqSELLInvalidateDiagonal(A);
1427: return 0;
1428: }
1430: PetscErrorCode MatGetInfo_SeqSELL(Mat A, MatInfoType flag, MatInfo *info)
1431: {
1432: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1434: info->block_size = 1.0;
1435: info->nz_allocated = a->maxallocmat;
1436: info->nz_used = a->sliidx[a->totalslices]; /* include padding zeros */
1437: info->nz_unneeded = (a->maxallocmat - a->sliidx[a->totalslices]);
1438: info->assemblies = A->num_ass;
1439: info->mallocs = A->info.mallocs;
1440: info->memory = 0; /* REVIEW ME */
1441: if (A->factortype) {
1442: info->fill_ratio_given = A->info.fill_ratio_given;
1443: info->fill_ratio_needed = A->info.fill_ratio_needed;
1444: info->factor_mallocs = A->info.factor_mallocs;
1445: } else {
1446: info->fill_ratio_given = 0;
1447: info->fill_ratio_needed = 0;
1448: info->factor_mallocs = 0;
1449: }
1450: return 0;
1451: }
1453: PetscErrorCode MatSetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], const PetscScalar v[], InsertMode is)
1454: {
1455: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1456: PetscInt shift, i, k, l, low, high, t, ii, row, col, nrow;
1457: PetscInt *cp, nonew = a->nonew, lastcol = -1;
1458: MatScalar *vp, value;
1460: for (k = 0; k < m; k++) { /* loop over added rows */
1461: row = im[k];
1462: if (row < 0) continue;
1464: shift = a->sliidx[row >> 3] + (row & 0x07); /* starting index of the row */
1465: cp = a->colidx + shift; /* pointer to the row */
1466: vp = a->val + shift; /* pointer to the row */
1467: nrow = a->rlen[row];
1468: low = 0;
1469: high = nrow;
1471: for (l = 0; l < n; l++) { /* loop over added columns */
1472: col = in[l];
1473: if (col < 0) continue;
1475: if (a->roworiented) {
1476: value = v[l + k * n];
1477: } else {
1478: value = v[k + l * m];
1479: }
1480: if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue;
1482: /* search in this row for the specified column, i indicates the column to be set */
1483: if (col <= lastcol) low = 0;
1484: else high = nrow;
1485: lastcol = col;
1486: while (high - low > 5) {
1487: t = (low + high) / 2;
1488: if (*(cp + t * 8) > col) high = t;
1489: else low = t;
1490: }
1491: for (i = low; i < high; i++) {
1492: if (*(cp + i * 8) > col) break;
1493: if (*(cp + i * 8) == col) {
1494: if (is == ADD_VALUES) *(vp + i * 8) += value;
1495: else *(vp + i * 8) = value;
1496: low = i + 1;
1497: goto noinsert;
1498: }
1499: }
1500: if (value == 0.0 && a->ignorezeroentries) goto noinsert;
1501: if (nonew == 1) goto noinsert;
1503: /* If the current row length exceeds the slice width (e.g. nrow==slice_width), allocate a new space, otherwise do nothing */
1504: MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, row / 8, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar);
1505: /* add the new nonzero to the high position, shift the remaining elements in current row to the right by one slot */
1506: for (ii = nrow - 1; ii >= i; ii--) {
1507: *(cp + (ii + 1) * 8) = *(cp + ii * 8);
1508: *(vp + (ii + 1) * 8) = *(vp + ii * 8);
1509: }
1510: a->rlen[row]++;
1511: *(cp + i * 8) = col;
1512: *(vp + i * 8) = value;
1513: a->nz++;
1514: A->nonzerostate++;
1515: low = i + 1;
1516: high++;
1517: nrow++;
1518: noinsert:;
1519: }
1520: a->rlen[row] = nrow;
1521: }
1522: return 0;
1523: }
1525: PetscErrorCode MatCopy_SeqSELL(Mat A, Mat B, MatStructure str)
1526: {
1527: /* If the two matrices have the same copy implementation, use fast copy. */
1528: if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) {
1529: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1530: Mat_SeqSELL *b = (Mat_SeqSELL *)B->data;
1533: PetscArraycpy(b->val, a->val, a->sliidx[a->totalslices]);
1534: } else {
1535: MatCopy_Basic(A, B, str);
1536: }
1537: return 0;
1538: }
1540: PetscErrorCode MatSetUp_SeqSELL(Mat A)
1541: {
1542: MatSeqSELLSetPreallocation(A, PETSC_DEFAULT, NULL);
1543: return 0;
1544: }
1546: PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A, PetscScalar *array[])
1547: {
1548: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1550: *array = a->val;
1551: return 0;
1552: }
1554: PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A, PetscScalar *array[])
1555: {
1556: return 0;
1557: }
1559: PetscErrorCode MatRealPart_SeqSELL(Mat A)
1560: {
1561: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1562: PetscInt i;
1563: MatScalar *aval = a->val;
1565: for (i = 0; i < a->sliidx[a->totalslices]; i++) aval[i] = PetscRealPart(aval[i]);
1566: return 0;
1567: }
1569: PetscErrorCode MatImaginaryPart_SeqSELL(Mat A)
1570: {
1571: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1572: PetscInt i;
1573: MatScalar *aval = a->val;
1575: for (i = 0; i < a->sliidx[a->totalslices]; i++) aval[i] = PetscImaginaryPart(aval[i]);
1576: MatSeqSELLInvalidateDiagonal(A);
1577: return 0;
1578: }
1580: PetscErrorCode MatScale_SeqSELL(Mat inA, PetscScalar alpha)
1581: {
1582: Mat_SeqSELL *a = (Mat_SeqSELL *)inA->data;
1583: MatScalar *aval = a->val;
1584: PetscScalar oalpha = alpha;
1585: PetscBLASInt one = 1, size;
1587: PetscBLASIntCast(a->sliidx[a->totalslices], &size);
1588: PetscCallBLAS("BLASscal", BLASscal_(&size, &oalpha, aval, &one));
1589: PetscLogFlops(a->nz);
1590: MatSeqSELLInvalidateDiagonal(inA);
1591: return 0;
1592: }
1594: PetscErrorCode MatShift_SeqSELL(Mat Y, PetscScalar a)
1595: {
1596: Mat_SeqSELL *y = (Mat_SeqSELL *)Y->data;
1598: if (!Y->preallocated || !y->nz) MatSeqSELLSetPreallocation(Y, 1, NULL);
1599: MatShift_Basic(Y, a);
1600: return 0;
1601: }
1603: PetscErrorCode MatSOR_SeqSELL(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx)
1604: {
1605: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1606: PetscScalar *x, sum, *t;
1607: const MatScalar *idiag = NULL, *mdiag;
1608: const PetscScalar *b, *xb;
1609: PetscInt n, m = A->rmap->n, i, j, shift;
1610: const PetscInt *diag;
1612: its = its * lits;
1614: if (fshift != a->fshift || omega != a->omega) a->idiagvalid = PETSC_FALSE; /* must recompute idiag[] */
1615: if (!a->idiagvalid) MatInvertDiagonal_SeqSELL(A, omega, fshift);
1616: a->fshift = fshift;
1617: a->omega = omega;
1619: diag = a->diag;
1620: t = a->ssor_work;
1621: idiag = a->idiag;
1622: mdiag = a->mdiag;
1624: VecGetArray(xx, &x);
1625: VecGetArrayRead(bb, &b);
1626: /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
1631: if (flag & SOR_ZERO_INITIAL_GUESS) {
1632: if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1633: for (i = 0; i < m; i++) {
1634: shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
1635: sum = b[i];
1636: n = (diag[i] - shift) / 8;
1637: for (j = 0; j < n; j++) sum -= a->val[shift + j * 8] * x[a->colidx[shift + j * 8]];
1638: t[i] = sum;
1639: x[i] = sum * idiag[i];
1640: }
1641: xb = t;
1642: PetscLogFlops(a->nz);
1643: } else xb = b;
1644: if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1645: for (i = m - 1; i >= 0; i--) {
1646: shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
1647: sum = xb[i];
1648: n = a->rlen[i] - (diag[i] - shift) / 8 - 1;
1649: for (j = 1; j <= n; j++) sum -= a->val[diag[i] + j * 8] * x[a->colidx[diag[i] + j * 8]];
1650: if (xb == b) {
1651: x[i] = sum * idiag[i];
1652: } else {
1653: x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1654: }
1655: }
1656: PetscLogFlops(a->nz); /* assumes 1/2 in upper */
1657: }
1658: its--;
1659: }
1660: while (its--) {
1661: if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1662: for (i = 0; i < m; i++) {
1663: /* lower */
1664: shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
1665: sum = b[i];
1666: n = (diag[i] - shift) / 8;
1667: for (j = 0; j < n; j++) sum -= a->val[shift + j * 8] * x[a->colidx[shift + j * 8]];
1668: t[i] = sum; /* save application of the lower-triangular part */
1669: /* upper */
1670: n = a->rlen[i] - (diag[i] - shift) / 8 - 1;
1671: for (j = 1; j <= n; j++) sum -= a->val[diag[i] + j * 8] * x[a->colidx[diag[i] + j * 8]];
1672: x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1673: }
1674: xb = t;
1675: PetscLogFlops(2.0 * a->nz);
1676: } else xb = b;
1677: if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1678: for (i = m - 1; i >= 0; i--) {
1679: shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
1680: sum = xb[i];
1681: if (xb == b) {
1682: /* whole matrix (no checkpointing available) */
1683: n = a->rlen[i];
1684: for (j = 0; j < n; j++) sum -= a->val[shift + j * 8] * x[a->colidx[shift + j * 8]];
1685: x[i] = (1. - omega) * x[i] + (sum + mdiag[i] * x[i]) * idiag[i];
1686: } else { /* lower-triangular part has been saved, so only apply upper-triangular */
1687: n = a->rlen[i] - (diag[i] - shift) / 8 - 1;
1688: for (j = 1; j <= n; j++) sum -= a->val[diag[i] + j * 8] * x[a->colidx[diag[i] + j * 8]];
1689: x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1690: }
1691: }
1692: if (xb == b) {
1693: PetscLogFlops(2.0 * a->nz);
1694: } else {
1695: PetscLogFlops(a->nz); /* assumes 1/2 in upper */
1696: }
1697: }
1698: }
1699: VecRestoreArray(xx, &x);
1700: VecRestoreArrayRead(bb, &b);
1701: return 0;
1702: }
1704: /* -------------------------------------------------------------------*/
1705: static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL,
1706: MatGetRow_SeqSELL,
1707: MatRestoreRow_SeqSELL,
1708: MatMult_SeqSELL,
1709: /* 4*/ MatMultAdd_SeqSELL,
1710: MatMultTranspose_SeqSELL,
1711: MatMultTransposeAdd_SeqSELL,
1712: NULL,
1713: NULL,
1714: NULL,
1715: /* 10*/ NULL,
1716: NULL,
1717: NULL,
1718: MatSOR_SeqSELL,
1719: NULL,
1720: /* 15*/ MatGetInfo_SeqSELL,
1721: MatEqual_SeqSELL,
1722: MatGetDiagonal_SeqSELL,
1723: MatDiagonalScale_SeqSELL,
1724: NULL,
1725: /* 20*/ NULL,
1726: MatAssemblyEnd_SeqSELL,
1727: MatSetOption_SeqSELL,
1728: MatZeroEntries_SeqSELL,
1729: /* 24*/ NULL,
1730: NULL,
1731: NULL,
1732: NULL,
1733: NULL,
1734: /* 29*/ MatSetUp_SeqSELL,
1735: NULL,
1736: NULL,
1737: NULL,
1738: NULL,
1739: /* 34*/ MatDuplicate_SeqSELL,
1740: NULL,
1741: NULL,
1742: NULL,
1743: NULL,
1744: /* 39*/ NULL,
1745: NULL,
1746: NULL,
1747: MatGetValues_SeqSELL,
1748: MatCopy_SeqSELL,
1749: /* 44*/ NULL,
1750: MatScale_SeqSELL,
1751: MatShift_SeqSELL,
1752: NULL,
1753: NULL,
1754: /* 49*/ NULL,
1755: NULL,
1756: NULL,
1757: NULL,
1758: NULL,
1759: /* 54*/ MatFDColoringCreate_SeqXAIJ,
1760: NULL,
1761: NULL,
1762: NULL,
1763: NULL,
1764: /* 59*/ NULL,
1765: MatDestroy_SeqSELL,
1766: MatView_SeqSELL,
1767: NULL,
1768: NULL,
1769: /* 64*/ NULL,
1770: NULL,
1771: NULL,
1772: NULL,
1773: NULL,
1774: /* 69*/ NULL,
1775: NULL,
1776: NULL,
1777: NULL,
1778: NULL,
1779: /* 74*/ NULL,
1780: MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */
1781: NULL,
1782: NULL,
1783: NULL,
1784: /* 79*/ NULL,
1785: NULL,
1786: NULL,
1787: NULL,
1788: NULL,
1789: /* 84*/ NULL,
1790: NULL,
1791: NULL,
1792: NULL,
1793: NULL,
1794: /* 89*/ NULL,
1795: NULL,
1796: NULL,
1797: NULL,
1798: NULL,
1799: /* 94*/ NULL,
1800: NULL,
1801: NULL,
1802: NULL,
1803: NULL,
1804: /* 99*/ NULL,
1805: NULL,
1806: NULL,
1807: MatConjugate_SeqSELL,
1808: NULL,
1809: /*104*/ NULL,
1810: NULL,
1811: NULL,
1812: NULL,
1813: NULL,
1814: /*109*/ NULL,
1815: NULL,
1816: NULL,
1817: NULL,
1818: MatMissingDiagonal_SeqSELL,
1819: /*114*/ NULL,
1820: NULL,
1821: NULL,
1822: NULL,
1823: NULL,
1824: /*119*/ NULL,
1825: NULL,
1826: NULL,
1827: NULL,
1828: NULL,
1829: /*124*/ NULL,
1830: NULL,
1831: NULL,
1832: NULL,
1833: NULL,
1834: /*129*/ NULL,
1835: NULL,
1836: NULL,
1837: NULL,
1838: NULL,
1839: /*134*/ NULL,
1840: NULL,
1841: NULL,
1842: NULL,
1843: NULL,
1844: /*139*/ NULL,
1845: NULL,
1846: NULL,
1847: MatFDColoringSetUp_SeqXAIJ,
1848: NULL,
1849: /*144*/ NULL,
1850: NULL,
1851: NULL,
1852: NULL,
1853: NULL,
1854: NULL,
1855: /*150*/ NULL};
1857: PetscErrorCode MatStoreValues_SeqSELL(Mat mat)
1858: {
1859: Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1863: /* allocate space for values if not already there */
1864: if (!a->saved_values) { PetscMalloc1(a->sliidx[a->totalslices] + 1, &a->saved_values); }
1866: /* copy values over */
1867: PetscArraycpy(a->saved_values, a->val, a->sliidx[a->totalslices]);
1868: return 0;
1869: }
1871: PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat)
1872: {
1873: Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1877: PetscArraycpy(a->val, a->saved_values, a->sliidx[a->totalslices]);
1878: return 0;
1879: }
1881: /*@C
1882: MatSeqSELLRestoreArray - returns access to the array where the data for a `MATSEQSELL` matrix is stored obtained by `MatSeqSELLGetArray()`
1884: Not Collective
1886: Input Parameters:
1887: . mat - a `MATSEQSELL` matrix
1888: . array - pointer to the data
1890: Level: intermediate
1892: .seealso: `MATSEQSELL`, `MatSeqSELLGetArray()`, `MatSeqSELLRestoreArrayF90()`
1893: @*/
1894: PetscErrorCode MatSeqSELLRestoreArray(Mat A, PetscScalar **array)
1895: {
1896: PetscUseMethod(A, "MatSeqSELLRestoreArray_C", (Mat, PetscScalar **), (A, array));
1897: return 0;
1898: }
1900: PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B)
1901: {
1902: Mat_SeqSELL *b;
1903: PetscMPIInt size;
1905: PetscCitationsRegister(citation, &cited);
1906: MPI_Comm_size(PetscObjectComm((PetscObject)B), &size);
1909: PetscNew(&b);
1911: B->data = (void *)b;
1913: PetscMemcpy(B->ops, &MatOps_Values, sizeof(struct _MatOps));
1915: b->row = NULL;
1916: b->col = NULL;
1917: b->icol = NULL;
1918: b->reallocs = 0;
1919: b->ignorezeroentries = PETSC_FALSE;
1920: b->roworiented = PETSC_TRUE;
1921: b->nonew = 0;
1922: b->diag = NULL;
1923: b->solve_work = NULL;
1924: B->spptr = NULL;
1925: b->saved_values = NULL;
1926: b->idiag = NULL;
1927: b->mdiag = NULL;
1928: b->ssor_work = NULL;
1929: b->omega = 1.0;
1930: b->fshift = 0.0;
1931: b->idiagvalid = PETSC_FALSE;
1932: b->keepnonzeropattern = PETSC_FALSE;
1934: PetscObjectChangeTypeName((PetscObject)B, MATSEQSELL);
1935: PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetArray_C", MatSeqSELLGetArray_SeqSELL);
1936: PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLRestoreArray_C", MatSeqSELLRestoreArray_SeqSELL);
1937: PetscObjectComposeFunction((PetscObject)B, "MatStoreValues_C", MatStoreValues_SeqSELL);
1938: PetscObjectComposeFunction((PetscObject)B, "MatRetrieveValues_C", MatRetrieveValues_SeqSELL);
1939: PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetPreallocation_C", MatSeqSELLSetPreallocation_SeqSELL);
1940: PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqaij_C", MatConvert_SeqSELL_SeqAIJ);
1941: return 0;
1942: }
1944: /*
1945: Given a matrix generated with MatGetFactor() duplicates all the information in A into B
1946: */
1947: PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C, Mat A, MatDuplicateOption cpvalues, PetscBool mallocmatspace)
1948: {
1949: Mat_SeqSELL *c = (Mat_SeqSELL *)C->data, *a = (Mat_SeqSELL *)A->data;
1950: PetscInt i, m = A->rmap->n;
1951: PetscInt totalslices = a->totalslices;
1953: C->factortype = A->factortype;
1954: c->row = NULL;
1955: c->col = NULL;
1956: c->icol = NULL;
1957: c->reallocs = 0;
1958: C->assembled = PETSC_TRUE;
1960: PetscLayoutReference(A->rmap, &C->rmap);
1961: PetscLayoutReference(A->cmap, &C->cmap);
1963: PetscMalloc1(8 * totalslices, &c->rlen);
1964: PetscMalloc1(totalslices + 1, &c->sliidx);
1966: for (i = 0; i < m; i++) c->rlen[i] = a->rlen[i];
1967: for (i = 0; i < totalslices + 1; i++) c->sliidx[i] = a->sliidx[i];
1969: /* allocate the matrix space */
1970: if (mallocmatspace) {
1971: PetscMalloc2(a->maxallocmat, &c->val, a->maxallocmat, &c->colidx);
1973: c->singlemalloc = PETSC_TRUE;
1975: if (m > 0) {
1976: PetscArraycpy(c->colidx, a->colidx, a->maxallocmat);
1977: if (cpvalues == MAT_COPY_VALUES) {
1978: PetscArraycpy(c->val, a->val, a->maxallocmat);
1979: } else {
1980: PetscArrayzero(c->val, a->maxallocmat);
1981: }
1982: }
1983: }
1985: c->ignorezeroentries = a->ignorezeroentries;
1986: c->roworiented = a->roworiented;
1987: c->nonew = a->nonew;
1988: if (a->diag) {
1989: PetscMalloc1(m, &c->diag);
1990: for (i = 0; i < m; i++) c->diag[i] = a->diag[i];
1991: } else c->diag = NULL;
1993: c->solve_work = NULL;
1994: c->saved_values = NULL;
1995: c->idiag = NULL;
1996: c->ssor_work = NULL;
1997: c->keepnonzeropattern = a->keepnonzeropattern;
1998: c->free_val = PETSC_TRUE;
1999: c->free_colidx = PETSC_TRUE;
2001: c->maxallocmat = a->maxallocmat;
2002: c->maxallocrow = a->maxallocrow;
2003: c->rlenmax = a->rlenmax;
2004: c->nz = a->nz;
2005: C->preallocated = PETSC_TRUE;
2007: c->nonzerorowcnt = a->nonzerorowcnt;
2008: C->nonzerostate = A->nonzerostate;
2010: PetscFunctionListDuplicate(((PetscObject)A)->qlist, &((PetscObject)C)->qlist);
2011: return 0;
2012: }
2014: PetscErrorCode MatDuplicate_SeqSELL(Mat A, MatDuplicateOption cpvalues, Mat *B)
2015: {
2016: MatCreate(PetscObjectComm((PetscObject)A), B);
2017: MatSetSizes(*B, A->rmap->n, A->cmap->n, A->rmap->n, A->cmap->n);
2018: if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) MatSetBlockSizesFromMats(*B, A, A);
2019: MatSetType(*B, ((PetscObject)A)->type_name);
2020: MatDuplicateNoCreate_SeqSELL(*B, A, cpvalues, PETSC_TRUE);
2021: return 0;
2022: }
2024: /*MC
2025: MATSEQSELL - MATSEQSELL = "seqsell" - A matrix type to be used for sequential sparse matrices,
2026: based on the sliced Ellpack format
2028: Options Database Keys:
2029: . -mat_type seqsell - sets the matrix type to "`MATSEQELL` during a call to `MatSetFromOptions()`
2031: Level: beginner
2033: .seealso: `MatCreateSeqSell()`, `MATSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATAIJ`, `MATMPIAIJ`
2034: M*/
2036: /*MC
2037: MATSELL - MATSELL = "sell" - A matrix type to be used for sparse matrices.
2039: This matrix type is identical to `MATSEQSELL` when constructed with a single process communicator,
2040: and `MATMPISELL` otherwise. As a result, for single process communicators,
2041: `MatSeqSELLSetPreallocation()` is supported, and similarly `MatMPISELLSetPreallocation()` is supported
2042: for communicators controlling multiple processes. It is recommended that you call both of
2043: the above preallocation routines for simplicity.
2045: Options Database Keys:
2046: . -mat_type sell - sets the matrix type to "sell" during a call to MatSetFromOptions()
2048: Level: beginner
2050: Notes:
2051: This format is only supported for real scalars, double precision, and 32 bit indices (the defaults).
2053: It can provide better performance on Intel and AMD processes with AVX2 or AVX512 support for matrices that have a similar number of
2054: non-zeros in contiguous groups of rows. However if the computation is memory bandwidth limited it may not provide much improvement.
2056: Developer Notes:
2057: On Intel (and AMD) systems some of the matrix operations use SIMD (AVX) instructions to achieve higher performance.
2059: The sparse matrix format is as follows. For simplicity we assume a slice size of 2, it is actually 8
2060: .vb
2061: (2 0 3 4)
2062: Consider the matrix A = (5 0 6 0)
2063: (0 0 7 8)
2064: (0 0 9 9)
2066: symbolically the Ellpack format can be written as
2068: (2 3 4 |) (0 2 3 |)
2069: v = (5 6 0 |) colidx = (0 2 2 |)
2070: -------- ---------
2071: (7 8 |) (2 3 |)
2072: (9 9 |) (2 3 |)
2074: The data for 2 contiguous rows of the matrix are stored together (in column-major format) (with any left-over rows handled as a special case).
2075: Any of the rows in a slice fewer columns than the rest of the slice (row 1 above) are padded with a previous valid column in their "extra" colidx[] locations and
2076: zeros in their "extra" v locations so that the matrix operations do not need special code to handle different length rows within the 2 rows in a slice.
2078: The one-dimensional representation of v used in the code is (2 5 3 6 4 0 7 9 8 9) and for colidx is (0 0 2 2 3 2 2 2 3 3)
2080: .ve
2082: See MatMult_SeqSELL() for how this format is used with the SIMD operations to achieve high performance.
2084: References:
2085: . * - Hong Zhang, Richard T. Mills, Karl Rupp, and Barry F. Smith, Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512},
2086: Proceedings of the 47th International Conference on Parallel Processing, 2018.
2088: .seealso: `MatCreateSeqSELL()`, `MatCreateSeqAIJ()`, `MatCreateSell()`, `MATSEQSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATMPIAIJ`, `MATAIJ`
2089: M*/
2091: /*@C
2092: MatCreateSeqSELL - Creates a sparse matrix in `MATSEQSELL` format.
2094: Collective on comm
2096: Input Parameters:
2097: + comm - MPI communicator, set to `PETSC_COMM_SELF`
2098: . m - number of rows
2099: . n - number of columns
2100: . rlenmax - maximum number of nonzeros in a row
2101: - rlen - array containing the number of nonzeros in the various rows
2102: (possibly different for each row) or NULL
2104: Output Parameter:
2105: . A - the matrix
2107: It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
2108: MatXXXXSetPreallocation() paradigm instead of this routine directly.
2109: [MatXXXXSetPreallocation() is, for example, `MatSeqSELLSetPreallocation()`]
2111: Notes:
2112: If nnz is given then nz is ignored
2114: Specify the preallocated storage with either rlenmax or rlen (not both).
2115: Set rlenmax = `PETSC_DEFAULT` and rlen = NULL for PETSc to control dynamic memory
2116: allocation. For large problems you MUST preallocate memory or you
2117: will get TERRIBLE performance, see the users' manual chapter on matrices.
2119: Level: intermediate
2121: .seealso: `MATSEQSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatSeqSELLSetPreallocation()`, `MATSELL`, `MATSEQSELL`, `MATMPISELL`
2122: @*/
2123: PetscErrorCode MatCreateSeqSELL(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt maxallocrow, const PetscInt rlen[], Mat *A)
2124: {
2125: MatCreate(comm, A);
2126: MatSetSizes(*A, m, n, m, n);
2127: MatSetType(*A, MATSEQSELL);
2128: MatSeqSELLSetPreallocation_SeqSELL(*A, maxallocrow, rlen);
2129: return 0;
2130: }
2132: PetscErrorCode MatEqual_SeqSELL(Mat A, Mat B, PetscBool *flg)
2133: {
2134: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data, *b = (Mat_SeqSELL *)B->data;
2135: PetscInt totalslices = a->totalslices;
2137: /* If the matrix dimensions are not equal,or no of nonzeros */
2138: if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) || (a->nz != b->nz) || (a->rlenmax != b->rlenmax)) {
2139: *flg = PETSC_FALSE;
2140: return 0;
2141: }
2142: /* if the a->colidx are the same */
2143: PetscArraycmp(a->colidx, b->colidx, a->sliidx[totalslices], flg);
2144: if (!*flg) return 0;
2145: /* if a->val are the same */
2146: PetscArraycmp(a->val, b->val, a->sliidx[totalslices], flg);
2147: return 0;
2148: }
2150: PetscErrorCode MatSeqSELLInvalidateDiagonal(Mat A)
2151: {
2152: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
2154: a->idiagvalid = PETSC_FALSE;
2155: return 0;
2156: }
2158: PetscErrorCode MatConjugate_SeqSELL(Mat A)
2159: {
2160: #if defined(PETSC_USE_COMPLEX)
2161: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
2162: PetscInt i;
2163: PetscScalar *val = a->val;
2165: for (i = 0; i < a->sliidx[a->totalslices]; i++) val[i] = PetscConj(val[i]);
2166: #else
2167: #endif
2168: return 0;
2169: }