Actual source code: baij.c
2: /*
3: Defines the basic matrix operations for the BAIJ (compressed row)
4: matrix storage format.
5: */
6: #include <../src/mat/impls/baij/seq/baij.h>
7: #include <petscblaslapack.h>
8: #include <petsc/private/kernels/blockinvert.h>
9: #include <petsc/private/kernels/blockmatmult.h>
11: #if defined(PETSC_HAVE_HYPRE)
12: PETSC_INTERN PetscErrorCode MatConvert_AIJ_HYPRE(Mat, MatType, MatReuse, Mat *);
13: #endif
15: #if defined(PETSC_HAVE_MKL_SPARSE_OPTIMIZE)
16: PETSC_INTERN PetscErrorCode MatConvert_SeqBAIJ_SeqBAIJMKL(Mat, MatType, MatReuse, Mat *);
17: #endif
18: PETSC_INTERN PetscErrorCode MatConvert_XAIJ_IS(Mat, MatType, MatReuse, Mat *);
20: PetscErrorCode MatGetColumnReductions_SeqBAIJ(Mat A, PetscInt type, PetscReal *reductions)
21: {
22: Mat_SeqBAIJ *a_aij = (Mat_SeqBAIJ *)A->data;
23: PetscInt m, n, i;
24: PetscInt ib, jb, bs = A->rmap->bs;
25: MatScalar *a_val = a_aij->a;
27: MatGetSize(A, &m, &n);
28: for (i = 0; i < n; i++) reductions[i] = 0.0;
29: if (type == NORM_2) {
30: for (i = a_aij->i[0]; i < a_aij->i[A->rmap->n / bs]; i++) {
31: for (jb = 0; jb < bs; jb++) {
32: for (ib = 0; ib < bs; ib++) {
33: reductions[A->cmap->rstart + a_aij->j[i] * bs + jb] += PetscAbsScalar(*a_val * *a_val);
34: a_val++;
35: }
36: }
37: }
38: } else if (type == NORM_1) {
39: for (i = a_aij->i[0]; i < a_aij->i[A->rmap->n / bs]; i++) {
40: for (jb = 0; jb < bs; jb++) {
41: for (ib = 0; ib < bs; ib++) {
42: reductions[A->cmap->rstart + a_aij->j[i] * bs + jb] += PetscAbsScalar(*a_val);
43: a_val++;
44: }
45: }
46: }
47: } else if (type == NORM_INFINITY) {
48: for (i = a_aij->i[0]; i < a_aij->i[A->rmap->n / bs]; i++) {
49: for (jb = 0; jb < bs; jb++) {
50: for (ib = 0; ib < bs; ib++) {
51: int col = A->cmap->rstart + a_aij->j[i] * bs + jb;
52: reductions[col] = PetscMax(PetscAbsScalar(*a_val), reductions[col]);
53: a_val++;
54: }
55: }
56: }
57: } else if (type == REDUCTION_SUM_REALPART || type == REDUCTION_MEAN_REALPART) {
58: for (i = a_aij->i[0]; i < a_aij->i[A->rmap->n / bs]; i++) {
59: for (jb = 0; jb < bs; jb++) {
60: for (ib = 0; ib < bs; ib++) {
61: reductions[A->cmap->rstart + a_aij->j[i] * bs + jb] += PetscRealPart(*a_val);
62: a_val++;
63: }
64: }
65: }
66: } else if (type == REDUCTION_SUM_IMAGINARYPART || type == REDUCTION_MEAN_IMAGINARYPART) {
67: for (i = a_aij->i[0]; i < a_aij->i[A->rmap->n / bs]; i++) {
68: for (jb = 0; jb < bs; jb++) {
69: for (ib = 0; ib < bs; ib++) {
70: reductions[A->cmap->rstart + a_aij->j[i] * bs + jb] += PetscImaginaryPart(*a_val);
71: a_val++;
72: }
73: }
74: }
75: } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Unknown reduction type");
76: if (type == NORM_2) {
77: for (i = 0; i < n; i++) reductions[i] = PetscSqrtReal(reductions[i]);
78: } else if (type == REDUCTION_MEAN_REALPART || type == REDUCTION_MEAN_IMAGINARYPART) {
79: for (i = 0; i < n; i++) reductions[i] /= m;
80: }
81: return 0;
82: }
84: PetscErrorCode MatInvertBlockDiagonal_SeqBAIJ(Mat A, const PetscScalar **values)
85: {
86: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
87: PetscInt *diag_offset, i, bs = A->rmap->bs, mbs = a->mbs, ipvt[5], bs2 = bs * bs, *v_pivots;
88: MatScalar *v = a->a, *odiag, *diag, work[25], *v_work;
89: PetscReal shift = 0.0;
90: PetscBool allowzeropivot, zeropivotdetected = PETSC_FALSE;
92: allowzeropivot = PetscNot(A->erroriffailure);
94: if (a->idiagvalid) {
95: if (values) *values = a->idiag;
96: return 0;
97: }
98: MatMarkDiagonal_SeqBAIJ(A);
99: diag_offset = a->diag;
100: if (!a->idiag) { PetscMalloc1(bs2 * mbs, &a->idiag); }
101: diag = a->idiag;
102: if (values) *values = a->idiag;
103: /* factor and invert each block */
104: switch (bs) {
105: case 1:
106: for (i = 0; i < mbs; i++) {
107: odiag = v + 1 * diag_offset[i];
108: diag[0] = odiag[0];
110: if (PetscAbsScalar(diag[0] + shift) < PETSC_MACHINE_EPSILON) {
111: if (allowzeropivot) {
112: A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
113: A->factorerror_zeropivot_value = PetscAbsScalar(diag[0]);
114: A->factorerror_zeropivot_row = i;
115: PetscInfo(A, "Zero pivot, row %" PetscInt_FMT "\n", i);
116: } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot, row %" PetscInt_FMT " pivot value %g tolerance %g", i, (double)PetscAbsScalar(diag[0]), (double)PETSC_MACHINE_EPSILON);
117: }
119: diag[0] = (PetscScalar)1.0 / (diag[0] + shift);
120: diag += 1;
121: }
122: break;
123: case 2:
124: for (i = 0; i < mbs; i++) {
125: odiag = v + 4 * diag_offset[i];
126: diag[0] = odiag[0];
127: diag[1] = odiag[1];
128: diag[2] = odiag[2];
129: diag[3] = odiag[3];
130: PetscKernel_A_gets_inverse_A_2(diag, shift, allowzeropivot, &zeropivotdetected);
131: if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
132: diag += 4;
133: }
134: break;
135: case 3:
136: for (i = 0; i < mbs; i++) {
137: odiag = v + 9 * diag_offset[i];
138: diag[0] = odiag[0];
139: diag[1] = odiag[1];
140: diag[2] = odiag[2];
141: diag[3] = odiag[3];
142: diag[4] = odiag[4];
143: diag[5] = odiag[5];
144: diag[6] = odiag[6];
145: diag[7] = odiag[7];
146: diag[8] = odiag[8];
147: PetscKernel_A_gets_inverse_A_3(diag, shift, allowzeropivot, &zeropivotdetected);
148: if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
149: diag += 9;
150: }
151: break;
152: case 4:
153: for (i = 0; i < mbs; i++) {
154: odiag = v + 16 * diag_offset[i];
155: PetscArraycpy(diag, odiag, 16);
156: PetscKernel_A_gets_inverse_A_4(diag, shift, allowzeropivot, &zeropivotdetected);
157: if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
158: diag += 16;
159: }
160: break;
161: case 5:
162: for (i = 0; i < mbs; i++) {
163: odiag = v + 25 * diag_offset[i];
164: PetscArraycpy(diag, odiag, 25);
165: PetscKernel_A_gets_inverse_A_5(diag, ipvt, work, shift, allowzeropivot, &zeropivotdetected);
166: if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
167: diag += 25;
168: }
169: break;
170: case 6:
171: for (i = 0; i < mbs; i++) {
172: odiag = v + 36 * diag_offset[i];
173: PetscArraycpy(diag, odiag, 36);
174: PetscKernel_A_gets_inverse_A_6(diag, shift, allowzeropivot, &zeropivotdetected);
175: if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
176: diag += 36;
177: }
178: break;
179: case 7:
180: for (i = 0; i < mbs; i++) {
181: odiag = v + 49 * diag_offset[i];
182: PetscArraycpy(diag, odiag, 49);
183: PetscKernel_A_gets_inverse_A_7(diag, shift, allowzeropivot, &zeropivotdetected);
184: if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
185: diag += 49;
186: }
187: break;
188: default:
189: PetscMalloc2(bs, &v_work, bs, &v_pivots);
190: for (i = 0; i < mbs; i++) {
191: odiag = v + bs2 * diag_offset[i];
192: PetscArraycpy(diag, odiag, bs2);
193: PetscKernel_A_gets_inverse_A(bs, diag, v_pivots, v_work, allowzeropivot, &zeropivotdetected);
194: if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
195: diag += bs2;
196: }
197: PetscFree2(v_work, v_pivots);
198: }
199: a->idiagvalid = PETSC_TRUE;
200: return 0;
201: }
203: PetscErrorCode MatSOR_SeqBAIJ(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx)
204: {
205: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
206: PetscScalar *x, *work, *w, *workt, *t;
207: const MatScalar *v, *aa = a->a, *idiag;
208: const PetscScalar *b, *xb;
209: PetscScalar s[7], xw[7] = {0}; /* avoid some compilers thinking xw is uninitialized */
210: PetscInt m = a->mbs, i, i2, nz, bs = A->rmap->bs, bs2 = bs * bs, k, j, idx, it;
211: const PetscInt *diag, *ai = a->i, *aj = a->j, *vi;
213: its = its * lits;
220: if (!a->idiagvalid) MatInvertBlockDiagonal(A, NULL);
222: if (!m) return 0;
223: diag = a->diag;
224: idiag = a->idiag;
225: k = PetscMax(A->rmap->n, A->cmap->n);
226: if (!a->mult_work) PetscMalloc1(k + 1, &a->mult_work);
227: if (!a->sor_workt) PetscMalloc1(k, &a->sor_workt);
228: if (!a->sor_work) PetscMalloc1(bs, &a->sor_work);
229: work = a->mult_work;
230: t = a->sor_workt;
231: w = a->sor_work;
233: VecGetArray(xx, &x);
234: VecGetArrayRead(bb, &b);
236: if (flag & SOR_ZERO_INITIAL_GUESS) {
237: if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
238: switch (bs) {
239: case 1:
240: PetscKernel_v_gets_A_times_w_1(x, idiag, b);
241: t[0] = b[0];
242: i2 = 1;
243: idiag += 1;
244: for (i = 1; i < m; i++) {
245: v = aa + ai[i];
246: vi = aj + ai[i];
247: nz = diag[i] - ai[i];
248: s[0] = b[i2];
249: for (j = 0; j < nz; j++) {
250: xw[0] = x[vi[j]];
251: PetscKernel_v_gets_v_minus_A_times_w_1(s, (v + j), xw);
252: }
253: t[i2] = s[0];
254: PetscKernel_v_gets_A_times_w_1(xw, idiag, s);
255: x[i2] = xw[0];
256: idiag += 1;
257: i2 += 1;
258: }
259: break;
260: case 2:
261: PetscKernel_v_gets_A_times_w_2(x, idiag, b);
262: t[0] = b[0];
263: t[1] = b[1];
264: i2 = 2;
265: idiag += 4;
266: for (i = 1; i < m; i++) {
267: v = aa + 4 * ai[i];
268: vi = aj + ai[i];
269: nz = diag[i] - ai[i];
270: s[0] = b[i2];
271: s[1] = b[i2 + 1];
272: for (j = 0; j < nz; j++) {
273: idx = 2 * vi[j];
274: it = 4 * j;
275: xw[0] = x[idx];
276: xw[1] = x[1 + idx];
277: PetscKernel_v_gets_v_minus_A_times_w_2(s, (v + it), xw);
278: }
279: t[i2] = s[0];
280: t[i2 + 1] = s[1];
281: PetscKernel_v_gets_A_times_w_2(xw, idiag, s);
282: x[i2] = xw[0];
283: x[i2 + 1] = xw[1];
284: idiag += 4;
285: i2 += 2;
286: }
287: break;
288: case 3:
289: PetscKernel_v_gets_A_times_w_3(x, idiag, b);
290: t[0] = b[0];
291: t[1] = b[1];
292: t[2] = b[2];
293: i2 = 3;
294: idiag += 9;
295: for (i = 1; i < m; i++) {
296: v = aa + 9 * ai[i];
297: vi = aj + ai[i];
298: nz = diag[i] - ai[i];
299: s[0] = b[i2];
300: s[1] = b[i2 + 1];
301: s[2] = b[i2 + 2];
302: while (nz--) {
303: idx = 3 * (*vi++);
304: xw[0] = x[idx];
305: xw[1] = x[1 + idx];
306: xw[2] = x[2 + idx];
307: PetscKernel_v_gets_v_minus_A_times_w_3(s, v, xw);
308: v += 9;
309: }
310: t[i2] = s[0];
311: t[i2 + 1] = s[1];
312: t[i2 + 2] = s[2];
313: PetscKernel_v_gets_A_times_w_3(xw, idiag, s);
314: x[i2] = xw[0];
315: x[i2 + 1] = xw[1];
316: x[i2 + 2] = xw[2];
317: idiag += 9;
318: i2 += 3;
319: }
320: break;
321: case 4:
322: PetscKernel_v_gets_A_times_w_4(x, idiag, b);
323: t[0] = b[0];
324: t[1] = b[1];
325: t[2] = b[2];
326: t[3] = b[3];
327: i2 = 4;
328: idiag += 16;
329: for (i = 1; i < m; i++) {
330: v = aa + 16 * ai[i];
331: vi = aj + ai[i];
332: nz = diag[i] - ai[i];
333: s[0] = b[i2];
334: s[1] = b[i2 + 1];
335: s[2] = b[i2 + 2];
336: s[3] = b[i2 + 3];
337: while (nz--) {
338: idx = 4 * (*vi++);
339: xw[0] = x[idx];
340: xw[1] = x[1 + idx];
341: xw[2] = x[2 + idx];
342: xw[3] = x[3 + idx];
343: PetscKernel_v_gets_v_minus_A_times_w_4(s, v, xw);
344: v += 16;
345: }
346: t[i2] = s[0];
347: t[i2 + 1] = s[1];
348: t[i2 + 2] = s[2];
349: t[i2 + 3] = s[3];
350: PetscKernel_v_gets_A_times_w_4(xw, idiag, s);
351: x[i2] = xw[0];
352: x[i2 + 1] = xw[1];
353: x[i2 + 2] = xw[2];
354: x[i2 + 3] = xw[3];
355: idiag += 16;
356: i2 += 4;
357: }
358: break;
359: case 5:
360: PetscKernel_v_gets_A_times_w_5(x, idiag, b);
361: t[0] = b[0];
362: t[1] = b[1];
363: t[2] = b[2];
364: t[3] = b[3];
365: t[4] = b[4];
366: i2 = 5;
367: idiag += 25;
368: for (i = 1; i < m; i++) {
369: v = aa + 25 * ai[i];
370: vi = aj + ai[i];
371: nz = diag[i] - ai[i];
372: s[0] = b[i2];
373: s[1] = b[i2 + 1];
374: s[2] = b[i2 + 2];
375: s[3] = b[i2 + 3];
376: s[4] = b[i2 + 4];
377: while (nz--) {
378: idx = 5 * (*vi++);
379: xw[0] = x[idx];
380: xw[1] = x[1 + idx];
381: xw[2] = x[2 + idx];
382: xw[3] = x[3 + idx];
383: xw[4] = x[4 + idx];
384: PetscKernel_v_gets_v_minus_A_times_w_5(s, v, xw);
385: v += 25;
386: }
387: t[i2] = s[0];
388: t[i2 + 1] = s[1];
389: t[i2 + 2] = s[2];
390: t[i2 + 3] = s[3];
391: t[i2 + 4] = s[4];
392: PetscKernel_v_gets_A_times_w_5(xw, idiag, s);
393: x[i2] = xw[0];
394: x[i2 + 1] = xw[1];
395: x[i2 + 2] = xw[2];
396: x[i2 + 3] = xw[3];
397: x[i2 + 4] = xw[4];
398: idiag += 25;
399: i2 += 5;
400: }
401: break;
402: case 6:
403: PetscKernel_v_gets_A_times_w_6(x, idiag, b);
404: t[0] = b[0];
405: t[1] = b[1];
406: t[2] = b[2];
407: t[3] = b[3];
408: t[4] = b[4];
409: t[5] = b[5];
410: i2 = 6;
411: idiag += 36;
412: for (i = 1; i < m; i++) {
413: v = aa + 36 * ai[i];
414: vi = aj + ai[i];
415: nz = diag[i] - ai[i];
416: s[0] = b[i2];
417: s[1] = b[i2 + 1];
418: s[2] = b[i2 + 2];
419: s[3] = b[i2 + 3];
420: s[4] = b[i2 + 4];
421: s[5] = b[i2 + 5];
422: while (nz--) {
423: idx = 6 * (*vi++);
424: xw[0] = x[idx];
425: xw[1] = x[1 + idx];
426: xw[2] = x[2 + idx];
427: xw[3] = x[3 + idx];
428: xw[4] = x[4 + idx];
429: xw[5] = x[5 + idx];
430: PetscKernel_v_gets_v_minus_A_times_w_6(s, v, xw);
431: v += 36;
432: }
433: t[i2] = s[0];
434: t[i2 + 1] = s[1];
435: t[i2 + 2] = s[2];
436: t[i2 + 3] = s[3];
437: t[i2 + 4] = s[4];
438: t[i2 + 5] = s[5];
439: PetscKernel_v_gets_A_times_w_6(xw, idiag, s);
440: x[i2] = xw[0];
441: x[i2 + 1] = xw[1];
442: x[i2 + 2] = xw[2];
443: x[i2 + 3] = xw[3];
444: x[i2 + 4] = xw[4];
445: x[i2 + 5] = xw[5];
446: idiag += 36;
447: i2 += 6;
448: }
449: break;
450: case 7:
451: PetscKernel_v_gets_A_times_w_7(x, idiag, b);
452: t[0] = b[0];
453: t[1] = b[1];
454: t[2] = b[2];
455: t[3] = b[3];
456: t[4] = b[4];
457: t[5] = b[5];
458: t[6] = b[6];
459: i2 = 7;
460: idiag += 49;
461: for (i = 1; i < m; i++) {
462: v = aa + 49 * ai[i];
463: vi = aj + ai[i];
464: nz = diag[i] - ai[i];
465: s[0] = b[i2];
466: s[1] = b[i2 + 1];
467: s[2] = b[i2 + 2];
468: s[3] = b[i2 + 3];
469: s[4] = b[i2 + 4];
470: s[5] = b[i2 + 5];
471: s[6] = b[i2 + 6];
472: while (nz--) {
473: idx = 7 * (*vi++);
474: xw[0] = x[idx];
475: xw[1] = x[1 + idx];
476: xw[2] = x[2 + idx];
477: xw[3] = x[3 + idx];
478: xw[4] = x[4 + idx];
479: xw[5] = x[5 + idx];
480: xw[6] = x[6 + idx];
481: PetscKernel_v_gets_v_minus_A_times_w_7(s, v, xw);
482: v += 49;
483: }
484: t[i2] = s[0];
485: t[i2 + 1] = s[1];
486: t[i2 + 2] = s[2];
487: t[i2 + 3] = s[3];
488: t[i2 + 4] = s[4];
489: t[i2 + 5] = s[5];
490: t[i2 + 6] = s[6];
491: PetscKernel_v_gets_A_times_w_7(xw, idiag, s);
492: x[i2] = xw[0];
493: x[i2 + 1] = xw[1];
494: x[i2 + 2] = xw[2];
495: x[i2 + 3] = xw[3];
496: x[i2 + 4] = xw[4];
497: x[i2 + 5] = xw[5];
498: x[i2 + 6] = xw[6];
499: idiag += 49;
500: i2 += 7;
501: }
502: break;
503: default:
504: PetscKernel_w_gets_Ar_times_v(bs, bs, b, idiag, x);
505: PetscArraycpy(t, b, bs);
506: i2 = bs;
507: idiag += bs2;
508: for (i = 1; i < m; i++) {
509: v = aa + bs2 * ai[i];
510: vi = aj + ai[i];
511: nz = diag[i] - ai[i];
513: PetscArraycpy(w, b + i2, bs);
514: /* copy all rows of x that are needed into contiguous space */
515: workt = work;
516: for (j = 0; j < nz; j++) {
517: PetscArraycpy(workt, x + bs * (*vi++), bs);
518: workt += bs;
519: }
520: PetscKernel_w_gets_w_minus_Ar_times_v(bs, bs * nz, w, v, work);
521: PetscArraycpy(t + i2, w, bs);
522: PetscKernel_w_gets_Ar_times_v(bs, bs, w, idiag, x + i2);
524: idiag += bs2;
525: i2 += bs;
526: }
527: break;
528: }
529: /* for logging purposes assume number of nonzero in lower half is 1/2 of total */
530: PetscLogFlops(1.0 * bs2 * a->nz);
531: xb = t;
532: } else xb = b;
533: if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
534: idiag = a->idiag + bs2 * (a->mbs - 1);
535: i2 = bs * (m - 1);
536: switch (bs) {
537: case 1:
538: s[0] = xb[i2];
539: PetscKernel_v_gets_A_times_w_1(xw, idiag, s);
540: x[i2] = xw[0];
541: i2 -= 1;
542: for (i = m - 2; i >= 0; i--) {
543: v = aa + (diag[i] + 1);
544: vi = aj + diag[i] + 1;
545: nz = ai[i + 1] - diag[i] - 1;
546: s[0] = xb[i2];
547: for (j = 0; j < nz; j++) {
548: xw[0] = x[vi[j]];
549: PetscKernel_v_gets_v_minus_A_times_w_1(s, (v + j), xw);
550: }
551: PetscKernel_v_gets_A_times_w_1(xw, idiag, s);
552: x[i2] = xw[0];
553: idiag -= 1;
554: i2 -= 1;
555: }
556: break;
557: case 2:
558: s[0] = xb[i2];
559: s[1] = xb[i2 + 1];
560: PetscKernel_v_gets_A_times_w_2(xw, idiag, s);
561: x[i2] = xw[0];
562: x[i2 + 1] = xw[1];
563: i2 -= 2;
564: idiag -= 4;
565: for (i = m - 2; i >= 0; i--) {
566: v = aa + 4 * (diag[i] + 1);
567: vi = aj + diag[i] + 1;
568: nz = ai[i + 1] - diag[i] - 1;
569: s[0] = xb[i2];
570: s[1] = xb[i2 + 1];
571: for (j = 0; j < nz; j++) {
572: idx = 2 * vi[j];
573: it = 4 * j;
574: xw[0] = x[idx];
575: xw[1] = x[1 + idx];
576: PetscKernel_v_gets_v_minus_A_times_w_2(s, (v + it), xw);
577: }
578: PetscKernel_v_gets_A_times_w_2(xw, idiag, s);
579: x[i2] = xw[0];
580: x[i2 + 1] = xw[1];
581: idiag -= 4;
582: i2 -= 2;
583: }
584: break;
585: case 3:
586: s[0] = xb[i2];
587: s[1] = xb[i2 + 1];
588: s[2] = xb[i2 + 2];
589: PetscKernel_v_gets_A_times_w_3(xw, idiag, s);
590: x[i2] = xw[0];
591: x[i2 + 1] = xw[1];
592: x[i2 + 2] = xw[2];
593: i2 -= 3;
594: idiag -= 9;
595: for (i = m - 2; i >= 0; i--) {
596: v = aa + 9 * (diag[i] + 1);
597: vi = aj + diag[i] + 1;
598: nz = ai[i + 1] - diag[i] - 1;
599: s[0] = xb[i2];
600: s[1] = xb[i2 + 1];
601: s[2] = xb[i2 + 2];
602: while (nz--) {
603: idx = 3 * (*vi++);
604: xw[0] = x[idx];
605: xw[1] = x[1 + idx];
606: xw[2] = x[2 + idx];
607: PetscKernel_v_gets_v_minus_A_times_w_3(s, v, xw);
608: v += 9;
609: }
610: PetscKernel_v_gets_A_times_w_3(xw, idiag, s);
611: x[i2] = xw[0];
612: x[i2 + 1] = xw[1];
613: x[i2 + 2] = xw[2];
614: idiag -= 9;
615: i2 -= 3;
616: }
617: break;
618: case 4:
619: s[0] = xb[i2];
620: s[1] = xb[i2 + 1];
621: s[2] = xb[i2 + 2];
622: s[3] = xb[i2 + 3];
623: PetscKernel_v_gets_A_times_w_4(xw, idiag, s);
624: x[i2] = xw[0];
625: x[i2 + 1] = xw[1];
626: x[i2 + 2] = xw[2];
627: x[i2 + 3] = xw[3];
628: i2 -= 4;
629: idiag -= 16;
630: for (i = m - 2; i >= 0; i--) {
631: v = aa + 16 * (diag[i] + 1);
632: vi = aj + diag[i] + 1;
633: nz = ai[i + 1] - diag[i] - 1;
634: s[0] = xb[i2];
635: s[1] = xb[i2 + 1];
636: s[2] = xb[i2 + 2];
637: s[3] = xb[i2 + 3];
638: while (nz--) {
639: idx = 4 * (*vi++);
640: xw[0] = x[idx];
641: xw[1] = x[1 + idx];
642: xw[2] = x[2 + idx];
643: xw[3] = x[3 + idx];
644: PetscKernel_v_gets_v_minus_A_times_w_4(s, v, xw);
645: v += 16;
646: }
647: PetscKernel_v_gets_A_times_w_4(xw, idiag, s);
648: x[i2] = xw[0];
649: x[i2 + 1] = xw[1];
650: x[i2 + 2] = xw[2];
651: x[i2 + 3] = xw[3];
652: idiag -= 16;
653: i2 -= 4;
654: }
655: break;
656: case 5:
657: s[0] = xb[i2];
658: s[1] = xb[i2 + 1];
659: s[2] = xb[i2 + 2];
660: s[3] = xb[i2 + 3];
661: s[4] = xb[i2 + 4];
662: PetscKernel_v_gets_A_times_w_5(xw, idiag, s);
663: x[i2] = xw[0];
664: x[i2 + 1] = xw[1];
665: x[i2 + 2] = xw[2];
666: x[i2 + 3] = xw[3];
667: x[i2 + 4] = xw[4];
668: i2 -= 5;
669: idiag -= 25;
670: for (i = m - 2; i >= 0; i--) {
671: v = aa + 25 * (diag[i] + 1);
672: vi = aj + diag[i] + 1;
673: nz = ai[i + 1] - diag[i] - 1;
674: s[0] = xb[i2];
675: s[1] = xb[i2 + 1];
676: s[2] = xb[i2 + 2];
677: s[3] = xb[i2 + 3];
678: s[4] = xb[i2 + 4];
679: while (nz--) {
680: idx = 5 * (*vi++);
681: xw[0] = x[idx];
682: xw[1] = x[1 + idx];
683: xw[2] = x[2 + idx];
684: xw[3] = x[3 + idx];
685: xw[4] = x[4 + idx];
686: PetscKernel_v_gets_v_minus_A_times_w_5(s, v, xw);
687: v += 25;
688: }
689: PetscKernel_v_gets_A_times_w_5(xw, idiag, s);
690: x[i2] = xw[0];
691: x[i2 + 1] = xw[1];
692: x[i2 + 2] = xw[2];
693: x[i2 + 3] = xw[3];
694: x[i2 + 4] = xw[4];
695: idiag -= 25;
696: i2 -= 5;
697: }
698: break;
699: case 6:
700: s[0] = xb[i2];
701: s[1] = xb[i2 + 1];
702: s[2] = xb[i2 + 2];
703: s[3] = xb[i2 + 3];
704: s[4] = xb[i2 + 4];
705: s[5] = xb[i2 + 5];
706: PetscKernel_v_gets_A_times_w_6(xw, idiag, s);
707: x[i2] = xw[0];
708: x[i2 + 1] = xw[1];
709: x[i2 + 2] = xw[2];
710: x[i2 + 3] = xw[3];
711: x[i2 + 4] = xw[4];
712: x[i2 + 5] = xw[5];
713: i2 -= 6;
714: idiag -= 36;
715: for (i = m - 2; i >= 0; i--) {
716: v = aa + 36 * (diag[i] + 1);
717: vi = aj + diag[i] + 1;
718: nz = ai[i + 1] - diag[i] - 1;
719: s[0] = xb[i2];
720: s[1] = xb[i2 + 1];
721: s[2] = xb[i2 + 2];
722: s[3] = xb[i2 + 3];
723: s[4] = xb[i2 + 4];
724: s[5] = xb[i2 + 5];
725: while (nz--) {
726: idx = 6 * (*vi++);
727: xw[0] = x[idx];
728: xw[1] = x[1 + idx];
729: xw[2] = x[2 + idx];
730: xw[3] = x[3 + idx];
731: xw[4] = x[4 + idx];
732: xw[5] = x[5 + idx];
733: PetscKernel_v_gets_v_minus_A_times_w_6(s, v, xw);
734: v += 36;
735: }
736: PetscKernel_v_gets_A_times_w_6(xw, idiag, s);
737: x[i2] = xw[0];
738: x[i2 + 1] = xw[1];
739: x[i2 + 2] = xw[2];
740: x[i2 + 3] = xw[3];
741: x[i2 + 4] = xw[4];
742: x[i2 + 5] = xw[5];
743: idiag -= 36;
744: i2 -= 6;
745: }
746: break;
747: case 7:
748: s[0] = xb[i2];
749: s[1] = xb[i2 + 1];
750: s[2] = xb[i2 + 2];
751: s[3] = xb[i2 + 3];
752: s[4] = xb[i2 + 4];
753: s[5] = xb[i2 + 5];
754: s[6] = xb[i2 + 6];
755: PetscKernel_v_gets_A_times_w_7(x, idiag, b);
756: x[i2] = xw[0];
757: x[i2 + 1] = xw[1];
758: x[i2 + 2] = xw[2];
759: x[i2 + 3] = xw[3];
760: x[i2 + 4] = xw[4];
761: x[i2 + 5] = xw[5];
762: x[i2 + 6] = xw[6];
763: i2 -= 7;
764: idiag -= 49;
765: for (i = m - 2; i >= 0; i--) {
766: v = aa + 49 * (diag[i] + 1);
767: vi = aj + diag[i] + 1;
768: nz = ai[i + 1] - diag[i] - 1;
769: s[0] = xb[i2];
770: s[1] = xb[i2 + 1];
771: s[2] = xb[i2 + 2];
772: s[3] = xb[i2 + 3];
773: s[4] = xb[i2 + 4];
774: s[5] = xb[i2 + 5];
775: s[6] = xb[i2 + 6];
776: while (nz--) {
777: idx = 7 * (*vi++);
778: xw[0] = x[idx];
779: xw[1] = x[1 + idx];
780: xw[2] = x[2 + idx];
781: xw[3] = x[3 + idx];
782: xw[4] = x[4 + idx];
783: xw[5] = x[5 + idx];
784: xw[6] = x[6 + idx];
785: PetscKernel_v_gets_v_minus_A_times_w_7(s, v, xw);
786: v += 49;
787: }
788: PetscKernel_v_gets_A_times_w_7(xw, idiag, s);
789: x[i2] = xw[0];
790: x[i2 + 1] = xw[1];
791: x[i2 + 2] = xw[2];
792: x[i2 + 3] = xw[3];
793: x[i2 + 4] = xw[4];
794: x[i2 + 5] = xw[5];
795: x[i2 + 6] = xw[6];
796: idiag -= 49;
797: i2 -= 7;
798: }
799: break;
800: default:
801: PetscArraycpy(w, xb + i2, bs);
802: PetscKernel_w_gets_Ar_times_v(bs, bs, w, idiag, x + i2);
803: i2 -= bs;
804: idiag -= bs2;
805: for (i = m - 2; i >= 0; i--) {
806: v = aa + bs2 * (diag[i] + 1);
807: vi = aj + diag[i] + 1;
808: nz = ai[i + 1] - diag[i] - 1;
810: PetscArraycpy(w, xb + i2, bs);
811: /* copy all rows of x that are needed into contiguous space */
812: workt = work;
813: for (j = 0; j < nz; j++) {
814: PetscArraycpy(workt, x + bs * (*vi++), bs);
815: workt += bs;
816: }
817: PetscKernel_w_gets_w_minus_Ar_times_v(bs, bs * nz, w, v, work);
818: PetscKernel_w_gets_Ar_times_v(bs, bs, w, idiag, x + i2);
820: idiag -= bs2;
821: i2 -= bs;
822: }
823: break;
824: }
825: PetscLogFlops(1.0 * bs2 * (a->nz));
826: }
827: its--;
828: }
829: while (its--) {
830: if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
831: idiag = a->idiag;
832: i2 = 0;
833: switch (bs) {
834: case 1:
835: for (i = 0; i < m; i++) {
836: v = aa + ai[i];
837: vi = aj + ai[i];
838: nz = ai[i + 1] - ai[i];
839: s[0] = b[i2];
840: for (j = 0; j < nz; j++) {
841: xw[0] = x[vi[j]];
842: PetscKernel_v_gets_v_minus_A_times_w_1(s, (v + j), xw);
843: }
844: PetscKernel_v_gets_A_times_w_1(xw, idiag, s);
845: x[i2] += xw[0];
846: idiag += 1;
847: i2 += 1;
848: }
849: break;
850: case 2:
851: for (i = 0; i < m; i++) {
852: v = aa + 4 * ai[i];
853: vi = aj + ai[i];
854: nz = ai[i + 1] - ai[i];
855: s[0] = b[i2];
856: s[1] = b[i2 + 1];
857: for (j = 0; j < nz; j++) {
858: idx = 2 * vi[j];
859: it = 4 * j;
860: xw[0] = x[idx];
861: xw[1] = x[1 + idx];
862: PetscKernel_v_gets_v_minus_A_times_w_2(s, (v + it), xw);
863: }
864: PetscKernel_v_gets_A_times_w_2(xw, idiag, s);
865: x[i2] += xw[0];
866: x[i2 + 1] += xw[1];
867: idiag += 4;
868: i2 += 2;
869: }
870: break;
871: case 3:
872: for (i = 0; i < m; i++) {
873: v = aa + 9 * ai[i];
874: vi = aj + ai[i];
875: nz = ai[i + 1] - ai[i];
876: s[0] = b[i2];
877: s[1] = b[i2 + 1];
878: s[2] = b[i2 + 2];
879: while (nz--) {
880: idx = 3 * (*vi++);
881: xw[0] = x[idx];
882: xw[1] = x[1 + idx];
883: xw[2] = x[2 + idx];
884: PetscKernel_v_gets_v_minus_A_times_w_3(s, v, xw);
885: v += 9;
886: }
887: PetscKernel_v_gets_A_times_w_3(xw, idiag, s);
888: x[i2] += xw[0];
889: x[i2 + 1] += xw[1];
890: x[i2 + 2] += xw[2];
891: idiag += 9;
892: i2 += 3;
893: }
894: break;
895: case 4:
896: for (i = 0; i < m; i++) {
897: v = aa + 16 * ai[i];
898: vi = aj + ai[i];
899: nz = ai[i + 1] - ai[i];
900: s[0] = b[i2];
901: s[1] = b[i2 + 1];
902: s[2] = b[i2 + 2];
903: s[3] = b[i2 + 3];
904: while (nz--) {
905: idx = 4 * (*vi++);
906: xw[0] = x[idx];
907: xw[1] = x[1 + idx];
908: xw[2] = x[2 + idx];
909: xw[3] = x[3 + idx];
910: PetscKernel_v_gets_v_minus_A_times_w_4(s, v, xw);
911: v += 16;
912: }
913: PetscKernel_v_gets_A_times_w_4(xw, idiag, s);
914: x[i2] += xw[0];
915: x[i2 + 1] += xw[1];
916: x[i2 + 2] += xw[2];
917: x[i2 + 3] += xw[3];
918: idiag += 16;
919: i2 += 4;
920: }
921: break;
922: case 5:
923: for (i = 0; i < m; i++) {
924: v = aa + 25 * ai[i];
925: vi = aj + ai[i];
926: nz = ai[i + 1] - ai[i];
927: s[0] = b[i2];
928: s[1] = b[i2 + 1];
929: s[2] = b[i2 + 2];
930: s[3] = b[i2 + 3];
931: s[4] = b[i2 + 4];
932: while (nz--) {
933: idx = 5 * (*vi++);
934: xw[0] = x[idx];
935: xw[1] = x[1 + idx];
936: xw[2] = x[2 + idx];
937: xw[3] = x[3 + idx];
938: xw[4] = x[4 + idx];
939: PetscKernel_v_gets_v_minus_A_times_w_5(s, v, xw);
940: v += 25;
941: }
942: PetscKernel_v_gets_A_times_w_5(xw, idiag, s);
943: x[i2] += xw[0];
944: x[i2 + 1] += xw[1];
945: x[i2 + 2] += xw[2];
946: x[i2 + 3] += xw[3];
947: x[i2 + 4] += xw[4];
948: idiag += 25;
949: i2 += 5;
950: }
951: break;
952: case 6:
953: for (i = 0; i < m; i++) {
954: v = aa + 36 * ai[i];
955: vi = aj + ai[i];
956: nz = ai[i + 1] - ai[i];
957: s[0] = b[i2];
958: s[1] = b[i2 + 1];
959: s[2] = b[i2 + 2];
960: s[3] = b[i2 + 3];
961: s[4] = b[i2 + 4];
962: s[5] = b[i2 + 5];
963: while (nz--) {
964: idx = 6 * (*vi++);
965: xw[0] = x[idx];
966: xw[1] = x[1 + idx];
967: xw[2] = x[2 + idx];
968: xw[3] = x[3 + idx];
969: xw[4] = x[4 + idx];
970: xw[5] = x[5 + idx];
971: PetscKernel_v_gets_v_minus_A_times_w_6(s, v, xw);
972: v += 36;
973: }
974: PetscKernel_v_gets_A_times_w_6(xw, idiag, s);
975: x[i2] += xw[0];
976: x[i2 + 1] += xw[1];
977: x[i2 + 2] += xw[2];
978: x[i2 + 3] += xw[3];
979: x[i2 + 4] += xw[4];
980: x[i2 + 5] += xw[5];
981: idiag += 36;
982: i2 += 6;
983: }
984: break;
985: case 7:
986: for (i = 0; i < m; i++) {
987: v = aa + 49 * ai[i];
988: vi = aj + ai[i];
989: nz = ai[i + 1] - ai[i];
990: s[0] = b[i2];
991: s[1] = b[i2 + 1];
992: s[2] = b[i2 + 2];
993: s[3] = b[i2 + 3];
994: s[4] = b[i2 + 4];
995: s[5] = b[i2 + 5];
996: s[6] = b[i2 + 6];
997: while (nz--) {
998: idx = 7 * (*vi++);
999: xw[0] = x[idx];
1000: xw[1] = x[1 + idx];
1001: xw[2] = x[2 + idx];
1002: xw[3] = x[3 + idx];
1003: xw[4] = x[4 + idx];
1004: xw[5] = x[5 + idx];
1005: xw[6] = x[6 + idx];
1006: PetscKernel_v_gets_v_minus_A_times_w_7(s, v, xw);
1007: v += 49;
1008: }
1009: PetscKernel_v_gets_A_times_w_7(xw, idiag, s);
1010: x[i2] += xw[0];
1011: x[i2 + 1] += xw[1];
1012: x[i2 + 2] += xw[2];
1013: x[i2 + 3] += xw[3];
1014: x[i2 + 4] += xw[4];
1015: x[i2 + 5] += xw[5];
1016: x[i2 + 6] += xw[6];
1017: idiag += 49;
1018: i2 += 7;
1019: }
1020: break;
1021: default:
1022: for (i = 0; i < m; i++) {
1023: v = aa + bs2 * ai[i];
1024: vi = aj + ai[i];
1025: nz = ai[i + 1] - ai[i];
1027: PetscArraycpy(w, b + i2, bs);
1028: /* copy all rows of x that are needed into contiguous space */
1029: workt = work;
1030: for (j = 0; j < nz; j++) {
1031: PetscArraycpy(workt, x + bs * (*vi++), bs);
1032: workt += bs;
1033: }
1034: PetscKernel_w_gets_w_minus_Ar_times_v(bs, bs * nz, w, v, work);
1035: PetscKernel_w_gets_w_plus_Ar_times_v(bs, bs, w, idiag, x + i2);
1037: idiag += bs2;
1038: i2 += bs;
1039: }
1040: break;
1041: }
1042: PetscLogFlops(2.0 * bs2 * a->nz);
1043: }
1044: if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
1045: idiag = a->idiag + bs2 * (a->mbs - 1);
1046: i2 = bs * (m - 1);
1047: switch (bs) {
1048: case 1:
1049: for (i = m - 1; i >= 0; i--) {
1050: v = aa + ai[i];
1051: vi = aj + ai[i];
1052: nz = ai[i + 1] - ai[i];
1053: s[0] = b[i2];
1054: for (j = 0; j < nz; j++) {
1055: xw[0] = x[vi[j]];
1056: PetscKernel_v_gets_v_minus_A_times_w_1(s, (v + j), xw);
1057: }
1058: PetscKernel_v_gets_A_times_w_1(xw, idiag, s);
1059: x[i2] += xw[0];
1060: idiag -= 1;
1061: i2 -= 1;
1062: }
1063: break;
1064: case 2:
1065: for (i = m - 1; i >= 0; i--) {
1066: v = aa + 4 * ai[i];
1067: vi = aj + ai[i];
1068: nz = ai[i + 1] - ai[i];
1069: s[0] = b[i2];
1070: s[1] = b[i2 + 1];
1071: for (j = 0; j < nz; j++) {
1072: idx = 2 * vi[j];
1073: it = 4 * j;
1074: xw[0] = x[idx];
1075: xw[1] = x[1 + idx];
1076: PetscKernel_v_gets_v_minus_A_times_w_2(s, (v + it), xw);
1077: }
1078: PetscKernel_v_gets_A_times_w_2(xw, idiag, s);
1079: x[i2] += xw[0];
1080: x[i2 + 1] += xw[1];
1081: idiag -= 4;
1082: i2 -= 2;
1083: }
1084: break;
1085: case 3:
1086: for (i = m - 1; i >= 0; i--) {
1087: v = aa + 9 * ai[i];
1088: vi = aj + ai[i];
1089: nz = ai[i + 1] - ai[i];
1090: s[0] = b[i2];
1091: s[1] = b[i2 + 1];
1092: s[2] = b[i2 + 2];
1093: while (nz--) {
1094: idx = 3 * (*vi++);
1095: xw[0] = x[idx];
1096: xw[1] = x[1 + idx];
1097: xw[2] = x[2 + idx];
1098: PetscKernel_v_gets_v_minus_A_times_w_3(s, v, xw);
1099: v += 9;
1100: }
1101: PetscKernel_v_gets_A_times_w_3(xw, idiag, s);
1102: x[i2] += xw[0];
1103: x[i2 + 1] += xw[1];
1104: x[i2 + 2] += xw[2];
1105: idiag -= 9;
1106: i2 -= 3;
1107: }
1108: break;
1109: case 4:
1110: for (i = m - 1; i >= 0; i--) {
1111: v = aa + 16 * ai[i];
1112: vi = aj + ai[i];
1113: nz = ai[i + 1] - ai[i];
1114: s[0] = b[i2];
1115: s[1] = b[i2 + 1];
1116: s[2] = b[i2 + 2];
1117: s[3] = b[i2 + 3];
1118: while (nz--) {
1119: idx = 4 * (*vi++);
1120: xw[0] = x[idx];
1121: xw[1] = x[1 + idx];
1122: xw[2] = x[2 + idx];
1123: xw[3] = x[3 + idx];
1124: PetscKernel_v_gets_v_minus_A_times_w_4(s, v, xw);
1125: v += 16;
1126: }
1127: PetscKernel_v_gets_A_times_w_4(xw, idiag, s);
1128: x[i2] += xw[0];
1129: x[i2 + 1] += xw[1];
1130: x[i2 + 2] += xw[2];
1131: x[i2 + 3] += xw[3];
1132: idiag -= 16;
1133: i2 -= 4;
1134: }
1135: break;
1136: case 5:
1137: for (i = m - 1; i >= 0; i--) {
1138: v = aa + 25 * ai[i];
1139: vi = aj + ai[i];
1140: nz = ai[i + 1] - ai[i];
1141: s[0] = b[i2];
1142: s[1] = b[i2 + 1];
1143: s[2] = b[i2 + 2];
1144: s[3] = b[i2 + 3];
1145: s[4] = b[i2 + 4];
1146: while (nz--) {
1147: idx = 5 * (*vi++);
1148: xw[0] = x[idx];
1149: xw[1] = x[1 + idx];
1150: xw[2] = x[2 + idx];
1151: xw[3] = x[3 + idx];
1152: xw[4] = x[4 + idx];
1153: PetscKernel_v_gets_v_minus_A_times_w_5(s, v, xw);
1154: v += 25;
1155: }
1156: PetscKernel_v_gets_A_times_w_5(xw, idiag, s);
1157: x[i2] += xw[0];
1158: x[i2 + 1] += xw[1];
1159: x[i2 + 2] += xw[2];
1160: x[i2 + 3] += xw[3];
1161: x[i2 + 4] += xw[4];
1162: idiag -= 25;
1163: i2 -= 5;
1164: }
1165: break;
1166: case 6:
1167: for (i = m - 1; i >= 0; i--) {
1168: v = aa + 36 * ai[i];
1169: vi = aj + ai[i];
1170: nz = ai[i + 1] - ai[i];
1171: s[0] = b[i2];
1172: s[1] = b[i2 + 1];
1173: s[2] = b[i2 + 2];
1174: s[3] = b[i2 + 3];
1175: s[4] = b[i2 + 4];
1176: s[5] = b[i2 + 5];
1177: while (nz--) {
1178: idx = 6 * (*vi++);
1179: xw[0] = x[idx];
1180: xw[1] = x[1 + idx];
1181: xw[2] = x[2 + idx];
1182: xw[3] = x[3 + idx];
1183: xw[4] = x[4 + idx];
1184: xw[5] = x[5 + idx];
1185: PetscKernel_v_gets_v_minus_A_times_w_6(s, v, xw);
1186: v += 36;
1187: }
1188: PetscKernel_v_gets_A_times_w_6(xw, idiag, s);
1189: x[i2] += xw[0];
1190: x[i2 + 1] += xw[1];
1191: x[i2 + 2] += xw[2];
1192: x[i2 + 3] += xw[3];
1193: x[i2 + 4] += xw[4];
1194: x[i2 + 5] += xw[5];
1195: idiag -= 36;
1196: i2 -= 6;
1197: }
1198: break;
1199: case 7:
1200: for (i = m - 1; i >= 0; i--) {
1201: v = aa + 49 * ai[i];
1202: vi = aj + ai[i];
1203: nz = ai[i + 1] - ai[i];
1204: s[0] = b[i2];
1205: s[1] = b[i2 + 1];
1206: s[2] = b[i2 + 2];
1207: s[3] = b[i2 + 3];
1208: s[4] = b[i2 + 4];
1209: s[5] = b[i2 + 5];
1210: s[6] = b[i2 + 6];
1211: while (nz--) {
1212: idx = 7 * (*vi++);
1213: xw[0] = x[idx];
1214: xw[1] = x[1 + idx];
1215: xw[2] = x[2 + idx];
1216: xw[3] = x[3 + idx];
1217: xw[4] = x[4 + idx];
1218: xw[5] = x[5 + idx];
1219: xw[6] = x[6 + idx];
1220: PetscKernel_v_gets_v_minus_A_times_w_7(s, v, xw);
1221: v += 49;
1222: }
1223: PetscKernel_v_gets_A_times_w_7(xw, idiag, s);
1224: x[i2] += xw[0];
1225: x[i2 + 1] += xw[1];
1226: x[i2 + 2] += xw[2];
1227: x[i2 + 3] += xw[3];
1228: x[i2 + 4] += xw[4];
1229: x[i2 + 5] += xw[5];
1230: x[i2 + 6] += xw[6];
1231: idiag -= 49;
1232: i2 -= 7;
1233: }
1234: break;
1235: default:
1236: for (i = m - 1; i >= 0; i--) {
1237: v = aa + bs2 * ai[i];
1238: vi = aj + ai[i];
1239: nz = ai[i + 1] - ai[i];
1241: PetscArraycpy(w, b + i2, bs);
1242: /* copy all rows of x that are needed into contiguous space */
1243: workt = work;
1244: for (j = 0; j < nz; j++) {
1245: PetscArraycpy(workt, x + bs * (*vi++), bs);
1246: workt += bs;
1247: }
1248: PetscKernel_w_gets_w_minus_Ar_times_v(bs, bs * nz, w, v, work);
1249: PetscKernel_w_gets_w_plus_Ar_times_v(bs, bs, w, idiag, x + i2);
1251: idiag -= bs2;
1252: i2 -= bs;
1253: }
1254: break;
1255: }
1256: PetscLogFlops(2.0 * bs2 * (a->nz));
1257: }
1258: }
1259: VecRestoreArray(xx, &x);
1260: VecRestoreArrayRead(bb, &b);
1261: return 0;
1262: }
1264: /*
1265: Special version for direct calls from Fortran (Used in PETSc-fun3d)
1266: */
1267: #if defined(PETSC_HAVE_FORTRAN_CAPS)
1268: #define matsetvaluesblocked4_ MATSETVALUESBLOCKED4
1269: #elif !defined(PETSC_HAVE_FORTRAN_UNDERSCORE)
1270: #define matsetvaluesblocked4_ matsetvaluesblocked4
1271: #endif
1273: PETSC_EXTERN void matsetvaluesblocked4_(Mat *AA, PetscInt *mm, const PetscInt im[], PetscInt *nn, const PetscInt in[], const PetscScalar v[])
1274: {
1275: Mat A = *AA;
1276: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1277: PetscInt *rp, k, low, high, t, ii, jj, row, nrow, i, col, l, N, m = *mm, n = *nn;
1278: PetscInt *ai = a->i, *ailen = a->ilen;
1279: PetscInt *aj = a->j, stepval, lastcol = -1;
1280: const PetscScalar *value = v;
1281: MatScalar *ap, *aa = a->a, *bap;
1283: if (A->rmap->bs != 4) SETERRABORT(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Can only be called with a block size of 4");
1284: stepval = (n - 1) * 4;
1285: for (k = 0; k < m; k++) { /* loop over added rows */
1286: row = im[k];
1287: rp = aj + ai[row];
1288: ap = aa + 16 * ai[row];
1289: nrow = ailen[row];
1290: low = 0;
1291: high = nrow;
1292: for (l = 0; l < n; l++) { /* loop over added columns */
1293: col = in[l];
1294: if (col <= lastcol) low = 0;
1295: else high = nrow;
1296: lastcol = col;
1297: value = v + k * (stepval + 4 + l) * 4;
1298: while (high - low > 7) {
1299: t = (low + high) / 2;
1300: if (rp[t] > col) high = t;
1301: else low = t;
1302: }
1303: for (i = low; i < high; i++) {
1304: if (rp[i] > col) break;
1305: if (rp[i] == col) {
1306: bap = ap + 16 * i;
1307: for (ii = 0; ii < 4; ii++, value += stepval) {
1308: for (jj = ii; jj < 16; jj += 4) bap[jj] += *value++;
1309: }
1310: goto noinsert2;
1311: }
1312: }
1313: N = nrow++ - 1;
1314: high++; /* added new column index thus must search to one higher than before */
1315: /* shift up all the later entries in this row */
1316: for (ii = N; ii >= i; ii--) {
1317: rp[ii + 1] = rp[ii];
1318: PetscArraycpy(ap + 16 * (ii + 1), ap + 16 * (ii), 16);
1319: }
1320: if (N >= i) PetscArrayzero(ap + 16 * i, 16);
1321: rp[i] = col;
1322: bap = ap + 16 * i;
1323: for (ii = 0; ii < 4; ii++, value += stepval) {
1324: for (jj = ii; jj < 16; jj += 4) bap[jj] = *value++;
1325: }
1326: noinsert2:;
1327: low = i;
1328: }
1329: ailen[row] = nrow;
1330: }
1331: return;
1332: }
1334: #if defined(PETSC_HAVE_FORTRAN_CAPS)
1335: #define matsetvalues4_ MATSETVALUES4
1336: #elif !defined(PETSC_HAVE_FORTRAN_UNDERSCORE)
1337: #define matsetvalues4_ matsetvalues4
1338: #endif
1340: PETSC_EXTERN void matsetvalues4_(Mat *AA, PetscInt *mm, PetscInt *im, PetscInt *nn, PetscInt *in, PetscScalar *v)
1341: {
1342: Mat A = *AA;
1343: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1344: PetscInt *rp, k, low, high, t, row, nrow, i, col, l, N, n = *nn, m = *mm;
1345: PetscInt *ai = a->i, *ailen = a->ilen;
1346: PetscInt *aj = a->j, brow, bcol;
1347: PetscInt ridx, cidx, lastcol = -1;
1348: MatScalar *ap, value, *aa = a->a, *bap;
1350: for (k = 0; k < m; k++) { /* loop over added rows */
1351: row = im[k];
1352: brow = row / 4;
1353: rp = aj + ai[brow];
1354: ap = aa + 16 * ai[brow];
1355: nrow = ailen[brow];
1356: low = 0;
1357: high = nrow;
1358: for (l = 0; l < n; l++) { /* loop over added columns */
1359: col = in[l];
1360: bcol = col / 4;
1361: ridx = row % 4;
1362: cidx = col % 4;
1363: value = v[l + k * n];
1364: if (col <= lastcol) low = 0;
1365: else high = nrow;
1366: lastcol = col;
1367: while (high - low > 7) {
1368: t = (low + high) / 2;
1369: if (rp[t] > bcol) high = t;
1370: else low = t;
1371: }
1372: for (i = low; i < high; i++) {
1373: if (rp[i] > bcol) break;
1374: if (rp[i] == bcol) {
1375: bap = ap + 16 * i + 4 * cidx + ridx;
1376: *bap += value;
1377: goto noinsert1;
1378: }
1379: }
1380: N = nrow++ - 1;
1381: high++; /* added new column thus must search to one higher than before */
1382: /* shift up all the later entries in this row */
1383: PetscArraymove(rp + i + 1, rp + i, N - i + 1);
1384: PetscArraymove(ap + 16 * i + 16, ap + 16 * i, 16 * (N - i + 1));
1385: PetscArrayzero(ap + 16 * i, 16);
1386: rp[i] = bcol;
1387: ap[16 * i + 4 * cidx + ridx] = value;
1388: noinsert1:;
1389: low = i;
1390: }
1391: ailen[brow] = nrow;
1392: }
1393: return;
1394: }
1396: /*
1397: Checks for missing diagonals
1398: */
1399: PetscErrorCode MatMissingDiagonal_SeqBAIJ(Mat A, PetscBool *missing, PetscInt *d)
1400: {
1401: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1402: PetscInt *diag, *ii = a->i, i;
1404: MatMarkDiagonal_SeqBAIJ(A);
1405: *missing = PETSC_FALSE;
1406: if (A->rmap->n > 0 && !ii) {
1407: *missing = PETSC_TRUE;
1408: if (d) *d = 0;
1409: PetscInfo(A, "Matrix has no entries therefore is missing diagonal\n");
1410: } else {
1411: PetscInt n;
1412: n = PetscMin(a->mbs, a->nbs);
1413: diag = a->diag;
1414: for (i = 0; i < n; i++) {
1415: if (diag[i] >= ii[i + 1]) {
1416: *missing = PETSC_TRUE;
1417: if (d) *d = i;
1418: PetscInfo(A, "Matrix is missing block diagonal number %" PetscInt_FMT "\n", i);
1419: break;
1420: }
1421: }
1422: }
1423: return 0;
1424: }
1426: PetscErrorCode MatMarkDiagonal_SeqBAIJ(Mat A)
1427: {
1428: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1429: PetscInt i, j, m = a->mbs;
1431: if (!a->diag) {
1432: PetscMalloc1(m, &a->diag);
1433: a->free_diag = PETSC_TRUE;
1434: }
1435: for (i = 0; i < m; i++) {
1436: a->diag[i] = a->i[i + 1];
1437: for (j = a->i[i]; j < a->i[i + 1]; j++) {
1438: if (a->j[j] == i) {
1439: a->diag[i] = j;
1440: break;
1441: }
1442: }
1443: }
1444: return 0;
1445: }
1447: static PetscErrorCode MatGetRowIJ_SeqBAIJ(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *nn, const PetscInt *inia[], const PetscInt *inja[], PetscBool *done)
1448: {
1449: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1450: PetscInt i, j, n = a->mbs, nz = a->i[n], *tia, *tja, bs = A->rmap->bs, k, l, cnt;
1451: PetscInt **ia = (PetscInt **)inia, **ja = (PetscInt **)inja;
1453: *nn = n;
1454: if (!ia) return 0;
1455: if (symmetric) {
1456: MatToSymmetricIJ_SeqAIJ(n, a->i, a->j, PETSC_TRUE, 0, 0, &tia, &tja);
1457: nz = tia[n];
1458: } else {
1459: tia = a->i;
1460: tja = a->j;
1461: }
1463: if (!blockcompressed && bs > 1) {
1464: (*nn) *= bs;
1465: /* malloc & create the natural set of indices */
1466: PetscMalloc1((n + 1) * bs, ia);
1467: if (n) {
1468: (*ia)[0] = oshift;
1469: for (j = 1; j < bs; j++) (*ia)[j] = (tia[1] - tia[0]) * bs + (*ia)[j - 1];
1470: }
1472: for (i = 1; i < n; i++) {
1473: (*ia)[i * bs] = (tia[i] - tia[i - 1]) * bs + (*ia)[i * bs - 1];
1474: for (j = 1; j < bs; j++) (*ia)[i * bs + j] = (tia[i + 1] - tia[i]) * bs + (*ia)[i * bs + j - 1];
1475: }
1476: if (n) (*ia)[n * bs] = (tia[n] - tia[n - 1]) * bs + (*ia)[n * bs - 1];
1478: if (inja) {
1479: PetscMalloc1(nz * bs * bs, ja);
1480: cnt = 0;
1481: for (i = 0; i < n; i++) {
1482: for (j = 0; j < bs; j++) {
1483: for (k = tia[i]; k < tia[i + 1]; k++) {
1484: for (l = 0; l < bs; l++) (*ja)[cnt++] = bs * tja[k] + l;
1485: }
1486: }
1487: }
1488: }
1490: if (symmetric) { /* deallocate memory allocated in MatToSymmetricIJ_SeqAIJ() */
1491: PetscFree(tia);
1492: PetscFree(tja);
1493: }
1494: } else if (oshift == 1) {
1495: if (symmetric) {
1496: nz = tia[A->rmap->n / bs];
1497: /* add 1 to i and j indices */
1498: for (i = 0; i < A->rmap->n / bs + 1; i++) tia[i] = tia[i] + 1;
1499: *ia = tia;
1500: if (ja) {
1501: for (i = 0; i < nz; i++) tja[i] = tja[i] + 1;
1502: *ja = tja;
1503: }
1504: } else {
1505: nz = a->i[A->rmap->n / bs];
1506: /* malloc space and add 1 to i and j indices */
1507: PetscMalloc1(A->rmap->n / bs + 1, ia);
1508: for (i = 0; i < A->rmap->n / bs + 1; i++) (*ia)[i] = a->i[i] + 1;
1509: if (ja) {
1510: PetscMalloc1(nz, ja);
1511: for (i = 0; i < nz; i++) (*ja)[i] = a->j[i] + 1;
1512: }
1513: }
1514: } else {
1515: *ia = tia;
1516: if (ja) *ja = tja;
1517: }
1518: return 0;
1519: }
1521: static PetscErrorCode MatRestoreRowIJ_SeqBAIJ(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *nn, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
1522: {
1523: if (!ia) return 0;
1524: if ((!blockcompressed && A->rmap->bs > 1) || (symmetric || oshift == 1)) {
1525: PetscFree(*ia);
1526: if (ja) PetscFree(*ja);
1527: }
1528: return 0;
1529: }
1531: PetscErrorCode MatDestroy_SeqBAIJ(Mat A)
1532: {
1533: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1535: #if defined(PETSC_USE_LOG)
1536: PetscLogObjectState((PetscObject)A, "Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT ", NZ=%" PetscInt_FMT, A->rmap->N, A->cmap->n, a->nz);
1537: #endif
1538: MatSeqXAIJFreeAIJ(A, &a->a, &a->j, &a->i);
1539: ISDestroy(&a->row);
1540: ISDestroy(&a->col);
1541: if (a->free_diag) PetscFree(a->diag);
1542: PetscFree(a->idiag);
1543: if (a->free_imax_ilen) PetscFree2(a->imax, a->ilen);
1544: PetscFree(a->solve_work);
1545: PetscFree(a->mult_work);
1546: PetscFree(a->sor_workt);
1547: PetscFree(a->sor_work);
1548: ISDestroy(&a->icol);
1549: PetscFree(a->saved_values);
1550: PetscFree2(a->compressedrow.i, a->compressedrow.rindex);
1552: MatDestroy(&a->sbaijMat);
1553: MatDestroy(&a->parent);
1554: PetscFree(A->data);
1556: PetscObjectChangeTypeName((PetscObject)A, NULL);
1557: PetscObjectComposeFunction((PetscObject)A, "MatSeqBAIJGetArray_C", NULL);
1558: PetscObjectComposeFunction((PetscObject)A, "MatSeqBAIJRestoreArray_C", NULL);
1559: PetscObjectComposeFunction((PetscObject)A, "MatStoreValues_C", NULL);
1560: PetscObjectComposeFunction((PetscObject)A, "MatRetrieveValues_C", NULL);
1561: PetscObjectComposeFunction((PetscObject)A, "MatSeqBAIJSetColumnIndices_C", NULL);
1562: PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqbaij_seqaij_C", NULL);
1563: PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqbaij_seqsbaij_C", NULL);
1564: PetscObjectComposeFunction((PetscObject)A, "MatSeqBAIJSetPreallocation_C", NULL);
1565: PetscObjectComposeFunction((PetscObject)A, "MatSeqBAIJSetPreallocationCSR_C", NULL);
1566: PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqbaij_seqbstrm_C", NULL);
1567: PetscObjectComposeFunction((PetscObject)A, "MatIsTranspose_C", NULL);
1568: #if defined(PETSC_HAVE_HYPRE)
1569: PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqbaij_hypre_C", NULL);
1570: #endif
1571: PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqbaij_is_C", NULL);
1572: PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL);
1573: return 0;
1574: }
1576: PetscErrorCode MatSetOption_SeqBAIJ(Mat A, MatOption op, PetscBool flg)
1577: {
1578: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1580: switch (op) {
1581: case MAT_ROW_ORIENTED:
1582: a->roworiented = flg;
1583: break;
1584: case MAT_KEEP_NONZERO_PATTERN:
1585: a->keepnonzeropattern = flg;
1586: break;
1587: case MAT_NEW_NONZERO_LOCATIONS:
1588: a->nonew = (flg ? 0 : 1);
1589: break;
1590: case MAT_NEW_NONZERO_LOCATION_ERR:
1591: a->nonew = (flg ? -1 : 0);
1592: break;
1593: case MAT_NEW_NONZERO_ALLOCATION_ERR:
1594: a->nonew = (flg ? -2 : 0);
1595: break;
1596: case MAT_UNUSED_NONZERO_LOCATION_ERR:
1597: a->nounused = (flg ? -1 : 0);
1598: break;
1599: case MAT_FORCE_DIAGONAL_ENTRIES:
1600: case MAT_IGNORE_OFF_PROC_ENTRIES:
1601: case MAT_USE_HASH_TABLE:
1602: case MAT_SORTED_FULL:
1603: PetscInfo(A, "Option %s ignored\n", MatOptions[op]);
1604: break;
1605: case MAT_SPD:
1606: case MAT_SYMMETRIC:
1607: case MAT_STRUCTURALLY_SYMMETRIC:
1608: case MAT_HERMITIAN:
1609: case MAT_SYMMETRY_ETERNAL:
1610: case MAT_STRUCTURAL_SYMMETRY_ETERNAL:
1611: case MAT_SUBMAT_SINGLEIS:
1612: case MAT_STRUCTURE_ONLY:
1613: case MAT_SPD_ETERNAL:
1614: /* if the diagonal matrix is square it inherits some of the properties above */
1615: break;
1616: default:
1617: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unknown option %d", op);
1618: }
1619: return 0;
1620: }
1622: /* used for both SeqBAIJ and SeqSBAIJ matrices */
1623: PetscErrorCode MatGetRow_SeqBAIJ_private(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v, PetscInt *ai, PetscInt *aj, PetscScalar *aa)
1624: {
1625: PetscInt itmp, i, j, k, M, bn, bp, *idx_i, bs, bs2;
1626: MatScalar *aa_i;
1627: PetscScalar *v_i;
1629: bs = A->rmap->bs;
1630: bs2 = bs * bs;
1633: bn = row / bs; /* Block number */
1634: bp = row % bs; /* Block Position */
1635: M = ai[bn + 1] - ai[bn];
1636: *nz = bs * M;
1638: if (v) {
1639: *v = NULL;
1640: if (*nz) {
1641: PetscMalloc1(*nz, v);
1642: for (i = 0; i < M; i++) { /* for each block in the block row */
1643: v_i = *v + i * bs;
1644: aa_i = aa + bs2 * (ai[bn] + i);
1645: for (j = bp, k = 0; j < bs2; j += bs, k++) v_i[k] = aa_i[j];
1646: }
1647: }
1648: }
1650: if (idx) {
1651: *idx = NULL;
1652: if (*nz) {
1653: PetscMalloc1(*nz, idx);
1654: for (i = 0; i < M; i++) { /* for each block in the block row */
1655: idx_i = *idx + i * bs;
1656: itmp = bs * aj[ai[bn] + i];
1657: for (j = 0; j < bs; j++) idx_i[j] = itmp++;
1658: }
1659: }
1660: }
1661: return 0;
1662: }
1664: PetscErrorCode MatGetRow_SeqBAIJ(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
1665: {
1666: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1668: MatGetRow_SeqBAIJ_private(A, row, nz, idx, v, a->i, a->j, a->a);
1669: return 0;
1670: }
1672: PetscErrorCode MatRestoreRow_SeqBAIJ(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
1673: {
1674: if (nz) *nz = 0;
1675: if (idx) PetscFree(*idx);
1676: if (v) PetscFree(*v);
1677: return 0;
1678: }
1680: PetscErrorCode MatTranspose_SeqBAIJ(Mat A, MatReuse reuse, Mat *B)
1681: {
1682: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *at;
1683: Mat C;
1684: PetscInt i, j, k, *aj = a->j, *ai = a->i, bs = A->rmap->bs, mbs = a->mbs, nbs = a->nbs, *atfill;
1685: PetscInt bs2 = a->bs2, *ati, *atj, anzj, kr;
1686: MatScalar *ata, *aa = a->a;
1688: if (reuse == MAT_REUSE_MATRIX) MatTransposeCheckNonzeroState_Private(A, *B);
1689: PetscCalloc1(1 + nbs, &atfill);
1690: if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
1691: for (i = 0; i < ai[mbs]; i++) atfill[aj[i]] += 1; /* count num of non-zeros in row aj[i] */
1693: MatCreate(PetscObjectComm((PetscObject)A), &C);
1694: MatSetSizes(C, A->cmap->n, A->rmap->N, A->cmap->n, A->rmap->N);
1695: MatSetType(C, ((PetscObject)A)->type_name);
1696: MatSeqBAIJSetPreallocation(C, bs, 0, atfill);
1698: at = (Mat_SeqBAIJ *)C->data;
1699: ati = at->i;
1700: for (i = 0; i < nbs; i++) at->ilen[i] = at->imax[i] = ati[i + 1] - ati[i];
1701: } else {
1702: C = *B;
1703: at = (Mat_SeqBAIJ *)C->data;
1704: ati = at->i;
1705: }
1707: atj = at->j;
1708: ata = at->a;
1710: /* Copy ati into atfill so we have locations of the next free space in atj */
1711: PetscArraycpy(atfill, ati, nbs);
1713: /* Walk through A row-wise and mark nonzero entries of A^T. */
1714: for (i = 0; i < mbs; i++) {
1715: anzj = ai[i + 1] - ai[i];
1716: for (j = 0; j < anzj; j++) {
1717: atj[atfill[*aj]] = i;
1718: for (kr = 0; kr < bs; kr++) {
1719: for (k = 0; k < bs; k++) ata[bs2 * atfill[*aj] + k * bs + kr] = *aa++;
1720: }
1721: atfill[*aj++] += 1;
1722: }
1723: }
1724: MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY);
1725: MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY);
1727: /* Clean up temporary space and complete requests. */
1728: PetscFree(atfill);
1730: if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_REUSE_MATRIX) {
1731: MatSetBlockSizes(C, PetscAbs(A->cmap->bs), PetscAbs(A->rmap->bs));
1732: *B = C;
1733: } else {
1734: MatHeaderMerge(A, &C);
1735: }
1736: return 0;
1737: }
1739: PetscErrorCode MatIsTranspose_SeqBAIJ(Mat A, Mat B, PetscReal tol, PetscBool *f)
1740: {
1741: Mat Btrans;
1743: *f = PETSC_FALSE;
1744: MatTranspose(A, MAT_INITIAL_MATRIX, &Btrans);
1745: MatEqual_SeqBAIJ(B, Btrans, f);
1746: MatDestroy(&Btrans);
1747: return 0;
1748: }
1750: /* Used for both SeqBAIJ and SeqSBAIJ matrices */
1751: PetscErrorCode MatView_SeqBAIJ_Binary(Mat mat, PetscViewer viewer)
1752: {
1753: Mat_SeqBAIJ *A = (Mat_SeqBAIJ *)mat->data;
1754: PetscInt header[4], M, N, m, bs, nz, cnt, i, j, k, l;
1755: PetscInt *rowlens, *colidxs;
1756: PetscScalar *matvals;
1758: PetscViewerSetUp(viewer);
1760: M = mat->rmap->N;
1761: N = mat->cmap->N;
1762: m = mat->rmap->n;
1763: bs = mat->rmap->bs;
1764: nz = bs * bs * A->nz;
1766: /* write matrix header */
1767: header[0] = MAT_FILE_CLASSID;
1768: header[1] = M;
1769: header[2] = N;
1770: header[3] = nz;
1771: PetscViewerBinaryWrite(viewer, header, 4, PETSC_INT);
1773: /* store row lengths */
1774: PetscMalloc1(m, &rowlens);
1775: for (cnt = 0, i = 0; i < A->mbs; i++)
1776: for (j = 0; j < bs; j++) rowlens[cnt++] = bs * (A->i[i + 1] - A->i[i]);
1777: PetscViewerBinaryWrite(viewer, rowlens, m, PETSC_INT);
1778: PetscFree(rowlens);
1780: /* store column indices */
1781: PetscMalloc1(nz, &colidxs);
1782: for (cnt = 0, i = 0; i < A->mbs; i++)
1783: for (k = 0; k < bs; k++)
1784: for (j = A->i[i]; j < A->i[i + 1]; j++)
1785: for (l = 0; l < bs; l++) colidxs[cnt++] = bs * A->j[j] + l;
1787: PetscViewerBinaryWrite(viewer, colidxs, nz, PETSC_INT);
1788: PetscFree(colidxs);
1790: /* store nonzero values */
1791: PetscMalloc1(nz, &matvals);
1792: for (cnt = 0, i = 0; i < A->mbs; i++)
1793: for (k = 0; k < bs; k++)
1794: for (j = A->i[i]; j < A->i[i + 1]; j++)
1795: for (l = 0; l < bs; l++) matvals[cnt++] = A->a[bs * (bs * j + l) + k];
1797: PetscViewerBinaryWrite(viewer, matvals, nz, PETSC_SCALAR);
1798: PetscFree(matvals);
1800: /* write block size option to the viewer's .info file */
1801: MatView_Binary_BlockSizes(mat, viewer);
1802: return 0;
1803: }
1805: static PetscErrorCode MatView_SeqBAIJ_ASCII_structonly(Mat A, PetscViewer viewer)
1806: {
1807: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1808: PetscInt i, bs = A->rmap->bs, k;
1810: PetscViewerASCIIUseTabs(viewer, PETSC_FALSE);
1811: for (i = 0; i < a->mbs; i++) {
1812: PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT "-%" PetscInt_FMT ":", i * bs, i * bs + bs - 1);
1813: for (k = a->i[i]; k < a->i[i + 1]; k++) PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT "-%" PetscInt_FMT ") ", bs * a->j[k], bs * a->j[k] + bs - 1);
1814: PetscViewerASCIIPrintf(viewer, "\n");
1815: }
1816: PetscViewerASCIIUseTabs(viewer, PETSC_TRUE);
1817: return 0;
1818: }
1820: static PetscErrorCode MatView_SeqBAIJ_ASCII(Mat A, PetscViewer viewer)
1821: {
1822: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1823: PetscInt i, j, bs = A->rmap->bs, k, l, bs2 = a->bs2;
1824: PetscViewerFormat format;
1826: if (A->structure_only) {
1827: MatView_SeqBAIJ_ASCII_structonly(A, viewer);
1828: return 0;
1829: }
1831: PetscViewerGetFormat(viewer, &format);
1832: if (format == PETSC_VIEWER_ASCII_INFO || format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
1833: PetscViewerASCIIPrintf(viewer, " block size is %" PetscInt_FMT "\n", bs);
1834: } else if (format == PETSC_VIEWER_ASCII_MATLAB) {
1835: const char *matname;
1836: Mat aij;
1837: MatConvert(A, MATSEQAIJ, MAT_INITIAL_MATRIX, &aij);
1838: PetscObjectGetName((PetscObject)A, &matname);
1839: PetscObjectSetName((PetscObject)aij, matname);
1840: MatView(aij, viewer);
1841: MatDestroy(&aij);
1842: } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO) {
1843: return 0;
1844: } else if (format == PETSC_VIEWER_ASCII_COMMON) {
1845: PetscViewerASCIIUseTabs(viewer, PETSC_FALSE);
1846: for (i = 0; i < a->mbs; i++) {
1847: for (j = 0; j < bs; j++) {
1848: PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i * bs + j);
1849: for (k = a->i[i]; k < a->i[i + 1]; k++) {
1850: for (l = 0; l < bs; l++) {
1851: #if defined(PETSC_USE_COMPLEX)
1852: if (PetscImaginaryPart(a->a[bs2 * k + l * bs + j]) > 0.0 && PetscRealPart(a->a[bs2 * k + l * bs + j]) != 0.0) {
1853: PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %gi) ", bs * a->j[k] + l, (double)PetscRealPart(a->a[bs2 * k + l * bs + j]), (double)PetscImaginaryPart(a->a[bs2 * k + l * bs + j]));
1854: } else if (PetscImaginaryPart(a->a[bs2 * k + l * bs + j]) < 0.0 && PetscRealPart(a->a[bs2 * k + l * bs + j]) != 0.0) {
1855: PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %gi) ", bs * a->j[k] + l, (double)PetscRealPart(a->a[bs2 * k + l * bs + j]), -(double)PetscImaginaryPart(a->a[bs2 * k + l * bs + j]));
1856: } else if (PetscRealPart(a->a[bs2 * k + l * bs + j]) != 0.0) {
1857: PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", bs * a->j[k] + l, (double)PetscRealPart(a->a[bs2 * k + l * bs + j]));
1858: }
1859: #else
1860: if (a->a[bs2 * k + l * bs + j] != 0.0) PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", bs * a->j[k] + l, (double)a->a[bs2 * k + l * bs + j]);
1861: #endif
1862: }
1863: }
1864: PetscViewerASCIIPrintf(viewer, "\n");
1865: }
1866: }
1867: PetscViewerASCIIUseTabs(viewer, PETSC_TRUE);
1868: } else {
1869: PetscViewerASCIIUseTabs(viewer, PETSC_FALSE);
1870: for (i = 0; i < a->mbs; i++) {
1871: for (j = 0; j < bs; j++) {
1872: PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i * bs + j);
1873: for (k = a->i[i]; k < a->i[i + 1]; k++) {
1874: for (l = 0; l < bs; l++) {
1875: #if defined(PETSC_USE_COMPLEX)
1876: if (PetscImaginaryPart(a->a[bs2 * k + l * bs + j]) > 0.0) {
1877: PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i) ", bs * a->j[k] + l, (double)PetscRealPart(a->a[bs2 * k + l * bs + j]), (double)PetscImaginaryPart(a->a[bs2 * k + l * bs + j]));
1878: } else if (PetscImaginaryPart(a->a[bs2 * k + l * bs + j]) < 0.0) {
1879: PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i) ", bs * a->j[k] + l, (double)PetscRealPart(a->a[bs2 * k + l * bs + j]), -(double)PetscImaginaryPart(a->a[bs2 * k + l * bs + j]));
1880: } else {
1881: PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", bs * a->j[k] + l, (double)PetscRealPart(a->a[bs2 * k + l * bs + j]));
1882: }
1883: #else
1884: PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", bs * a->j[k] + l, (double)a->a[bs2 * k + l * bs + j]);
1885: #endif
1886: }
1887: }
1888: PetscViewerASCIIPrintf(viewer, "\n");
1889: }
1890: }
1891: PetscViewerASCIIUseTabs(viewer, PETSC_TRUE);
1892: }
1893: PetscViewerFlush(viewer);
1894: return 0;
1895: }
1897: #include <petscdraw.h>
1898: static PetscErrorCode MatView_SeqBAIJ_Draw_Zoom(PetscDraw draw, void *Aa)
1899: {
1900: Mat A = (Mat)Aa;
1901: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1902: PetscInt row, i, j, k, l, mbs = a->mbs, color, bs = A->rmap->bs, bs2 = a->bs2;
1903: PetscReal xl, yl, xr, yr, x_l, x_r, y_l, y_r;
1904: MatScalar *aa;
1905: PetscViewer viewer;
1906: PetscViewerFormat format;
1908: PetscObjectQuery((PetscObject)A, "Zoomviewer", (PetscObject *)&viewer);
1909: PetscViewerGetFormat(viewer, &format);
1910: PetscDrawGetCoordinates(draw, &xl, &yl, &xr, &yr);
1912: /* loop over matrix elements drawing boxes */
1914: if (format != PETSC_VIEWER_DRAW_CONTOUR) {
1915: PetscDrawCollectiveBegin(draw);
1916: /* Blue for negative, Cyan for zero and Red for positive */
1917: color = PETSC_DRAW_BLUE;
1918: for (i = 0, row = 0; i < mbs; i++, row += bs) {
1919: for (j = a->i[i]; j < a->i[i + 1]; j++) {
1920: y_l = A->rmap->N - row - 1.0;
1921: y_r = y_l + 1.0;
1922: x_l = a->j[j] * bs;
1923: x_r = x_l + 1.0;
1924: aa = a->a + j * bs2;
1925: for (k = 0; k < bs; k++) {
1926: for (l = 0; l < bs; l++) {
1927: if (PetscRealPart(*aa++) >= 0.) continue;
1928: PetscDrawRectangle(draw, x_l + k, y_l - l, x_r + k, y_r - l, color, color, color, color);
1929: }
1930: }
1931: }
1932: }
1933: color = PETSC_DRAW_CYAN;
1934: for (i = 0, row = 0; i < mbs; i++, row += bs) {
1935: for (j = a->i[i]; j < a->i[i + 1]; j++) {
1936: y_l = A->rmap->N - row - 1.0;
1937: y_r = y_l + 1.0;
1938: x_l = a->j[j] * bs;
1939: x_r = x_l + 1.0;
1940: aa = a->a + j * bs2;
1941: for (k = 0; k < bs; k++) {
1942: for (l = 0; l < bs; l++) {
1943: if (PetscRealPart(*aa++) != 0.) continue;
1944: PetscDrawRectangle(draw, x_l + k, y_l - l, x_r + k, y_r - l, color, color, color, color);
1945: }
1946: }
1947: }
1948: }
1949: color = PETSC_DRAW_RED;
1950: for (i = 0, row = 0; i < mbs; i++, row += bs) {
1951: for (j = a->i[i]; j < a->i[i + 1]; j++) {
1952: y_l = A->rmap->N - row - 1.0;
1953: y_r = y_l + 1.0;
1954: x_l = a->j[j] * bs;
1955: x_r = x_l + 1.0;
1956: aa = a->a + j * bs2;
1957: for (k = 0; k < bs; k++) {
1958: for (l = 0; l < bs; l++) {
1959: if (PetscRealPart(*aa++) <= 0.) continue;
1960: PetscDrawRectangle(draw, x_l + k, y_l - l, x_r + k, y_r - l, color, color, color, color);
1961: }
1962: }
1963: }
1964: }
1965: PetscDrawCollectiveEnd(draw);
1966: } else {
1967: /* use contour shading to indicate magnitude of values */
1968: /* first determine max of all nonzero values */
1969: PetscReal minv = 0.0, maxv = 0.0;
1970: PetscDraw popup;
1972: for (i = 0; i < a->nz * a->bs2; i++) {
1973: if (PetscAbsScalar(a->a[i]) > maxv) maxv = PetscAbsScalar(a->a[i]);
1974: }
1975: if (minv >= maxv) maxv = minv + PETSC_SMALL;
1976: PetscDrawGetPopup(draw, &popup);
1977: PetscDrawScalePopup(popup, 0.0, maxv);
1979: PetscDrawCollectiveBegin(draw);
1980: for (i = 0, row = 0; i < mbs; i++, row += bs) {
1981: for (j = a->i[i]; j < a->i[i + 1]; j++) {
1982: y_l = A->rmap->N - row - 1.0;
1983: y_r = y_l + 1.0;
1984: x_l = a->j[j] * bs;
1985: x_r = x_l + 1.0;
1986: aa = a->a + j * bs2;
1987: for (k = 0; k < bs; k++) {
1988: for (l = 0; l < bs; l++) {
1989: MatScalar v = *aa++;
1990: color = PetscDrawRealToColor(PetscAbsScalar(v), minv, maxv);
1991: PetscDrawRectangle(draw, x_l + k, y_l - l, x_r + k, y_r - l, color, color, color, color);
1992: }
1993: }
1994: }
1995: }
1996: PetscDrawCollectiveEnd(draw);
1997: }
1998: return 0;
1999: }
2001: static PetscErrorCode MatView_SeqBAIJ_Draw(Mat A, PetscViewer viewer)
2002: {
2003: PetscReal xl, yl, xr, yr, w, h;
2004: PetscDraw draw;
2005: PetscBool isnull;
2007: PetscViewerDrawGetDraw(viewer, 0, &draw);
2008: PetscDrawIsNull(draw, &isnull);
2009: if (isnull) return 0;
2011: xr = A->cmap->n;
2012: yr = A->rmap->N;
2013: h = yr / 10.0;
2014: w = xr / 10.0;
2015: xr += w;
2016: yr += h;
2017: xl = -w;
2018: yl = -h;
2019: PetscDrawSetCoordinates(draw, xl, yl, xr, yr);
2020: PetscObjectCompose((PetscObject)A, "Zoomviewer", (PetscObject)viewer);
2021: PetscDrawZoom(draw, MatView_SeqBAIJ_Draw_Zoom, A);
2022: PetscObjectCompose((PetscObject)A, "Zoomviewer", NULL);
2023: PetscDrawSave(draw);
2024: return 0;
2025: }
2027: PetscErrorCode MatView_SeqBAIJ(Mat A, PetscViewer viewer)
2028: {
2029: PetscBool iascii, isbinary, isdraw;
2031: PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii);
2032: PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary);
2033: PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw);
2034: if (iascii) {
2035: MatView_SeqBAIJ_ASCII(A, viewer);
2036: } else if (isbinary) {
2037: MatView_SeqBAIJ_Binary(A, viewer);
2038: } else if (isdraw) {
2039: MatView_SeqBAIJ_Draw(A, viewer);
2040: } else {
2041: Mat B;
2042: MatConvert(A, MATSEQAIJ, MAT_INITIAL_MATRIX, &B);
2043: MatView(B, viewer);
2044: MatDestroy(&B);
2045: }
2046: return 0;
2047: }
2049: PetscErrorCode MatGetValues_SeqBAIJ(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], PetscScalar v[])
2050: {
2051: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2052: PetscInt *rp, k, low, high, t, row, nrow, i, col, l, *aj = a->j;
2053: PetscInt *ai = a->i, *ailen = a->ilen;
2054: PetscInt brow, bcol, ridx, cidx, bs = A->rmap->bs, bs2 = a->bs2;
2055: MatScalar *ap, *aa = a->a;
2057: for (k = 0; k < m; k++) { /* loop over rows */
2058: row = im[k];
2059: brow = row / bs;
2060: if (row < 0) {
2061: v += n;
2062: continue;
2063: } /* negative row */
2065: rp = aj ? aj + ai[brow] : NULL; /* mustn't add to NULL, that is UB */
2066: ap = aa ? aa + bs2 * ai[brow] : NULL; /* mustn't add to NULL, that is UB */
2067: nrow = ailen[brow];
2068: for (l = 0; l < n; l++) { /* loop over columns */
2069: if (in[l] < 0) {
2070: v++;
2071: continue;
2072: } /* negative column */
2074: col = in[l];
2075: bcol = col / bs;
2076: cidx = col % bs;
2077: ridx = row % bs;
2078: high = nrow;
2079: low = 0; /* assume unsorted */
2080: while (high - low > 5) {
2081: t = (low + high) / 2;
2082: if (rp[t] > bcol) high = t;
2083: else low = t;
2084: }
2085: for (i = low; i < high; i++) {
2086: if (rp[i] > bcol) break;
2087: if (rp[i] == bcol) {
2088: *v++ = ap[bs2 * i + bs * cidx + ridx];
2089: goto finished;
2090: }
2091: }
2092: *v++ = 0.0;
2093: finished:;
2094: }
2095: }
2096: return 0;
2097: }
2099: PetscErrorCode MatSetValuesBlocked_SeqBAIJ(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], const PetscScalar v[], InsertMode is)
2100: {
2101: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2102: PetscInt *rp, k, low, high, t, ii, jj, row, nrow, i, col, l, rmax, N, lastcol = -1;
2103: PetscInt *imax = a->imax, *ai = a->i, *ailen = a->ilen;
2104: PetscInt *aj = a->j, nonew = a->nonew, bs2 = a->bs2, bs = A->rmap->bs, stepval;
2105: PetscBool roworiented = a->roworiented;
2106: const PetscScalar *value = v;
2107: MatScalar *ap = NULL, *aa = a->a, *bap;
2109: if (roworiented) {
2110: stepval = (n - 1) * bs;
2111: } else {
2112: stepval = (m - 1) * bs;
2113: }
2114: for (k = 0; k < m; k++) { /* loop over added rows */
2115: row = im[k];
2116: if (row < 0) continue;
2118: rp = aj + ai[row];
2119: if (!A->structure_only) ap = aa + bs2 * ai[row];
2120: rmax = imax[row];
2121: nrow = ailen[row];
2122: low = 0;
2123: high = nrow;
2124: for (l = 0; l < n; l++) { /* loop over added columns */
2125: if (in[l] < 0) continue;
2127: col = in[l];
2128: if (!A->structure_only) {
2129: if (roworiented) {
2130: value = v + (k * (stepval + bs) + l) * bs;
2131: } else {
2132: value = v + (l * (stepval + bs) + k) * bs;
2133: }
2134: }
2135: if (col <= lastcol) low = 0;
2136: else high = nrow;
2137: lastcol = col;
2138: while (high - low > 7) {
2139: t = (low + high) / 2;
2140: if (rp[t] > col) high = t;
2141: else low = t;
2142: }
2143: for (i = low; i < high; i++) {
2144: if (rp[i] > col) break;
2145: if (rp[i] == col) {
2146: if (A->structure_only) goto noinsert2;
2147: bap = ap + bs2 * i;
2148: if (roworiented) {
2149: if (is == ADD_VALUES) {
2150: for (ii = 0; ii < bs; ii++, value += stepval) {
2151: for (jj = ii; jj < bs2; jj += bs) bap[jj] += *value++;
2152: }
2153: } else {
2154: for (ii = 0; ii < bs; ii++, value += stepval) {
2155: for (jj = ii; jj < bs2; jj += bs) bap[jj] = *value++;
2156: }
2157: }
2158: } else {
2159: if (is == ADD_VALUES) {
2160: for (ii = 0; ii < bs; ii++, value += bs + stepval) {
2161: for (jj = 0; jj < bs; jj++) bap[jj] += value[jj];
2162: bap += bs;
2163: }
2164: } else {
2165: for (ii = 0; ii < bs; ii++, value += bs + stepval) {
2166: for (jj = 0; jj < bs; jj++) bap[jj] = value[jj];
2167: bap += bs;
2168: }
2169: }
2170: }
2171: goto noinsert2;
2172: }
2173: }
2174: if (nonew == 1) goto noinsert2;
2176: if (A->structure_only) {
2177: MatSeqXAIJReallocateAIJ_structure_only(A, a->mbs, bs2, nrow, row, col, rmax, ai, aj, rp, imax, nonew, MatScalar);
2178: } else {
2179: MatSeqXAIJReallocateAIJ(A, a->mbs, bs2, nrow, row, col, rmax, aa, ai, aj, rp, ap, imax, nonew, MatScalar);
2180: }
2181: N = nrow++ - 1;
2182: high++;
2183: /* shift up all the later entries in this row */
2184: PetscArraymove(rp + i + 1, rp + i, N - i + 1);
2185: rp[i] = col;
2186: if (!A->structure_only) {
2187: PetscArraymove(ap + bs2 * (i + 1), ap + bs2 * i, bs2 * (N - i + 1));
2188: bap = ap + bs2 * i;
2189: if (roworiented) {
2190: for (ii = 0; ii < bs; ii++, value += stepval) {
2191: for (jj = ii; jj < bs2; jj += bs) bap[jj] = *value++;
2192: }
2193: } else {
2194: for (ii = 0; ii < bs; ii++, value += stepval) {
2195: for (jj = 0; jj < bs; jj++) *bap++ = *value++;
2196: }
2197: }
2198: }
2199: noinsert2:;
2200: low = i;
2201: }
2202: ailen[row] = nrow;
2203: }
2204: return 0;
2205: }
2207: PetscErrorCode MatAssemblyEnd_SeqBAIJ(Mat A, MatAssemblyType mode)
2208: {
2209: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2210: PetscInt fshift = 0, i, *ai = a->i, *aj = a->j, *imax = a->imax;
2211: PetscInt m = A->rmap->N, *ip, N, *ailen = a->ilen;
2212: PetscInt mbs = a->mbs, bs2 = a->bs2, rmax = 0;
2213: MatScalar *aa = a->a, *ap;
2214: PetscReal ratio = 0.6;
2216: if (mode == MAT_FLUSH_ASSEMBLY) return 0;
2218: if (m) rmax = ailen[0];
2219: for (i = 1; i < mbs; i++) {
2220: /* move each row back by the amount of empty slots (fshift) before it*/
2221: fshift += imax[i - 1] - ailen[i - 1];
2222: rmax = PetscMax(rmax, ailen[i]);
2223: if (fshift) {
2224: ip = aj + ai[i];
2225: ap = aa + bs2 * ai[i];
2226: N = ailen[i];
2227: PetscArraymove(ip - fshift, ip, N);
2228: if (!A->structure_only) PetscArraymove(ap - bs2 * fshift, ap, bs2 * N);
2229: }
2230: ai[i] = ai[i - 1] + ailen[i - 1];
2231: }
2232: if (mbs) {
2233: fshift += imax[mbs - 1] - ailen[mbs - 1];
2234: ai[mbs] = ai[mbs - 1] + ailen[mbs - 1];
2235: }
2237: /* reset ilen and imax for each row */
2238: a->nonzerorowcnt = 0;
2239: if (A->structure_only) {
2240: PetscFree2(a->imax, a->ilen);
2241: } else { /* !A->structure_only */
2242: for (i = 0; i < mbs; i++) {
2243: ailen[i] = imax[i] = ai[i + 1] - ai[i];
2244: a->nonzerorowcnt += ((ai[i + 1] - ai[i]) > 0);
2245: }
2246: }
2247: a->nz = ai[mbs];
2249: /* diagonals may have moved, so kill the diagonal pointers */
2250: a->idiagvalid = PETSC_FALSE;
2251: if (fshift && a->diag) {
2252: PetscFree(a->diag);
2253: a->diag = NULL;
2254: }
2256: PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT ", block size %" PetscInt_FMT "; storage space: %" PetscInt_FMT " unneeded, %" PetscInt_FMT " used\n", m, A->cmap->n, A->rmap->bs, fshift * bs2, a->nz * bs2);
2257: PetscInfo(A, "Number of mallocs during MatSetValues is %" PetscInt_FMT "\n", a->reallocs);
2258: PetscInfo(A, "Most nonzeros blocks in any row is %" PetscInt_FMT "\n", rmax);
2260: A->info.mallocs += a->reallocs;
2261: a->reallocs = 0;
2262: A->info.nz_unneeded = (PetscReal)fshift * bs2;
2263: a->rmax = rmax;
2265: if (!A->structure_only) MatCheckCompressedRow(A, a->nonzerorowcnt, &a->compressedrow, a->i, mbs, ratio);
2266: return 0;
2267: }
2269: /*
2270: This function returns an array of flags which indicate the locations of contiguous
2271: blocks that should be zeroed. for eg: if bs = 3 and is = [0,1,2,3,5,6,7,8,9]
2272: then the resulting sizes = [3,1,1,3,1] corresponding to sets [(0,1,2),(3),(5),(6,7,8),(9)]
2273: Assume: sizes should be long enough to hold all the values.
2274: */
2275: static PetscErrorCode MatZeroRows_SeqBAIJ_Check_Blocks(PetscInt idx[], PetscInt n, PetscInt bs, PetscInt sizes[], PetscInt *bs_max)
2276: {
2277: PetscInt i, j, k, row;
2278: PetscBool flg;
2280: for (i = 0, j = 0; i < n; j++) {
2281: row = idx[i];
2282: if (row % bs != 0) { /* Not the beginning of a block */
2283: sizes[j] = 1;
2284: i++;
2285: } else if (i + bs > n) { /* complete block doesn't exist (at idx end) */
2286: sizes[j] = 1; /* Also makes sure at least 'bs' values exist for next else */
2287: i++;
2288: } else { /* Beginning of the block, so check if the complete block exists */
2289: flg = PETSC_TRUE;
2290: for (k = 1; k < bs; k++) {
2291: if (row + k != idx[i + k]) { /* break in the block */
2292: flg = PETSC_FALSE;
2293: break;
2294: }
2295: }
2296: if (flg) { /* No break in the bs */
2297: sizes[j] = bs;
2298: i += bs;
2299: } else {
2300: sizes[j] = 1;
2301: i++;
2302: }
2303: }
2304: }
2305: *bs_max = j;
2306: return 0;
2307: }
2309: PetscErrorCode MatZeroRows_SeqBAIJ(Mat A, PetscInt is_n, const PetscInt is_idx[], PetscScalar diag, Vec x, Vec b)
2310: {
2311: Mat_SeqBAIJ *baij = (Mat_SeqBAIJ *)A->data;
2312: PetscInt i, j, k, count, *rows;
2313: PetscInt bs = A->rmap->bs, bs2 = baij->bs2, *sizes, row, bs_max;
2314: PetscScalar zero = 0.0;
2315: MatScalar *aa;
2316: const PetscScalar *xx;
2317: PetscScalar *bb;
2319: /* fix right hand side if needed */
2320: if (x && b) {
2321: VecGetArrayRead(x, &xx);
2322: VecGetArray(b, &bb);
2323: for (i = 0; i < is_n; i++) bb[is_idx[i]] = diag * xx[is_idx[i]];
2324: VecRestoreArrayRead(x, &xx);
2325: VecRestoreArray(b, &bb);
2326: }
2328: /* Make a copy of the IS and sort it */
2329: /* allocate memory for rows,sizes */
2330: PetscMalloc2(is_n, &rows, 2 * is_n, &sizes);
2332: /* copy IS values to rows, and sort them */
2333: for (i = 0; i < is_n; i++) rows[i] = is_idx[i];
2334: PetscSortInt(is_n, rows);
2336: if (baij->keepnonzeropattern) {
2337: for (i = 0; i < is_n; i++) sizes[i] = 1;
2338: bs_max = is_n;
2339: } else {
2340: MatZeroRows_SeqBAIJ_Check_Blocks(rows, is_n, bs, sizes, &bs_max);
2341: A->nonzerostate++;
2342: }
2344: for (i = 0, j = 0; i < bs_max; j += sizes[i], i++) {
2345: row = rows[j];
2347: count = (baij->i[row / bs + 1] - baij->i[row / bs]) * bs;
2348: aa = ((MatScalar *)(baij->a)) + baij->i[row / bs] * bs2 + (row % bs);
2349: if (sizes[i] == bs && !baij->keepnonzeropattern) {
2350: if (diag != (PetscScalar)0.0) {
2351: if (baij->ilen[row / bs] > 0) {
2352: baij->ilen[row / bs] = 1;
2353: baij->j[baij->i[row / bs]] = row / bs;
2355: PetscArrayzero(aa, count * bs);
2356: }
2357: /* Now insert all the diagonal values for this bs */
2358: for (k = 0; k < bs; k++) (*A->ops->setvalues)(A, 1, rows + j + k, 1, rows + j + k, &diag, INSERT_VALUES);
2359: } else { /* (diag == 0.0) */
2360: baij->ilen[row / bs] = 0;
2361: } /* end (diag == 0.0) */
2362: } else { /* (sizes[i] != bs) */
2363: PetscAssert(sizes[i] == 1, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Internal Error. Value should be 1");
2364: for (k = 0; k < count; k++) {
2365: aa[0] = zero;
2366: aa += bs;
2367: }
2368: if (diag != (PetscScalar)0.0) (*A->ops->setvalues)(A, 1, rows + j, 1, rows + j, &diag, INSERT_VALUES);
2369: }
2370: }
2372: PetscFree2(rows, sizes);
2373: MatAssemblyEnd_SeqBAIJ(A, MAT_FINAL_ASSEMBLY);
2374: return 0;
2375: }
2377: PetscErrorCode MatZeroRowsColumns_SeqBAIJ(Mat A, PetscInt is_n, const PetscInt is_idx[], PetscScalar diag, Vec x, Vec b)
2378: {
2379: Mat_SeqBAIJ *baij = (Mat_SeqBAIJ *)A->data;
2380: PetscInt i, j, k, count;
2381: PetscInt bs = A->rmap->bs, bs2 = baij->bs2, row, col;
2382: PetscScalar zero = 0.0;
2383: MatScalar *aa;
2384: const PetscScalar *xx;
2385: PetscScalar *bb;
2386: PetscBool *zeroed, vecs = PETSC_FALSE;
2388: /* fix right hand side if needed */
2389: if (x && b) {
2390: VecGetArrayRead(x, &xx);
2391: VecGetArray(b, &bb);
2392: vecs = PETSC_TRUE;
2393: }
2395: /* zero the columns */
2396: PetscCalloc1(A->rmap->n, &zeroed);
2397: for (i = 0; i < is_n; i++) {
2399: zeroed[is_idx[i]] = PETSC_TRUE;
2400: }
2401: for (i = 0; i < A->rmap->N; i++) {
2402: if (!zeroed[i]) {
2403: row = i / bs;
2404: for (j = baij->i[row]; j < baij->i[row + 1]; j++) {
2405: for (k = 0; k < bs; k++) {
2406: col = bs * baij->j[j] + k;
2407: if (zeroed[col]) {
2408: aa = ((MatScalar *)(baij->a)) + j * bs2 + (i % bs) + bs * k;
2409: if (vecs) bb[i] -= aa[0] * xx[col];
2410: aa[0] = 0.0;
2411: }
2412: }
2413: }
2414: } else if (vecs) bb[i] = diag * xx[i];
2415: }
2416: PetscFree(zeroed);
2417: if (vecs) {
2418: VecRestoreArrayRead(x, &xx);
2419: VecRestoreArray(b, &bb);
2420: }
2422: /* zero the rows */
2423: for (i = 0; i < is_n; i++) {
2424: row = is_idx[i];
2425: count = (baij->i[row / bs + 1] - baij->i[row / bs]) * bs;
2426: aa = ((MatScalar *)(baij->a)) + baij->i[row / bs] * bs2 + (row % bs);
2427: for (k = 0; k < count; k++) {
2428: aa[0] = zero;
2429: aa += bs;
2430: }
2431: if (diag != (PetscScalar)0.0) PetscUseTypeMethod(A, setvalues, 1, &row, 1, &row, &diag, INSERT_VALUES);
2432: }
2433: MatAssemblyEnd_SeqBAIJ(A, MAT_FINAL_ASSEMBLY);
2434: return 0;
2435: }
2437: PetscErrorCode MatSetValues_SeqBAIJ(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], const PetscScalar v[], InsertMode is)
2438: {
2439: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2440: PetscInt *rp, k, low, high, t, ii, row, nrow, i, col, l, rmax, N, lastcol = -1;
2441: PetscInt *imax = a->imax, *ai = a->i, *ailen = a->ilen;
2442: PetscInt *aj = a->j, nonew = a->nonew, bs = A->rmap->bs, brow, bcol;
2443: PetscInt ridx, cidx, bs2 = a->bs2;
2444: PetscBool roworiented = a->roworiented;
2445: MatScalar *ap = NULL, value = 0.0, *aa = a->a, *bap;
2447: for (k = 0; k < m; k++) { /* loop over added rows */
2448: row = im[k];
2449: brow = row / bs;
2450: if (row < 0) continue;
2452: rp = aj + ai[brow];
2453: if (!A->structure_only) ap = aa + bs2 * ai[brow];
2454: rmax = imax[brow];
2455: nrow = ailen[brow];
2456: low = 0;
2457: high = nrow;
2458: for (l = 0; l < n; l++) { /* loop over added columns */
2459: if (in[l] < 0) continue;
2461: col = in[l];
2462: bcol = col / bs;
2463: ridx = row % bs;
2464: cidx = col % bs;
2465: if (!A->structure_only) {
2466: if (roworiented) {
2467: value = v[l + k * n];
2468: } else {
2469: value = v[k + l * m];
2470: }
2471: }
2472: if (col <= lastcol) low = 0;
2473: else high = nrow;
2474: lastcol = col;
2475: while (high - low > 7) {
2476: t = (low + high) / 2;
2477: if (rp[t] > bcol) high = t;
2478: else low = t;
2479: }
2480: for (i = low; i < high; i++) {
2481: if (rp[i] > bcol) break;
2482: if (rp[i] == bcol) {
2483: bap = ap + bs2 * i + bs * cidx + ridx;
2484: if (!A->structure_only) {
2485: if (is == ADD_VALUES) *bap += value;
2486: else *bap = value;
2487: }
2488: goto noinsert1;
2489: }
2490: }
2491: if (nonew == 1) goto noinsert1;
2493: if (A->structure_only) {
2494: MatSeqXAIJReallocateAIJ_structure_only(A, a->mbs, bs2, nrow, brow, bcol, rmax, ai, aj, rp, imax, nonew, MatScalar);
2495: } else {
2496: MatSeqXAIJReallocateAIJ(A, a->mbs, bs2, nrow, brow, bcol, rmax, aa, ai, aj, rp, ap, imax, nonew, MatScalar);
2497: }
2498: N = nrow++ - 1;
2499: high++;
2500: /* shift up all the later entries in this row */
2501: PetscArraymove(rp + i + 1, rp + i, N - i + 1);
2502: rp[i] = bcol;
2503: if (!A->structure_only) {
2504: PetscArraymove(ap + bs2 * (i + 1), ap + bs2 * i, bs2 * (N - i + 1));
2505: PetscArrayzero(ap + bs2 * i, bs2);
2506: ap[bs2 * i + bs * cidx + ridx] = value;
2507: }
2508: a->nz++;
2509: A->nonzerostate++;
2510: noinsert1:;
2511: low = i;
2512: }
2513: ailen[brow] = nrow;
2514: }
2515: return 0;
2516: }
2518: PetscErrorCode MatILUFactor_SeqBAIJ(Mat inA, IS row, IS col, const MatFactorInfo *info)
2519: {
2520: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)inA->data;
2521: Mat outA;
2522: PetscBool row_identity, col_identity;
2525: ISIdentity(row, &row_identity);
2526: ISIdentity(col, &col_identity);
2529: outA = inA;
2530: inA->factortype = MAT_FACTOR_LU;
2531: PetscFree(inA->solvertype);
2532: PetscStrallocpy(MATSOLVERPETSC, &inA->solvertype);
2534: MatMarkDiagonal_SeqBAIJ(inA);
2536: PetscObjectReference((PetscObject)row);
2537: ISDestroy(&a->row);
2538: a->row = row;
2539: PetscObjectReference((PetscObject)col);
2540: ISDestroy(&a->col);
2541: a->col = col;
2543: /* Create the invert permutation so that it can be used in MatLUFactorNumeric() */
2544: ISDestroy(&a->icol);
2545: ISInvertPermutation(col, PETSC_DECIDE, &a->icol);
2547: MatSeqBAIJSetNumericFactorization_inplace(inA, (PetscBool)(row_identity && col_identity));
2548: if (!a->solve_work) { PetscMalloc1(inA->rmap->N + inA->rmap->bs, &a->solve_work); }
2549: MatLUFactorNumeric(outA, inA, info);
2550: return 0;
2551: }
2553: PetscErrorCode MatSeqBAIJSetColumnIndices_SeqBAIJ(Mat mat, PetscInt *indices)
2554: {
2555: Mat_SeqBAIJ *baij = (Mat_SeqBAIJ *)mat->data;
2556: PetscInt i, nz, mbs;
2558: nz = baij->maxnz;
2559: mbs = baij->mbs;
2560: for (i = 0; i < nz; i++) baij->j[i] = indices[i];
2561: baij->nz = nz;
2562: for (i = 0; i < mbs; i++) baij->ilen[i] = baij->imax[i];
2563: return 0;
2564: }
2566: /*@
2567: MatSeqBAIJSetColumnIndices - Set the column indices for all the rows in the matrix.
2569: Input Parameters:
2570: + mat - the `MATSEQBAIJ` matrix
2571: - indices - the column indices
2573: Level: advanced
2575: Notes:
2576: This can be called if you have precomputed the nonzero structure of the
2577: matrix and want to provide it to the matrix object to improve the performance
2578: of the `MatSetValues()` operation.
2580: You MUST have set the correct numbers of nonzeros per row in the call to
2581: `MatCreateSeqBAIJ()`, and the columns indices MUST be sorted.
2583: MUST be called before any calls to `MatSetValues()`
2585: .seealso: `MATSEQBAIJ`, `MatSetValues()`
2586: @*/
2587: PetscErrorCode MatSeqBAIJSetColumnIndices(Mat mat, PetscInt *indices)
2588: {
2591: PetscUseMethod(mat, "MatSeqBAIJSetColumnIndices_C", (Mat, PetscInt *), (mat, indices));
2592: return 0;
2593: }
2595: PetscErrorCode MatGetRowMaxAbs_SeqBAIJ(Mat A, Vec v, PetscInt idx[])
2596: {
2597: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2598: PetscInt i, j, n, row, bs, *ai, *aj, mbs;
2599: PetscReal atmp;
2600: PetscScalar *x, zero = 0.0;
2601: MatScalar *aa;
2602: PetscInt ncols, brow, krow, kcol;
2604: /* why is this not a macro???????????????????????????????????????????????????????????????? */
2606: bs = A->rmap->bs;
2607: aa = a->a;
2608: ai = a->i;
2609: aj = a->j;
2610: mbs = a->mbs;
2612: VecSet(v, zero);
2613: VecGetArray(v, &x);
2614: VecGetLocalSize(v, &n);
2616: for (i = 0; i < mbs; i++) {
2617: ncols = ai[1] - ai[0];
2618: ai++;
2619: brow = bs * i;
2620: for (j = 0; j < ncols; j++) {
2621: for (kcol = 0; kcol < bs; kcol++) {
2622: for (krow = 0; krow < bs; krow++) {
2623: atmp = PetscAbsScalar(*aa);
2624: aa++;
2625: row = brow + krow; /* row index */
2626: if (PetscAbsScalar(x[row]) < atmp) {
2627: x[row] = atmp;
2628: if (idx) idx[row] = bs * (*aj) + kcol;
2629: }
2630: }
2631: }
2632: aj++;
2633: }
2634: }
2635: VecRestoreArray(v, &x);
2636: return 0;
2637: }
2639: PetscErrorCode MatCopy_SeqBAIJ(Mat A, Mat B, MatStructure str)
2640: {
2641: /* If the two matrices have the same copy implementation, use fast copy. */
2642: if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) {
2643: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2644: Mat_SeqBAIJ *b = (Mat_SeqBAIJ *)B->data;
2645: PetscInt ambs = a->mbs, bmbs = b->mbs, abs = A->rmap->bs, bbs = B->rmap->bs, bs2 = abs * abs;
2649: PetscArraycpy(b->a, a->a, bs2 * a->i[ambs]);
2650: PetscObjectStateIncrease((PetscObject)B);
2651: } else {
2652: MatCopy_Basic(A, B, str);
2653: }
2654: return 0;
2655: }
2657: PetscErrorCode MatSetUp_SeqBAIJ(Mat A)
2658: {
2659: MatSeqBAIJSetPreallocation(A, A->rmap->bs, PETSC_DEFAULT, NULL);
2660: return 0;
2661: }
2663: static PetscErrorCode MatSeqBAIJGetArray_SeqBAIJ(Mat A, PetscScalar *array[])
2664: {
2665: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2667: *array = a->a;
2668: return 0;
2669: }
2671: static PetscErrorCode MatSeqBAIJRestoreArray_SeqBAIJ(Mat A, PetscScalar *array[])
2672: {
2673: *array = NULL;
2674: return 0;
2675: }
2677: PetscErrorCode MatAXPYGetPreallocation_SeqBAIJ(Mat Y, Mat X, PetscInt *nnz)
2678: {
2679: PetscInt bs = Y->rmap->bs, mbs = Y->rmap->N / bs;
2680: Mat_SeqBAIJ *x = (Mat_SeqBAIJ *)X->data;
2681: Mat_SeqBAIJ *y = (Mat_SeqBAIJ *)Y->data;
2683: /* Set the number of nonzeros in the new matrix */
2684: MatAXPYGetPreallocation_SeqX_private(mbs, x->i, x->j, y->i, y->j, nnz);
2685: return 0;
2686: }
2688: PetscErrorCode MatAXPY_SeqBAIJ(Mat Y, PetscScalar a, Mat X, MatStructure str)
2689: {
2690: Mat_SeqBAIJ *x = (Mat_SeqBAIJ *)X->data, *y = (Mat_SeqBAIJ *)Y->data;
2691: PetscInt bs = Y->rmap->bs, bs2 = bs * bs;
2692: PetscBLASInt one = 1;
2694: if (str == UNKNOWN_NONZERO_PATTERN || (PetscDefined(USE_DEBUG) && str == SAME_NONZERO_PATTERN)) {
2695: PetscBool e = x->nz == y->nz && x->mbs == y->mbs && bs == X->rmap->bs ? PETSC_TRUE : PETSC_FALSE;
2696: if (e) {
2697: PetscArraycmp(x->i, y->i, x->mbs + 1, &e);
2698: if (e) {
2699: PetscArraycmp(x->j, y->j, x->i[x->mbs], &e);
2700: if (e) str = SAME_NONZERO_PATTERN;
2701: }
2702: }
2704: }
2705: if (str == SAME_NONZERO_PATTERN) {
2706: PetscScalar alpha = a;
2707: PetscBLASInt bnz;
2708: PetscBLASIntCast(x->nz * bs2, &bnz);
2709: PetscCallBLAS("BLASaxpy", BLASaxpy_(&bnz, &alpha, x->a, &one, y->a, &one));
2710: PetscObjectStateIncrease((PetscObject)Y);
2711: } else if (str == SUBSET_NONZERO_PATTERN) { /* nonzeros of X is a subset of Y's */
2712: MatAXPY_Basic(Y, a, X, str);
2713: } else {
2714: Mat B;
2715: PetscInt *nnz;
2717: PetscMalloc1(Y->rmap->N, &nnz);
2718: MatCreate(PetscObjectComm((PetscObject)Y), &B);
2719: PetscObjectSetName((PetscObject)B, ((PetscObject)Y)->name);
2720: MatSetSizes(B, Y->rmap->n, Y->cmap->n, Y->rmap->N, Y->cmap->N);
2721: MatSetBlockSizesFromMats(B, Y, Y);
2722: MatSetType(B, (MatType)((PetscObject)Y)->type_name);
2723: MatAXPYGetPreallocation_SeqBAIJ(Y, X, nnz);
2724: MatSeqBAIJSetPreallocation(B, bs, 0, nnz);
2725: MatAXPY_BasicWithPreallocation(B, Y, a, X, str);
2726: MatHeaderMerge(Y, &B);
2727: PetscFree(nnz);
2728: }
2729: return 0;
2730: }
2732: PETSC_INTERN PetscErrorCode MatConjugate_SeqBAIJ(Mat A)
2733: {
2734: #if defined(PETSC_USE_COMPLEX)
2735: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2736: PetscInt i, nz = a->bs2 * a->i[a->mbs];
2737: MatScalar *aa = a->a;
2739: for (i = 0; i < nz; i++) aa[i] = PetscConj(aa[i]);
2740: #else
2741: #endif
2742: return 0;
2743: }
2745: PetscErrorCode MatRealPart_SeqBAIJ(Mat A)
2746: {
2747: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2748: PetscInt i, nz = a->bs2 * a->i[a->mbs];
2749: MatScalar *aa = a->a;
2751: for (i = 0; i < nz; i++) aa[i] = PetscRealPart(aa[i]);
2752: return 0;
2753: }
2755: PetscErrorCode MatImaginaryPart_SeqBAIJ(Mat A)
2756: {
2757: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2758: PetscInt i, nz = a->bs2 * a->i[a->mbs];
2759: MatScalar *aa = a->a;
2761: for (i = 0; i < nz; i++) aa[i] = PetscImaginaryPart(aa[i]);
2762: return 0;
2763: }
2765: /*
2766: Code almost identical to MatGetColumnIJ_SeqAIJ() should share common code
2767: */
2768: PetscErrorCode MatGetColumnIJ_SeqBAIJ(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool inodecompressed, PetscInt *nn, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
2769: {
2770: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2771: PetscInt bs = A->rmap->bs, i, *collengths, *cia, *cja, n = A->cmap->n / bs, m = A->rmap->n / bs;
2772: PetscInt nz = a->i[m], row, *jj, mr, col;
2774: *nn = n;
2775: if (!ia) return 0;
2777: PetscCalloc1(n, &collengths);
2778: PetscMalloc1(n + 1, &cia);
2779: PetscMalloc1(nz, &cja);
2780: jj = a->j;
2781: for (i = 0; i < nz; i++) collengths[jj[i]]++;
2782: cia[0] = oshift;
2783: for (i = 0; i < n; i++) cia[i + 1] = cia[i] + collengths[i];
2784: PetscArrayzero(collengths, n);
2785: jj = a->j;
2786: for (row = 0; row < m; row++) {
2787: mr = a->i[row + 1] - a->i[row];
2788: for (i = 0; i < mr; i++) {
2789: col = *jj++;
2791: cja[cia[col] + collengths[col]++ - oshift] = row + oshift;
2792: }
2793: }
2794: PetscFree(collengths);
2795: *ia = cia;
2796: *ja = cja;
2797: return 0;
2798: }
2800: PetscErrorCode MatRestoreColumnIJ_SeqBAIJ(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool inodecompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
2801: {
2802: if (!ia) return 0;
2803: PetscFree(*ia);
2804: PetscFree(*ja);
2805: return 0;
2806: }
2808: /*
2809: MatGetColumnIJ_SeqBAIJ_Color() and MatRestoreColumnIJ_SeqBAIJ_Color() are customized from
2810: MatGetColumnIJ_SeqBAIJ() and MatRestoreColumnIJ_SeqBAIJ() by adding an output
2811: spidx[], index of a->a, to be used in MatTransposeColoringCreate() and MatFDColoringCreate()
2812: */
2813: PetscErrorCode MatGetColumnIJ_SeqBAIJ_Color(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool inodecompressed, PetscInt *nn, const PetscInt *ia[], const PetscInt *ja[], PetscInt *spidx[], PetscBool *done)
2814: {
2815: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2816: PetscInt i, *collengths, *cia, *cja, n = a->nbs, m = a->mbs;
2817: PetscInt nz = a->i[m], row, *jj, mr, col;
2818: PetscInt *cspidx;
2820: *nn = n;
2821: if (!ia) return 0;
2823: PetscCalloc1(n, &collengths);
2824: PetscMalloc1(n + 1, &cia);
2825: PetscMalloc1(nz, &cja);
2826: PetscMalloc1(nz, &cspidx);
2827: jj = a->j;
2828: for (i = 0; i < nz; i++) collengths[jj[i]]++;
2829: cia[0] = oshift;
2830: for (i = 0; i < n; i++) cia[i + 1] = cia[i] + collengths[i];
2831: PetscArrayzero(collengths, n);
2832: jj = a->j;
2833: for (row = 0; row < m; row++) {
2834: mr = a->i[row + 1] - a->i[row];
2835: for (i = 0; i < mr; i++) {
2836: col = *jj++;
2837: cspidx[cia[col] + collengths[col] - oshift] = a->i[row] + i; /* index of a->j */
2838: cja[cia[col] + collengths[col]++ - oshift] = row + oshift;
2839: }
2840: }
2841: PetscFree(collengths);
2842: *ia = cia;
2843: *ja = cja;
2844: *spidx = cspidx;
2845: return 0;
2846: }
2848: PetscErrorCode MatRestoreColumnIJ_SeqBAIJ_Color(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool inodecompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscInt *spidx[], PetscBool *done)
2849: {
2850: MatRestoreColumnIJ_SeqBAIJ(A, oshift, symmetric, inodecompressed, n, ia, ja, done);
2851: PetscFree(*spidx);
2852: return 0;
2853: }
2855: PetscErrorCode MatShift_SeqBAIJ(Mat Y, PetscScalar a)
2856: {
2857: Mat_SeqBAIJ *aij = (Mat_SeqBAIJ *)Y->data;
2859: if (!Y->preallocated || !aij->nz) MatSeqBAIJSetPreallocation(Y, Y->rmap->bs, 1, NULL);
2860: MatShift_Basic(Y, a);
2861: return 0;
2862: }
2864: /* -------------------------------------------------------------------*/
2865: static struct _MatOps MatOps_Values = {
2866: MatSetValues_SeqBAIJ,
2867: MatGetRow_SeqBAIJ,
2868: MatRestoreRow_SeqBAIJ,
2869: MatMult_SeqBAIJ_N,
2870: /* 4*/ MatMultAdd_SeqBAIJ_N,
2871: MatMultTranspose_SeqBAIJ,
2872: MatMultTransposeAdd_SeqBAIJ,
2873: NULL,
2874: NULL,
2875: NULL,
2876: /* 10*/ NULL,
2877: MatLUFactor_SeqBAIJ,
2878: NULL,
2879: NULL,
2880: MatTranspose_SeqBAIJ,
2881: /* 15*/ MatGetInfo_SeqBAIJ,
2882: MatEqual_SeqBAIJ,
2883: MatGetDiagonal_SeqBAIJ,
2884: MatDiagonalScale_SeqBAIJ,
2885: MatNorm_SeqBAIJ,
2886: /* 20*/ NULL,
2887: MatAssemblyEnd_SeqBAIJ,
2888: MatSetOption_SeqBAIJ,
2889: MatZeroEntries_SeqBAIJ,
2890: /* 24*/ MatZeroRows_SeqBAIJ,
2891: NULL,
2892: NULL,
2893: NULL,
2894: NULL,
2895: /* 29*/ MatSetUp_SeqBAIJ,
2896: NULL,
2897: NULL,
2898: NULL,
2899: NULL,
2900: /* 34*/ MatDuplicate_SeqBAIJ,
2901: NULL,
2902: NULL,
2903: MatILUFactor_SeqBAIJ,
2904: NULL,
2905: /* 39*/ MatAXPY_SeqBAIJ,
2906: MatCreateSubMatrices_SeqBAIJ,
2907: MatIncreaseOverlap_SeqBAIJ,
2908: MatGetValues_SeqBAIJ,
2909: MatCopy_SeqBAIJ,
2910: /* 44*/ NULL,
2911: MatScale_SeqBAIJ,
2912: MatShift_SeqBAIJ,
2913: NULL,
2914: MatZeroRowsColumns_SeqBAIJ,
2915: /* 49*/ NULL,
2916: MatGetRowIJ_SeqBAIJ,
2917: MatRestoreRowIJ_SeqBAIJ,
2918: MatGetColumnIJ_SeqBAIJ,
2919: MatRestoreColumnIJ_SeqBAIJ,
2920: /* 54*/ MatFDColoringCreate_SeqXAIJ,
2921: NULL,
2922: NULL,
2923: NULL,
2924: MatSetValuesBlocked_SeqBAIJ,
2925: /* 59*/ MatCreateSubMatrix_SeqBAIJ,
2926: MatDestroy_SeqBAIJ,
2927: MatView_SeqBAIJ,
2928: NULL,
2929: NULL,
2930: /* 64*/ NULL,
2931: NULL,
2932: NULL,
2933: NULL,
2934: NULL,
2935: /* 69*/ MatGetRowMaxAbs_SeqBAIJ,
2936: NULL,
2937: MatConvert_Basic,
2938: NULL,
2939: NULL,
2940: /* 74*/ NULL,
2941: MatFDColoringApply_BAIJ,
2942: NULL,
2943: NULL,
2944: NULL,
2945: /* 79*/ NULL,
2946: NULL,
2947: NULL,
2948: NULL,
2949: MatLoad_SeqBAIJ,
2950: /* 84*/ NULL,
2951: NULL,
2952: NULL,
2953: NULL,
2954: NULL,
2955: /* 89*/ NULL,
2956: NULL,
2957: NULL,
2958: NULL,
2959: NULL,
2960: /* 94*/ NULL,
2961: NULL,
2962: NULL,
2963: NULL,
2964: NULL,
2965: /* 99*/ NULL,
2966: NULL,
2967: NULL,
2968: MatConjugate_SeqBAIJ,
2969: NULL,
2970: /*104*/ NULL,
2971: MatRealPart_SeqBAIJ,
2972: MatImaginaryPart_SeqBAIJ,
2973: NULL,
2974: NULL,
2975: /*109*/ NULL,
2976: NULL,
2977: NULL,
2978: NULL,
2979: MatMissingDiagonal_SeqBAIJ,
2980: /*114*/ NULL,
2981: NULL,
2982: NULL,
2983: NULL,
2984: NULL,
2985: /*119*/ NULL,
2986: NULL,
2987: MatMultHermitianTranspose_SeqBAIJ,
2988: MatMultHermitianTransposeAdd_SeqBAIJ,
2989: NULL,
2990: /*124*/ NULL,
2991: MatGetColumnReductions_SeqBAIJ,
2992: MatInvertBlockDiagonal_SeqBAIJ,
2993: NULL,
2994: NULL,
2995: /*129*/ NULL,
2996: NULL,
2997: NULL,
2998: NULL,
2999: NULL,
3000: /*134*/ NULL,
3001: NULL,
3002: NULL,
3003: NULL,
3004: NULL,
3005: /*139*/ MatSetBlockSizes_Default,
3006: NULL,
3007: NULL,
3008: MatFDColoringSetUp_SeqXAIJ,
3009: NULL,
3010: /*144*/ MatCreateMPIMatConcatenateSeqMat_SeqBAIJ,
3011: MatDestroySubMatrices_SeqBAIJ,
3012: NULL,
3013: NULL,
3014: NULL,
3015: NULL,
3016: /*150*/ NULL,
3017: };
3019: PetscErrorCode MatStoreValues_SeqBAIJ(Mat mat)
3020: {
3021: Mat_SeqBAIJ *aij = (Mat_SeqBAIJ *)mat->data;
3022: PetscInt nz = aij->i[aij->mbs] * aij->bs2;
3026: /* allocate space for values if not already there */
3027: if (!aij->saved_values) { PetscMalloc1(nz + 1, &aij->saved_values); }
3029: /* copy values over */
3030: PetscArraycpy(aij->saved_values, aij->a, nz);
3031: return 0;
3032: }
3034: PetscErrorCode MatRetrieveValues_SeqBAIJ(Mat mat)
3035: {
3036: Mat_SeqBAIJ *aij = (Mat_SeqBAIJ *)mat->data;
3037: PetscInt nz = aij->i[aij->mbs] * aij->bs2;
3042: /* copy values over */
3043: PetscArraycpy(aij->a, aij->saved_values, nz);
3044: return 0;
3045: }
3047: PETSC_INTERN PetscErrorCode MatConvert_SeqBAIJ_SeqAIJ(Mat, MatType, MatReuse, Mat *);
3048: PETSC_INTERN PetscErrorCode MatConvert_SeqBAIJ_SeqSBAIJ(Mat, MatType, MatReuse, Mat *);
3050: PetscErrorCode MatSeqBAIJSetPreallocation_SeqBAIJ(Mat B, PetscInt bs, PetscInt nz, PetscInt *nnz)
3051: {
3052: Mat_SeqBAIJ *b;
3053: PetscInt i, mbs, nbs, bs2;
3054: PetscBool flg = PETSC_FALSE, skipallocation = PETSC_FALSE, realalloc = PETSC_FALSE;
3056: if (nz >= 0 || nnz) realalloc = PETSC_TRUE;
3057: if (nz == MAT_SKIP_ALLOCATION) {
3058: skipallocation = PETSC_TRUE;
3059: nz = 0;
3060: }
3062: MatSetBlockSize(B, PetscAbs(bs));
3063: PetscLayoutSetUp(B->rmap);
3064: PetscLayoutSetUp(B->cmap);
3065: PetscLayoutGetBlockSize(B->rmap, &bs);
3067: B->preallocated = PETSC_TRUE;
3069: mbs = B->rmap->n / bs;
3070: nbs = B->cmap->n / bs;
3071: bs2 = bs * bs;
3075: if (nz == PETSC_DEFAULT || nz == PETSC_DECIDE) nz = 5;
3077: if (nnz) {
3078: for (i = 0; i < mbs; i++) {
3081: }
3082: }
3084: b = (Mat_SeqBAIJ *)B->data;
3085: PetscOptionsBegin(PetscObjectComm((PetscObject)B), NULL, "Optimize options for SEQBAIJ matrix 2 ", "Mat");
3086: PetscOptionsBool("-mat_no_unroll", "Do not optimize for block size (slow)", NULL, flg, &flg, NULL);
3087: PetscOptionsEnd();
3089: if (!flg) {
3090: switch (bs) {
3091: case 1:
3092: B->ops->mult = MatMult_SeqBAIJ_1;
3093: B->ops->multadd = MatMultAdd_SeqBAIJ_1;
3094: break;
3095: case 2:
3096: B->ops->mult = MatMult_SeqBAIJ_2;
3097: B->ops->multadd = MatMultAdd_SeqBAIJ_2;
3098: break;
3099: case 3:
3100: B->ops->mult = MatMult_SeqBAIJ_3;
3101: B->ops->multadd = MatMultAdd_SeqBAIJ_3;
3102: break;
3103: case 4:
3104: B->ops->mult = MatMult_SeqBAIJ_4;
3105: B->ops->multadd = MatMultAdd_SeqBAIJ_4;
3106: break;
3107: case 5:
3108: B->ops->mult = MatMult_SeqBAIJ_5;
3109: B->ops->multadd = MatMultAdd_SeqBAIJ_5;
3110: break;
3111: case 6:
3112: B->ops->mult = MatMult_SeqBAIJ_6;
3113: B->ops->multadd = MatMultAdd_SeqBAIJ_6;
3114: break;
3115: case 7:
3116: B->ops->mult = MatMult_SeqBAIJ_7;
3117: B->ops->multadd = MatMultAdd_SeqBAIJ_7;
3118: break;
3119: case 9: {
3120: PetscInt version = 1;
3121: PetscOptionsGetInt(NULL, ((PetscObject)B)->prefix, "-mat_baij_mult_version", &version, NULL);
3122: switch (version) {
3123: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
3124: case 1:
3125: B->ops->mult = MatMult_SeqBAIJ_9_AVX2;
3126: B->ops->multadd = MatMultAdd_SeqBAIJ_9_AVX2;
3127: PetscInfo((PetscObject)B, "Using AVX2 for MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", bs);
3128: break;
3129: #endif
3130: default:
3131: B->ops->mult = MatMult_SeqBAIJ_N;
3132: B->ops->multadd = MatMultAdd_SeqBAIJ_N;
3133: PetscInfo((PetscObject)B, "Using BLAS for MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", bs);
3134: break;
3135: }
3136: break;
3137: }
3138: case 11:
3139: B->ops->mult = MatMult_SeqBAIJ_11;
3140: B->ops->multadd = MatMultAdd_SeqBAIJ_11;
3141: break;
3142: case 12: {
3143: PetscInt version = 1;
3144: PetscOptionsGetInt(NULL, ((PetscObject)B)->prefix, "-mat_baij_mult_version", &version, NULL);
3145: switch (version) {
3146: case 1:
3147: B->ops->mult = MatMult_SeqBAIJ_12_ver1;
3148: B->ops->multadd = MatMultAdd_SeqBAIJ_12_ver1;
3149: PetscInfo((PetscObject)B, "Using version %" PetscInt_FMT " of MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", version, bs);
3150: break;
3151: case 2:
3152: B->ops->mult = MatMult_SeqBAIJ_12_ver2;
3153: B->ops->multadd = MatMultAdd_SeqBAIJ_12_ver2;
3154: PetscInfo((PetscObject)B, "Using version %" PetscInt_FMT " of MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", version, bs);
3155: break;
3156: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
3157: case 3:
3158: B->ops->mult = MatMult_SeqBAIJ_12_AVX2;
3159: B->ops->multadd = MatMultAdd_SeqBAIJ_12_ver1;
3160: PetscInfo((PetscObject)B, "Using AVX2 for MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", bs);
3161: break;
3162: #endif
3163: default:
3164: B->ops->mult = MatMult_SeqBAIJ_N;
3165: B->ops->multadd = MatMultAdd_SeqBAIJ_N;
3166: PetscInfo((PetscObject)B, "Using BLAS for MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", bs);
3167: break;
3168: }
3169: break;
3170: }
3171: case 15: {
3172: PetscInt version = 1;
3173: PetscOptionsGetInt(NULL, ((PetscObject)B)->prefix, "-mat_baij_mult_version", &version, NULL);
3174: switch (version) {
3175: case 1:
3176: B->ops->mult = MatMult_SeqBAIJ_15_ver1;
3177: PetscInfo((PetscObject)B, "Using version %" PetscInt_FMT " of MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", version, bs);
3178: break;
3179: case 2:
3180: B->ops->mult = MatMult_SeqBAIJ_15_ver2;
3181: PetscInfo((PetscObject)B, "Using version %" PetscInt_FMT " of MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", version, bs);
3182: break;
3183: case 3:
3184: B->ops->mult = MatMult_SeqBAIJ_15_ver3;
3185: PetscInfo((PetscObject)B, "Using version %" PetscInt_FMT " of MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", version, bs);
3186: break;
3187: case 4:
3188: B->ops->mult = MatMult_SeqBAIJ_15_ver4;
3189: PetscInfo((PetscObject)B, "Using version %" PetscInt_FMT " of MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", version, bs);
3190: break;
3191: default:
3192: B->ops->mult = MatMult_SeqBAIJ_N;
3193: PetscInfo((PetscObject)B, "Using BLAS for MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", bs);
3194: break;
3195: }
3196: B->ops->multadd = MatMultAdd_SeqBAIJ_N;
3197: break;
3198: }
3199: default:
3200: B->ops->mult = MatMult_SeqBAIJ_N;
3201: B->ops->multadd = MatMultAdd_SeqBAIJ_N;
3202: PetscInfo((PetscObject)B, "Using BLAS for MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", bs);
3203: break;
3204: }
3205: }
3206: B->ops->sor = MatSOR_SeqBAIJ;
3207: b->mbs = mbs;
3208: b->nbs = nbs;
3209: if (!skipallocation) {
3210: if (!b->imax) {
3211: PetscMalloc2(mbs, &b->imax, mbs, &b->ilen);
3213: b->free_imax_ilen = PETSC_TRUE;
3214: }
3215: /* b->ilen will count nonzeros in each block row so far. */
3216: for (i = 0; i < mbs; i++) b->ilen[i] = 0;
3217: if (!nnz) {
3218: if (nz == PETSC_DEFAULT || nz == PETSC_DECIDE) nz = 5;
3219: else if (nz < 0) nz = 1;
3220: nz = PetscMin(nz, nbs);
3221: for (i = 0; i < mbs; i++) b->imax[i] = nz;
3222: PetscIntMultError(nz, mbs, &nz);
3223: } else {
3224: PetscInt64 nz64 = 0;
3225: for (i = 0; i < mbs; i++) {
3226: b->imax[i] = nnz[i];
3227: nz64 += nnz[i];
3228: }
3229: PetscIntCast(nz64, &nz);
3230: }
3232: /* allocate the matrix space */
3233: MatSeqXAIJFreeAIJ(B, &b->a, &b->j, &b->i);
3234: if (B->structure_only) {
3235: PetscMalloc1(nz, &b->j);
3236: PetscMalloc1(B->rmap->N + 1, &b->i);
3237: } else {
3238: PetscInt nzbs2 = 0;
3239: PetscIntMultError(nz, bs2, &nzbs2);
3240: PetscMalloc3(nzbs2, &b->a, nz, &b->j, B->rmap->N + 1, &b->i);
3241: PetscArrayzero(b->a, nz * bs2);
3242: }
3243: PetscArrayzero(b->j, nz);
3245: if (B->structure_only) {
3246: b->singlemalloc = PETSC_FALSE;
3247: b->free_a = PETSC_FALSE;
3248: } else {
3249: b->singlemalloc = PETSC_TRUE;
3250: b->free_a = PETSC_TRUE;
3251: }
3252: b->free_ij = PETSC_TRUE;
3254: b->i[0] = 0;
3255: for (i = 1; i < mbs + 1; i++) b->i[i] = b->i[i - 1] + b->imax[i - 1];
3257: } else {
3258: b->free_a = PETSC_FALSE;
3259: b->free_ij = PETSC_FALSE;
3260: }
3262: b->bs2 = bs2;
3263: b->mbs = mbs;
3264: b->nz = 0;
3265: b->maxnz = nz;
3266: B->info.nz_unneeded = (PetscReal)b->maxnz * bs2;
3267: B->was_assembled = PETSC_FALSE;
3268: B->assembled = PETSC_FALSE;
3269: if (realalloc) MatSetOption(B, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE);
3270: return 0;
3271: }
3273: PetscErrorCode MatSeqBAIJSetPreallocationCSR_SeqBAIJ(Mat B, PetscInt bs, const PetscInt ii[], const PetscInt jj[], const PetscScalar V[])
3274: {
3275: PetscInt i, m, nz, nz_max = 0, *nnz;
3276: PetscScalar *values = NULL;
3277: PetscBool roworiented = ((Mat_SeqBAIJ *)B->data)->roworiented;
3280: PetscLayoutSetBlockSize(B->rmap, bs);
3281: PetscLayoutSetBlockSize(B->cmap, bs);
3282: PetscLayoutSetUp(B->rmap);
3283: PetscLayoutSetUp(B->cmap);
3284: PetscLayoutGetBlockSize(B->rmap, &bs);
3285: m = B->rmap->n / bs;
3288: PetscMalloc1(m + 1, &nnz);
3289: for (i = 0; i < m; i++) {
3290: nz = ii[i + 1] - ii[i];
3292: nz_max = PetscMax(nz_max, nz);
3293: nnz[i] = nz;
3294: }
3295: MatSeqBAIJSetPreallocation(B, bs, 0, nnz);
3296: PetscFree(nnz);
3298: values = (PetscScalar *)V;
3299: if (!values) PetscCalloc1(bs * bs * (nz_max + 1), &values);
3300: for (i = 0; i < m; i++) {
3301: PetscInt ncols = ii[i + 1] - ii[i];
3302: const PetscInt *icols = jj + ii[i];
3303: if (bs == 1 || !roworiented) {
3304: const PetscScalar *svals = values + (V ? (bs * bs * ii[i]) : 0);
3305: MatSetValuesBlocked_SeqBAIJ(B, 1, &i, ncols, icols, svals, INSERT_VALUES);
3306: } else {
3307: PetscInt j;
3308: for (j = 0; j < ncols; j++) {
3309: const PetscScalar *svals = values + (V ? (bs * bs * (ii[i] + j)) : 0);
3310: MatSetValuesBlocked_SeqBAIJ(B, 1, &i, 1, &icols[j], svals, INSERT_VALUES);
3311: }
3312: }
3313: }
3314: if (!V) PetscFree(values);
3315: MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY);
3316: MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY);
3317: MatSetOption(B, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE);
3318: return 0;
3319: }
3321: /*@C
3322: MatSeqBAIJGetArray - gives read/write access to the array where the data for a `MATSEQBAIJ` matrix is stored
3324: Not Collective
3326: Input Parameter:
3327: . mat - a `MATSEQBAIJ` matrix
3329: Output Parameter:
3330: . array - pointer to the data
3332: Level: intermediate
3334: .seealso: `MATSEQBAIJ`, `MatSeqBAIJRestoreArray()`, `MatSeqAIJGetArray()`, `MatSeqAIJRestoreArray()`
3335: @*/
3336: PetscErrorCode MatSeqBAIJGetArray(Mat A, PetscScalar **array)
3337: {
3338: PetscUseMethod(A, "MatSeqBAIJGetArray_C", (Mat, PetscScalar **), (A, array));
3339: return 0;
3340: }
3342: /*@C
3343: MatSeqBAIJRestoreArray - returns access to the array where the data for a `MATSEQBAIJ` matrix is stored obtained by `MatSeqBAIJGetArray()`
3345: Not Collective
3347: Input Parameters:
3348: + mat - a `MATSEQBAIJ` matrix
3349: - array - pointer to the data
3351: Level: intermediate
3353: .seealso: `MatSeqBAIJGetArray()`, `MatSeqAIJGetArray()`, `MatSeqAIJRestoreArray()`
3354: @*/
3355: PetscErrorCode MatSeqBAIJRestoreArray(Mat A, PetscScalar **array)
3356: {
3357: PetscUseMethod(A, "MatSeqBAIJRestoreArray_C", (Mat, PetscScalar **), (A, array));
3358: return 0;
3359: }
3361: /*MC
3362: MATSEQBAIJ - MATSEQBAIJ = "seqbaij" - A matrix type to be used for sequential block sparse matrices, based on
3363: block sparse compressed row format.
3365: Options Database Keys:
3366: + -mat_type seqbaij - sets the matrix type to "seqbaij" during a call to MatSetFromOptions()
3367: - -mat_baij_mult_version version - indicate the version of the matrix-vector product to use (0 often indicates using BLAS)
3369: Level: beginner
3371: Notes:
3372: `MatSetOptions`(,`MAT_STRUCTURE_ONLY`,`PETSC_TRUE`) may be called for this matrix type. In this no
3373: space is allocated for the nonzero entries and any entries passed with `MatSetValues()` are ignored
3375: Run with -info to see what version of the matrix-vector product is being used
3377: .seealso: `MatCreateSeqBAIJ()`
3378: M*/
3380: PETSC_INTERN PetscErrorCode MatConvert_SeqBAIJ_SeqBSTRM(Mat, MatType, MatReuse, Mat *);
3382: PETSC_EXTERN PetscErrorCode MatCreate_SeqBAIJ(Mat B)
3383: {
3384: PetscMPIInt size;
3385: Mat_SeqBAIJ *b;
3387: MPI_Comm_size(PetscObjectComm((PetscObject)B), &size);
3390: PetscNew(&b);
3391: B->data = (void *)b;
3392: PetscMemcpy(B->ops, &MatOps_Values, sizeof(struct _MatOps));
3394: b->row = NULL;
3395: b->col = NULL;
3396: b->icol = NULL;
3397: b->reallocs = 0;
3398: b->saved_values = NULL;
3400: b->roworiented = PETSC_TRUE;
3401: b->nonew = 0;
3402: b->diag = NULL;
3403: B->spptr = NULL;
3404: B->info.nz_unneeded = (PetscReal)b->maxnz * b->bs2;
3405: b->keepnonzeropattern = PETSC_FALSE;
3407: PetscObjectComposeFunction((PetscObject)B, "MatSeqBAIJGetArray_C", MatSeqBAIJGetArray_SeqBAIJ);
3408: PetscObjectComposeFunction((PetscObject)B, "MatSeqBAIJRestoreArray_C", MatSeqBAIJRestoreArray_SeqBAIJ);
3409: PetscObjectComposeFunction((PetscObject)B, "MatStoreValues_C", MatStoreValues_SeqBAIJ);
3410: PetscObjectComposeFunction((PetscObject)B, "MatRetrieveValues_C", MatRetrieveValues_SeqBAIJ);
3411: PetscObjectComposeFunction((PetscObject)B, "MatSeqBAIJSetColumnIndices_C", MatSeqBAIJSetColumnIndices_SeqBAIJ);
3412: PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqbaij_seqaij_C", MatConvert_SeqBAIJ_SeqAIJ);
3413: PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqbaij_seqsbaij_C", MatConvert_SeqBAIJ_SeqSBAIJ);
3414: PetscObjectComposeFunction((PetscObject)B, "MatSeqBAIJSetPreallocation_C", MatSeqBAIJSetPreallocation_SeqBAIJ);
3415: PetscObjectComposeFunction((PetscObject)B, "MatSeqBAIJSetPreallocationCSR_C", MatSeqBAIJSetPreallocationCSR_SeqBAIJ);
3416: PetscObjectComposeFunction((PetscObject)B, "MatIsTranspose_C", MatIsTranspose_SeqBAIJ);
3417: #if defined(PETSC_HAVE_HYPRE)
3418: PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqbaij_hypre_C", MatConvert_AIJ_HYPRE);
3419: #endif
3420: PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqbaij_is_C", MatConvert_XAIJ_IS);
3421: PetscObjectChangeTypeName((PetscObject)B, MATSEQBAIJ);
3422: return 0;
3423: }
3425: PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat C, Mat A, MatDuplicateOption cpvalues, PetscBool mallocmatspace)
3426: {
3427: Mat_SeqBAIJ *c = (Mat_SeqBAIJ *)C->data, *a = (Mat_SeqBAIJ *)A->data;
3428: PetscInt i, mbs = a->mbs, nz = a->nz, bs2 = a->bs2;
3432: if (cpvalues == MAT_SHARE_NONZERO_PATTERN) {
3433: c->imax = a->imax;
3434: c->ilen = a->ilen;
3435: c->free_imax_ilen = PETSC_FALSE;
3436: } else {
3437: PetscMalloc2(mbs, &c->imax, mbs, &c->ilen);
3438: for (i = 0; i < mbs; i++) {
3439: c->imax[i] = a->imax[i];
3440: c->ilen[i] = a->ilen[i];
3441: }
3442: c->free_imax_ilen = PETSC_TRUE;
3443: }
3445: /* allocate the matrix space */
3446: if (mallocmatspace) {
3447: if (cpvalues == MAT_SHARE_NONZERO_PATTERN) {
3448: PetscCalloc1(bs2 * nz, &c->a);
3450: c->i = a->i;
3451: c->j = a->j;
3452: c->singlemalloc = PETSC_FALSE;
3453: c->free_a = PETSC_TRUE;
3454: c->free_ij = PETSC_FALSE;
3455: c->parent = A;
3456: C->preallocated = PETSC_TRUE;
3457: C->assembled = PETSC_TRUE;
3459: PetscObjectReference((PetscObject)A);
3460: MatSetOption(A, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE);
3461: MatSetOption(C, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE);
3462: } else {
3463: PetscMalloc3(bs2 * nz, &c->a, nz, &c->j, mbs + 1, &c->i);
3465: c->singlemalloc = PETSC_TRUE;
3466: c->free_a = PETSC_TRUE;
3467: c->free_ij = PETSC_TRUE;
3469: PetscArraycpy(c->i, a->i, mbs + 1);
3470: if (mbs > 0) {
3471: PetscArraycpy(c->j, a->j, nz);
3472: if (cpvalues == MAT_COPY_VALUES) {
3473: PetscArraycpy(c->a, a->a, bs2 * nz);
3474: } else {
3475: PetscArrayzero(c->a, bs2 * nz);
3476: }
3477: }
3478: C->preallocated = PETSC_TRUE;
3479: C->assembled = PETSC_TRUE;
3480: }
3481: }
3483: c->roworiented = a->roworiented;
3484: c->nonew = a->nonew;
3486: PetscLayoutReference(A->rmap, &C->rmap);
3487: PetscLayoutReference(A->cmap, &C->cmap);
3489: c->bs2 = a->bs2;
3490: c->mbs = a->mbs;
3491: c->nbs = a->nbs;
3493: if (a->diag) {
3494: if (cpvalues == MAT_SHARE_NONZERO_PATTERN) {
3495: c->diag = a->diag;
3496: c->free_diag = PETSC_FALSE;
3497: } else {
3498: PetscMalloc1(mbs + 1, &c->diag);
3499: for (i = 0; i < mbs; i++) c->diag[i] = a->diag[i];
3500: c->free_diag = PETSC_TRUE;
3501: }
3502: } else c->diag = NULL;
3504: c->nz = a->nz;
3505: c->maxnz = a->nz; /* Since we allocate exactly the right amount */
3506: c->solve_work = NULL;
3507: c->mult_work = NULL;
3508: c->sor_workt = NULL;
3509: c->sor_work = NULL;
3511: c->compressedrow.use = a->compressedrow.use;
3512: c->compressedrow.nrows = a->compressedrow.nrows;
3513: if (a->compressedrow.use) {
3514: i = a->compressedrow.nrows;
3515: PetscMalloc2(i + 1, &c->compressedrow.i, i + 1, &c->compressedrow.rindex);
3516: PetscArraycpy(c->compressedrow.i, a->compressedrow.i, i + 1);
3517: PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, i);
3518: } else {
3519: c->compressedrow.use = PETSC_FALSE;
3520: c->compressedrow.i = NULL;
3521: c->compressedrow.rindex = NULL;
3522: }
3523: C->nonzerostate = A->nonzerostate;
3525: PetscFunctionListDuplicate(((PetscObject)A)->qlist, &((PetscObject)C)->qlist);
3526: return 0;
3527: }
3529: PetscErrorCode MatDuplicate_SeqBAIJ(Mat A, MatDuplicateOption cpvalues, Mat *B)
3530: {
3531: MatCreate(PetscObjectComm((PetscObject)A), B);
3532: MatSetSizes(*B, A->rmap->N, A->cmap->n, A->rmap->N, A->cmap->n);
3533: MatSetType(*B, MATSEQBAIJ);
3534: MatDuplicateNoCreate_SeqBAIJ(*B, A, cpvalues, PETSC_TRUE);
3535: return 0;
3536: }
3538: /* Used for both SeqBAIJ and SeqSBAIJ matrices */
3539: PetscErrorCode MatLoad_SeqBAIJ_Binary(Mat mat, PetscViewer viewer)
3540: {
3541: PetscInt header[4], M, N, nz, bs, m, n, mbs, nbs, rows, cols, sum, i, j, k;
3542: PetscInt *rowidxs, *colidxs;
3543: PetscScalar *matvals;
3545: PetscViewerSetUp(viewer);
3547: /* read matrix header */
3548: PetscViewerBinaryRead(viewer, header, 4, NULL, PETSC_INT);
3550: M = header[1];
3551: N = header[2];
3552: nz = header[3];
3557: /* set block sizes from the viewer's .info file */
3558: MatLoad_Binary_BlockSizes(mat, viewer);
3559: /* set local and global sizes if not set already */
3560: if (mat->rmap->n < 0) mat->rmap->n = M;
3561: if (mat->cmap->n < 0) mat->cmap->n = N;
3562: if (mat->rmap->N < 0) mat->rmap->N = M;
3563: if (mat->cmap->N < 0) mat->cmap->N = N;
3564: PetscLayoutSetUp(mat->rmap);
3565: PetscLayoutSetUp(mat->cmap);
3567: /* check if the matrix sizes are correct */
3568: MatGetSize(mat, &rows, &cols);
3570: MatGetBlockSize(mat, &bs);
3571: MatGetLocalSize(mat, &m, &n);
3572: mbs = m / bs;
3573: nbs = n / bs;
3575: /* read in row lengths, column indices and nonzero values */
3576: PetscMalloc1(m + 1, &rowidxs);
3577: PetscViewerBinaryRead(viewer, rowidxs + 1, m, NULL, PETSC_INT);
3578: rowidxs[0] = 0;
3579: for (i = 0; i < m; i++) rowidxs[i + 1] += rowidxs[i];
3580: sum = rowidxs[m];
3583: /* read in column indices and nonzero values */
3584: PetscMalloc2(rowidxs[m], &colidxs, nz, &matvals);
3585: PetscViewerBinaryRead(viewer, colidxs, rowidxs[m], NULL, PETSC_INT);
3586: PetscViewerBinaryRead(viewer, matvals, rowidxs[m], NULL, PETSC_SCALAR);
3588: { /* preallocate matrix storage */
3589: PetscBT bt; /* helper bit set to count nonzeros */
3590: PetscInt *nnz;
3591: PetscBool sbaij;
3593: PetscBTCreate(nbs, &bt);
3594: PetscCalloc1(mbs, &nnz);
3595: PetscObjectTypeCompare((PetscObject)mat, MATSEQSBAIJ, &sbaij);
3596: for (i = 0; i < mbs; i++) {
3597: PetscBTMemzero(nbs, bt);
3598: for (k = 0; k < bs; k++) {
3599: PetscInt row = bs * i + k;
3600: for (j = rowidxs[row]; j < rowidxs[row + 1]; j++) {
3601: PetscInt col = colidxs[j];
3602: if (!sbaij || col >= row)
3603: if (!PetscBTLookupSet(bt, col / bs)) nnz[i]++;
3604: }
3605: }
3606: }
3607: PetscBTDestroy(&bt);
3608: MatSeqBAIJSetPreallocation(mat, bs, 0, nnz);
3609: MatSeqSBAIJSetPreallocation(mat, bs, 0, nnz);
3610: PetscFree(nnz);
3611: }
3613: /* store matrix values */
3614: for (i = 0; i < m; i++) {
3615: PetscInt row = i, s = rowidxs[i], e = rowidxs[i + 1];
3616: (*mat->ops->setvalues)(mat, 1, &row, e - s, colidxs + s, matvals + s, INSERT_VALUES);
3617: }
3619: PetscFree(rowidxs);
3620: PetscFree2(colidxs, matvals);
3621: MatAssemblyBegin(mat, MAT_FINAL_ASSEMBLY);
3622: MatAssemblyEnd(mat, MAT_FINAL_ASSEMBLY);
3623: return 0;
3624: }
3626: PetscErrorCode MatLoad_SeqBAIJ(Mat mat, PetscViewer viewer)
3627: {
3628: PetscBool isbinary;
3630: PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary);
3632: MatLoad_SeqBAIJ_Binary(mat, viewer);
3633: return 0;
3634: }
3636: /*@C
3637: MatCreateSeqBAIJ - Creates a sparse matrix in `MATSEQAIJ` (block
3638: compressed row) format. For good matrix assembly performance the
3639: user should preallocate the matrix storage by setting the parameter nz
3640: (or the array nnz). By setting these parameters accurately, performance
3641: during matrix assembly can be increased by more than a factor of 50.
3643: Collective
3645: Input Parameters:
3646: + comm - MPI communicator, set to `PETSC_COMM_SELF`
3647: . bs - size of block, the blocks are ALWAYS square. One can use `MatSetBlockSizes()` to set a different row and column blocksize but the row
3648: blocksize always defines the size of the blocks. The column blocksize sets the blocksize of the vectors obtained with `MatCreateVecs()`
3649: . m - number of rows
3650: . n - number of columns
3651: . nz - number of nonzero blocks per block row (same for all rows)
3652: - nnz - array containing the number of nonzero blocks in the various block rows
3653: (possibly different for each block row) or NULL
3655: Output Parameter:
3656: . A - the matrix
3658: It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
3659: MatXXXXSetPreallocation() paradigm instead of this routine directly.
3660: [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
3662: Options Database Keys:
3663: + -mat_no_unroll - uses code that does not unroll the loops in the block calculations (much slower)
3664: - -mat_block_size - size of the blocks to use
3666: Level: intermediate
3668: Notes:
3669: The number of rows and columns must be divisible by blocksize.
3671: If the nnz parameter is given then the nz parameter is ignored
3673: A nonzero block is any block that as 1 or more nonzeros in it
3675: The `MATSEQBAIJ` format is fully compatible with standard Fortran 77
3676: storage. That is, the stored row and column indices can begin at
3677: either one (as in Fortran) or zero. See the users' manual for details.
3679: Specify the preallocated storage with either nz or nnz (not both).
3680: Set nz = `PETSC_DEFAULT` and nnz = NULL for PETSc to control dynamic memory
3681: allocation. See [Sparse Matrices](sec_matsparse) for details.
3682: matrices.
3684: .seealso: [Sparse Matrices](sec_matsparse), `MatCreate()`, `MatCreateSeqAIJ()`, `MatSetValues()`, `MatCreateBAIJ()`
3685: @*/
3686: PetscErrorCode MatCreateSeqBAIJ(MPI_Comm comm, PetscInt bs, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
3687: {
3688: MatCreate(comm, A);
3689: MatSetSizes(*A, m, n, m, n);
3690: MatSetType(*A, MATSEQBAIJ);
3691: MatSeqBAIJSetPreallocation(*A, bs, nz, (PetscInt *)nnz);
3692: return 0;
3693: }
3695: /*@C
3696: MatSeqBAIJSetPreallocation - Sets the block size and expected nonzeros
3697: per row in the matrix. For good matrix assembly performance the
3698: user should preallocate the matrix storage by setting the parameter nz
3699: (or the array nnz). By setting these parameters accurately, performance
3700: during matrix assembly can be increased by more than a factor of 50.
3702: Collective
3704: Input Parameters:
3705: + B - the matrix
3706: . bs - size of block, the blocks are ALWAYS square. One can use `MatSetBlockSizes()` to set a different row and column blocksize but the row
3707: blocksize always defines the size of the blocks. The column blocksize sets the blocksize of the vectors obtained with `MatCreateVecs()`
3708: . nz - number of block nonzeros per block row (same for all rows)
3709: - nnz - array containing the number of block nonzeros in the various block rows
3710: (possibly different for each block row) or NULL
3712: Options Database Keys:
3713: + -mat_no_unroll - uses code that does not unroll the loops in the block calculations (much slower)
3714: - -mat_block_size - size of the blocks to use
3716: Level: intermediate
3718: Notes:
3719: If the nnz parameter is given then the nz parameter is ignored
3721: You can call `MatGetInfo()` to get information on how effective the preallocation was;
3722: for example the fields mallocs,nz_allocated,nz_used,nz_unneeded;
3723: You can also run with the option -info and look for messages with the string
3724: malloc in them to see if additional memory allocation was needed.
3726: The `MATSEQBAIJ` format is fully compatible with standard Fortran 77
3727: storage. That is, the stored row and column indices can begin at
3728: either one (as in Fortran) or zero. See the users' manual for details.
3730: Specify the preallocated storage with either nz or nnz (not both).
3731: Set nz = `PETSC_DEFAULT` and nnz = NULL for PETSc to control dynamic memory
3732: allocation. See [Sparse Matrices](sec_matsparse) for details.
3734: .seealso: [Sparse Matrices](sec_matsparse), `MatCreate()`, `MatCreateSeqAIJ()`, `MatSetValues()`, `MatCreateBAIJ()`, `MatGetInfo()`
3735: @*/
3736: PetscErrorCode MatSeqBAIJSetPreallocation(Mat B, PetscInt bs, PetscInt nz, const PetscInt nnz[])
3737: {
3741: PetscTryMethod(B, "MatSeqBAIJSetPreallocation_C", (Mat, PetscInt, PetscInt, const PetscInt[]), (B, bs, nz, nnz));
3742: return 0;
3743: }
3745: /*@C
3746: MatSeqBAIJSetPreallocationCSR - Creates a sparse sequential matrix in `MATSEQBAIJ` format using the given nonzero structure and (optional) numerical values
3748: Collective
3750: Input Parameters:
3751: + B - the matrix
3752: . i - the indices into j for the start of each local row (starts with zero)
3753: . j - the column indices for each local row (starts with zero) these must be sorted for each row
3754: - v - optional values in the matrix
3756: Level: advanced
3758: Notes:
3759: The order of the entries in values is specified by the `MatOption` `MAT_ROW_ORIENTED`. For example, C programs
3760: may want to use the default `MAT_ROW_ORIENTED` of `PETSC_TRUE` and use an array v[nnz][bs][bs] where the second index is
3761: over rows within a block and the last index is over columns within a block row. Fortran programs will likely set
3762: `MAT_ROW_ORIENTED` of `PETSC_FALSE` and use a Fortran array v(bs,bs,nnz) in which the first index is over rows within a
3763: block column and the second index is over columns within a block.
3765: Though this routine has Preallocation() in the name it also sets the exact nonzero locations of the matrix entries and usually the numerical values as well
3767: .seealso: `MatCreate()`, `MatCreateSeqBAIJ()`, `MatSetValues()`, `MatSeqBAIJSetPreallocation()`, `MATSEQBAIJ`
3768: @*/
3769: PetscErrorCode MatSeqBAIJSetPreallocationCSR(Mat B, PetscInt bs, const PetscInt i[], const PetscInt j[], const PetscScalar v[])
3770: {
3774: PetscTryMethod(B, "MatSeqBAIJSetPreallocationCSR_C", (Mat, PetscInt, const PetscInt[], const PetscInt[], const PetscScalar[]), (B, bs, i, j, v));
3775: return 0;
3776: }
3778: /*@
3779: MatCreateSeqBAIJWithArrays - Creates a `MATSEQBAIJ` matrix using matrix elements provided by the user.
3781: Collective
3783: Input Parameters:
3784: + comm - must be an MPI communicator of size 1
3785: . bs - size of block
3786: . m - number of rows
3787: . n - number of columns
3788: . i - row indices; that is i[0] = 0, i[row] = i[row-1] + number of elements in that row block row of the matrix
3789: . j - column indices
3790: - a - matrix values
3792: Output Parameter:
3793: . mat - the matrix
3795: Level: advanced
3797: Notes:
3798: The i, j, and a arrays are not copied by this routine, the user must free these arrays
3799: once the matrix is destroyed
3801: You cannot set new nonzero locations into this matrix, that will generate an error.
3803: The i and j indices are 0 based
3805: When block size is greater than 1 the matrix values must be stored using the `MATSEQBAIJ` storage format
3807: The order of the entries in values is the same as the block compressed sparse row storage format; that is, it is
3808: the same as a three dimensional array in Fortran values(bs,bs,nnz) that contains the first column of the first
3809: block, followed by the second column of the first block etc etc. That is, the blocks are contiguous in memory
3810: with column-major ordering within blocks.
3812: .seealso: `MatCreate()`, `MatCreateBAIJ()`, `MatCreateSeqBAIJ()`
3813: @*/
3814: PetscErrorCode MatCreateSeqBAIJWithArrays(MPI_Comm comm, PetscInt bs, PetscInt m, PetscInt n, PetscInt i[], PetscInt j[], PetscScalar a[], Mat *mat)
3815: {
3816: PetscInt ii;
3817: Mat_SeqBAIJ *baij;
3822: MatCreate(comm, mat);
3823: MatSetSizes(*mat, m, n, m, n);
3824: MatSetType(*mat, MATSEQBAIJ);
3825: MatSeqBAIJSetPreallocation(*mat, bs, MAT_SKIP_ALLOCATION, NULL);
3826: baij = (Mat_SeqBAIJ *)(*mat)->data;
3827: PetscMalloc2(m, &baij->imax, m, &baij->ilen);
3829: baij->i = i;
3830: baij->j = j;
3831: baij->a = a;
3833: baij->singlemalloc = PETSC_FALSE;
3834: baij->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
3835: baij->free_a = PETSC_FALSE;
3836: baij->free_ij = PETSC_FALSE;
3838: for (ii = 0; ii < m; ii++) {
3839: baij->ilen[ii] = baij->imax[ii] = i[ii + 1] - i[ii];
3841: }
3842: if (PetscDefined(USE_DEBUG)) {
3843: for (ii = 0; ii < baij->i[m]; ii++) {
3846: }
3847: }
3849: MatAssemblyBegin(*mat, MAT_FINAL_ASSEMBLY);
3850: MatAssemblyEnd(*mat, MAT_FINAL_ASSEMBLY);
3851: return 0;
3852: }
3854: PetscErrorCode MatCreateMPIMatConcatenateSeqMat_SeqBAIJ(MPI_Comm comm, Mat inmat, PetscInt n, MatReuse scall, Mat *outmat)
3855: {
3856: MatCreateMPIMatConcatenateSeqMat_MPIBAIJ(comm, inmat, n, scall, outmat);
3857: return 0;
3858: }