Actual source code: ibcgs.c
#include <petsc/private/kspimpl.h>
#include <petsc/private/vecimpl.h>
static PetscErrorCode KSPSetUp_IBCGS(KSP ksp)
{
  PetscBool diagonalscale;

  PCGetDiagonalScale(ksp->pc, &diagonalscale);
  if (diagonalscale) SETERRQ(PetscObjectComm((PetscObject)ksp), PETSC_ERR_SUP, "Krylov method %s does not support diagonal scaling", ((PetscObject)ksp)->type_name);
  KSPSetWorkVecs(ksp, 9);
  return 0;
}
/*
    The code below "cheats" from PETSc style
       1) VecRestoreArray() is called immediately after VecGetArray() and the array values are still accessed; the reason for the immediate
          restore is that Vec operations are done on some of the vectors during the solve and if we did not restore immediately it would
          generate two VecGetArray() (the second one inside the Vec operation) calls without a restore between them.
       2) The vector operations are done directly on the arrays instead of with VecXXXX() calls

       For clarity in the code we name single VECTORS with two names, for example, Rn_1 and Rn, but they are actually always
       the exact same memory. We do this with macro defines so that the compiler won't think they are
       two different variables.
*/
#define Xn_1 Xn
#define xn_1 xn
#define Rn_1 Rn
#define rn_1 rn
#define Un_1 Un
#define un_1 un
#define Vn_1 Vn
#define vn_1 vn
#define Qn_1 Qn
#define qn_1 qn
#define Zn_1 Zn
#define zn_1 zn
static PetscErrorCode KSPSolve_IBCGS(KSP ksp)
{
  PetscInt  i, N;
  PetscReal rnorm = 0.0, rnormin = 0.0;
#if defined(PETSC_HAVE_MPI_LONG_DOUBLE) && !defined(PETSC_USE_COMPLEX) && (defined(PETSC_USE_REAL_SINGLE) || defined(PETSC_USE_REAL_DOUBLE))
  /* Because of possible instabilities in the algorithm (as indicated by different residual histories for the same problem
     on the same number of processes with different runs) we support computing the inner products using Intel's 80 bit arithmetic
     rather than just 64 bit. Thus we copy our double precision values into long doubles (hoping this keeps the 16 extra bits)
     and tell MPI to do its Allreduces with MPI_LONG_DOUBLE.

     Note for developers: this does not affect the code. Intel's long double is implemented by storing the 80 bits of extended double
     precision into a 16 byte space (the rest of the space is ignored) */
  long double insums[7], outsums[7];
#else
  PetscScalar insums[7], outsums[7];
#endif
  PetscScalar sigman_2, sigman_1, sigman, pin_1, pin, phin_1, phin, tmp1, tmp2;
  PetscScalar taun_1, taun, rhon, alphan_1, alphan, omegan_1, omegan;
  const PetscScalar *PETSC_RESTRICT r0, *PETSC_RESTRICT f0, *PETSC_RESTRICT qn, *PETSC_RESTRICT b, *PETSC_RESTRICT un;
  PetscScalar *PETSC_RESTRICT rn, *PETSC_RESTRICT xn, *PETSC_RESTRICT vn, *PETSC_RESTRICT zn;
  /* the rest do not have to keep n_1 values */
  PetscScalar kappan, thetan, etan, gamman, betan, deltan;
  const PetscScalar *PETSC_RESTRICT tn;
  PetscScalar *PETSC_RESTRICT sn;
  Vec R0, Rn, Xn, F0, Vn, Zn, Qn, Tn, Sn, B, Un;
  Mat A;
#if defined(PETSC_HAVE_MPI_LONG_DOUBLE) && !defined(PETSC_USE_COMPLEX) && (defined(PETSC_USE_REAL_SINGLE) || defined(PETSC_USE_REAL_DOUBLE))
  /* since 80 bit long doubles do not fill the upper bits, we fill them initially so that
     valgrind won't detect MPI_Allreduce() with uninitialized data */
  PetscMemzero(insums, sizeof(insums));
  PetscMemzero(outsums, sizeof(outsums));
#endif
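
  /* Grab the raw arrays of the solution, right-hand side, and work vectors up front; the immediate restore after each
     access is the deliberate style exception described in the comment at the top of this file */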
  PCGetOperators(ksp->pc, &A, NULL);
  VecGetLocalSize(ksp->vec_sol, &N);
  Xn = ksp->vec_sol;
  VecGetArray(Xn_1, (PetscScalar **)&xn_1);
  VecRestoreArray(Xn_1, NULL);
  B = ksp->vec_rhs;
  VecGetArrayRead(B, (const PetscScalar **)&b);
  VecRestoreArrayRead(B, NULL);
  R0 = ksp->work[0];
  VecGetArrayRead(R0, (const PetscScalar **)&r0);
  VecRestoreArrayRead(R0, NULL);
  Rn = ksp->work[1];
  VecGetArray(Rn_1, (PetscScalar **)&rn_1);
  VecRestoreArray(Rn_1, NULL);
  Un = ksp->work[2];
  VecGetArrayRead(Un_1, (const PetscScalar **)&un_1);
  VecRestoreArrayRead(Un_1, NULL);
  F0 = ksp->work[3];
  VecGetArrayRead(F0, (const PetscScalar **)&f0);
  VecRestoreArrayRead(F0, NULL);
  Vn = ksp->work[4];
  VecGetArray(Vn_1, (PetscScalar **)&vn_1);
  VecRestoreArray(Vn_1, NULL);
  Zn = ksp->work[5];
  VecGetArray(Zn_1, (PetscScalar **)&zn_1);
  VecRestoreArray(Zn_1, NULL);
  Qn = ksp->work[6];
  VecGetArrayRead(Qn_1, (const PetscScalar **)&qn_1);
  VecRestoreArrayRead(Qn_1, NULL);
  Tn = ksp->work[7];
  VecGetArrayRead(Tn, (const PetscScalar **)&tn);
  VecRestoreArrayRead(Tn, NULL);
  Sn = ksp->work[8];
  VecGetArrayRead(Sn, (const PetscScalar **)&sn);
  VecRestoreArrayRead(Sn, NULL);
  /* r0 = rn_1 = b - A*xn_1; */
  /* KSP_PCApplyBAorAB(ksp, Xn_1, Rn_1, Tn);
     VecAYPX(Rn_1, -1.0, B); */
  KSPInitialResidual(ksp, Xn_1, Tn, Sn, Rn_1, B);
  if (ksp->normtype != KSP_NORM_NONE) {
    VecNorm(Rn_1, NORM_2, &rnorm);
    KSPCheckNorm(ksp, rnorm);
  }
  KSPMonitor(ksp, 0, rnorm);
  (*ksp->converged)(ksp, 0, rnorm, &ksp->reason, ksp->cnvP);
  if (ksp->reason) return 0;

  VecCopy(Rn_1, R0);
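  /* r0 is held fixed for the entire solve; it reappears below in the r0'sn and r0'qn inner products of every iteration */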

  /* un_1 = A*rn_1; */
  KSP_PCApplyBAorAB(ksp, Rn_1, Un_1, Tn);

  /* f0 = A'*rn_1; */
  if (ksp->pc_side == PC_RIGHT) { /* B' A' */
    KSP_MatMultTranspose(ksp, A, R0, Tn);
    KSP_PCApplyTranspose(ksp, Tn, F0);
  } else if (ksp->pc_side == PC_LEFT) { /* A' B' */
    KSP_PCApplyTranspose(ksp, R0, Tn);
    KSP_MatMultTranspose(ksp, A, Tn, F0);
  }
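  /* f0 is also held fixed for the whole solve; the transposed factors are applied in reverse order so that F0 = (AB)'R0
     for right preconditioning and F0 = (BA)'R0 for left preconditioning, i.e. the transpose of the same preconditioned
     operator applied by KSP_PCApplyBAorAB() above */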

  /* qn_1 = vn_1 = zn_1 = 0.0; */
  VecSet(Qn_1, 0.0);
  VecSet(Vn_1, 0.0);
  VecSet(Zn_1, 0.0);

  sigman_2 = pin_1 = taun_1 = 0.0;

  /* the paper says phin_1 should be initialized to zero, but it is actually R0'R0 */
  VecDot(R0, R0, &phin_1);
  KSPCheckDot(ksp, phin_1);

  /* sigman_1 = rn_1'un_1 */
  VecDot(R0, Un_1, &sigman_1);

  alphan_1 = omegan_1 = 1.0;
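
  /* Each pass through the loop below accumulates all of its inner products locally (six of them, plus the lagged residual
     norm when KSPSetLagNorm() is used) and combines them in a single MPIU_Allreduce(); that one global reduction per
     iteration is the point of the IBCGS reformulation */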
  for (ksp->its = 1; ksp->its < ksp->max_it + 1; ksp->its++) {
    rhon = phin_1 - omegan_1 * sigman_2 + omegan_1 * alphan_1 * pin_1;
    if (ksp->its == 1) deltan = rhon;
    else deltan = rhon / taun_1;
    betan = deltan / omegan_1;
    taun  = sigman_1 + betan * taun_1 - deltan * pin_1;
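    /* a zero taun would make the alphan = rhon/taun update below divide by zero, so treat it as a breakdown */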
    if (taun == 0.0) {
      ksp->reason = KSP_DIVERGED_NANORINF;
      return 0;
    }
    alphan = rhon / taun;
    PetscLogFlops(15.0);

    /*
        zn = alphan*rn_1 + (alphan/alphan_1)betan*zn_1 - alphan*deltan*vn_1
        vn = un_1 + betan*vn_1 - deltan*qn_1
        sn = rn_1 - alphan*vn

        The algorithm in the paper is missing the alphan/alphan_1 term in the zn update
    */
    PetscLogEventBegin(VEC_Ops, 0, 0, 0, 0);
    tmp1 = (alphan / alphan_1) * betan;
    tmp2 = alphan * deltan;
    for (i = 0; i < N; i++) {
      zn[i] = alphan * rn_1[i] + tmp1 * zn_1[i] - tmp2 * vn_1[i];
      vn[i] = un_1[i] + betan * vn_1[i] - deltan * qn_1[i];
      sn[i] = rn_1[i] - alphan * vn[i];
    }
    PetscLogFlops(3.0 + 11.0 * N);
    PetscLogEventEnd(VEC_Ops, 0, 0, 0, 0);

    /*
        qn = A*vn
    */
    KSP_PCApplyBAorAB(ksp, Vn, Qn, Tn);

    /*
        tn = un_1 - alphan*qn
    */
    VecWAXPY(Tn, -alphan, Qn, Un_1);

    /*
        phin = r0'sn
        pin = r0'qn
        gamman = f0'sn
        etan = f0'tn
        thetan = sn'tn
        kappan = tn'tn
    */
    PetscLogEventBegin(VEC_ReduceArithmetic, 0, 0, 0, 0);
    phin = pin = gamman = etan = thetan = kappan = 0.0;
    for (i = 0; i < N; i++) {
      phin += r0[i] * sn[i];
      pin += r0[i] * qn[i];
      gamman += f0[i] * sn[i];
      etan += f0[i] * tn[i];
      thetan += sn[i] * tn[i];
      kappan += tn[i] * tn[i];
    }
    PetscLogFlops(12.0 * N);
    PetscLogEventEnd(VEC_ReduceArithmetic, 0, 0, 0, 0);

    insums[0] = phin;
    insums[1] = pin;
    insums[2] = gamman;
    insums[3] = etan;
    insums[4] = thetan;
    insums[5] = kappan;
    insums[6] = rnormin;
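    /* insums[6] is the residual norm accumulated at the end of the previous iteration; it only takes part in the reduction
       when the norm is lagged (KSPSetLagNorm()), which is what keeps the solver at one reduction per iteration */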

    PetscLogEventBegin(VEC_ReduceCommunication, 0, 0, 0, 0);
#if defined(PETSC_HAVE_MPI_LONG_DOUBLE) && !defined(PETSC_USE_COMPLEX) && (defined(PETSC_USE_REAL_SINGLE) || defined(PETSC_USE_REAL_DOUBLE))
    if (ksp->lagnorm && ksp->its > 1) {
      MPIU_Allreduce(insums, outsums, 7, MPI_LONG_DOUBLE, MPI_SUM, PetscObjectComm((PetscObject)ksp));
    } else {
      MPIU_Allreduce(insums, outsums, 6, MPI_LONG_DOUBLE, MPI_SUM, PetscObjectComm((PetscObject)ksp));
    }
#else
    if (ksp->lagnorm && ksp->its > 1 && ksp->normtype != KSP_NORM_NONE) {
      MPIU_Allreduce(insums, outsums, 7, MPIU_SCALAR, MPIU_SUM, PetscObjectComm((PetscObject)ksp));
    } else {
      MPIU_Allreduce(insums, outsums, 6, MPIU_SCALAR, MPIU_SUM, PetscObjectComm((PetscObject)ksp));
    }
#endif
    PetscLogEventEnd(VEC_ReduceCommunication, 0, 0, 0, 0);
    phin   = outsums[0];
    pin    = outsums[1];
    gamman = outsums[2];
    etan   = outsums[3];
    thetan = outsums[4];
    kappan = outsums[5];
    if (ksp->lagnorm && ksp->its > 1 && ksp->normtype != KSP_NORM_NONE) rnorm = PetscSqrtReal(PetscRealPart(outsums[6]));
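
    /* a zero kappan or thetan makes the omegan = thetan/kappan update below meaningless, so treat either as a breakdown */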
    if (kappan == 0.0) {
      ksp->reason = KSP_DIVERGED_NANORINF;
      return 0;
    }
    if (thetan == 0.0) {
      ksp->reason = KSP_DIVERGED_NANORINF;
      return 0;
    }
    omegan = thetan / kappan;
    sigman = gamman - omegan * etan;

    /*
        rn = sn - omegan*tn
        xn = xn_1 + zn + omegan*sn
    */
    PetscLogEventBegin(VEC_Ops, 0, 0, 0, 0);
    rnormin = 0.0;
    for (i = 0; i < N; i++) {
      rn[i] = sn[i] - omegan * tn[i];
      rnormin += PetscRealPart(PetscConj(rn[i]) * rn[i]);
      xn[i] += zn[i] + omegan * sn[i];
    }
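    /* xn was modified directly through its raw array, so bump the object state by hand so PETSc knows the vector changed */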
    PetscObjectStateIncrease((PetscObject)Xn);
    PetscLogFlops(7.0 * N);
    PetscLogEventEnd(VEC_Ops, 0, 0, 0, 0);

    if (!ksp->lagnorm && ksp->chknorm < ksp->its && ksp->normtype != KSP_NORM_NONE) {
      PetscLogEventBegin(VEC_ReduceCommunication, 0, 0, 0, 0);
      MPIU_Allreduce(&rnormin, &rnorm, 1, MPIU_REAL, MPIU_SUM, PetscObjectComm((PetscObject)ksp));
      PetscLogEventEnd(VEC_ReduceCommunication, 0, 0, 0, 0);
      rnorm = PetscSqrtReal(rnorm);
    }

    /* Test for convergence */
    KSPMonitor(ksp, ksp->its, rnorm);
    (*ksp->converged)(ksp, ksp->its, rnorm, &ksp->reason, ksp->cnvP);
    if (ksp->reason) {
      KSPUnwindPreconditioner(ksp, Xn, Tn);
      return 0;
    }

    /* un = A*rn */
    KSP_PCApplyBAorAB(ksp, Rn, Un, Tn);

    /* Update n-1 locations with n locations */
    sigman_2 = sigman_1;
    sigman_1 = sigman;
    pin_1    = pin;
    phin_1   = phin;
    alphan_1 = alphan;
    taun_1   = taun;
    omegan_1 = omegan;
  }
  if (ksp->its >= ksp->max_it) ksp->reason = KSP_DIVERGED_ITS;
  KSPUnwindPreconditioner(ksp, Xn, Tn);
  return 0;
}

/*MC
     KSPIBCGS - Implements the IBiCGStab (Improved Stabilized version of BiConjugate Gradient) method
            in an alternative form to have only a single global reduction operation instead of the usual 3 (or 4)

   Level: beginner

   Notes:
   Supports left and right preconditioning

   See `KSPBCGSL` for additional stabilization

   Unlike the Bi-CG-stab algorithm, this requires one multiplication by the transpose of the operator
   before the iteration starts.

   The paper has two errors in the algorithm presented; they are fixed in the code in `KSPSolve_IBCGS()`

   For maximum reduction in the number of global reduction operations, this solver should be used with
   `KSPSetLagNorm()`.
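   For example, the command line options -ksp_type ibcgs -ksp_lag_norm select this solver with a lagged residual norm
   (-ksp_lag_norm is the options database form of `KSPSetLagNorm()`).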

   This is not supported for complex numbers.

   Reference:
   The Improved BiCGStab Method for Large and Sparse Unsymmetric Linear Systems on Parallel Distributed Memory
   Architectures. L. T. Yang and R. Brent, Proceedings of the Fifth International Conference on Algorithms and
   Architectures for Parallel Processing, 2002, IEEE.

.seealso: [](chapter_ksp), `KSPCreate()`, `KSPSetType()`, `KSPType`, `KSP`, `KSPBICG`, `KSPBCGSL`, `KSPIBCGS`, `KSPSetLagNorm()`
M*/

PETSC_EXTERN PetscErrorCode KSPCreate_IBCGS(KSP ksp)
{
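  /* the last argument to KSPSetSupportedNorm() is a priority: left preconditioning with the preconditioned norm is
     preferred, then right preconditioning with the unpreconditioned norm, then running with no norm at all */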
  KSPSetSupportedNorm(ksp, KSP_NORM_PRECONDITIONED, PC_LEFT, 3);
  KSPSetSupportedNorm(ksp, KSP_NORM_UNPRECONDITIONED, PC_RIGHT, 2);
  KSPSetSupportedNorm(ksp, KSP_NORM_NONE, PC_RIGHT, 1);

  ksp->ops->setup          = KSPSetUp_IBCGS;
  ksp->ops->solve          = KSPSolve_IBCGS;
  ksp->ops->destroy        = KSPDestroyDefault;
  ksp->ops->buildsolution  = KSPBuildSolutionDefault;
  ksp->ops->buildresidual  = KSPBuildResidualDefault;
  ksp->ops->setfromoptions = NULL;
  ksp->ops->view           = NULL;
#if defined(PETSC_USE_COMPLEX)
  SETERRQ(PetscObjectComm((PetscObject)ksp), PETSC_ERR_SUP, "This is not supported for complex numbers");
#else
  return 0;
#endif
}