Actual source code: fftw.c

  1: /*
  2:     Provides an interface to the FFTW package.
  3:     Testing examples can be found in ~src/mat/tests
  4: */

  6: #include <../src/mat/impls/fft/fft.h>
  7: EXTERN_C_BEGIN
  8: #if !PetscDefined(HAVE_MPIUNI)
  9:   #include <fftw3-mpi.h>
 10: #else
 11:   #include <fftw3.h>
 12: #endif
 13: EXTERN_C_END

 15: typedef struct {
 16:   PetscInt   ndim_fftw;
 17:   ptrdiff_t *dim_fftw;
 18: #if defined(PETSC_USE_64BIT_INDICES)
 19:   fftw_iodim64 *iodims;
 20: #else
 21:   fftw_iodim *iodims;
 22: #endif
 23:   PetscInt     partial_dim;
 24:   fftw_plan    p_forward, p_backward;
 25:   unsigned     p_flag;                                      /* planner flags, FFTW_ESTIMATE, FFTW_MEASURE, FFTW_PATIENT, FFTW_EXHAUSTIVE */
 26:   PetscScalar *finarray, *foutarray, *binarray, *boutarray; /* keep track of arrays because fftw_execute() can only be executed for the arrays with which the plan was created */
 27: } Mat_FFTW;

 29: extern PetscErrorCode MatCreateVecsFFTW_FFTW(Mat, Vec *, Vec *, Vec *);

 31: static PetscErrorCode MatMult_SeqFFTW(Mat A, Vec x, Vec y)
 32: {
 33:   Mat_FFT           *fft  = (Mat_FFT *)A->data;
 34:   Mat_FFTW          *fftw = (Mat_FFTW *)fft->data;
 35:   const PetscScalar *x_array;
 36:   PetscScalar       *y_array;
 37:   Vec                xx;
 38: #if defined(PETSC_USE_COMPLEX)
 39:   #if defined(PETSC_USE_64BIT_INDICES)
 40:   fftw_iodim64 *iodims;
 41:   #else
 42:   fftw_iodim *iodims;
 43:   #endif
 44: #endif
 45:   PetscInt ndim = fft->ndim, *dim = fft->dim;

 47:   PetscFunctionBegin;
 48:   if (!fftw->p_forward && fftw->p_flag != FFTW_ESTIMATE) {
 49:     /* The data in the in/out arrays is overwritten so need a dummy array for computation, see FFTW manual sec2.1 or sec4 */
 50:     PetscCall(VecDuplicate(x, &xx));
 51:     PetscCall(VecGetArrayRead(xx, &x_array));
 52:   } else {
 53:     PetscCall(VecGetArrayRead(x, &x_array));
 54:   }
 55:   PetscCall(VecGetArray(y, &y_array));
 56:   if (!fftw->p_forward) { /* create a plan, then execute it */
 57:     switch (ndim) {
 58:     case 1:
 59: #if defined(PETSC_USE_COMPLEX)
 60:       fftw->p_forward = fftw_plan_dft_1d(dim[0], (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_FORWARD, fftw->p_flag);
 61: #else
 62:       fftw->p_forward = fftw_plan_dft_r2c_1d(dim[0], (double *)x_array, (fftw_complex *)y_array, fftw->p_flag);
 63: #endif
 64:       break;
 65:     case 2:
 66: #if defined(PETSC_USE_COMPLEX)
 67:       fftw->p_forward = fftw_plan_dft_2d(dim[0], dim[1], (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_FORWARD, fftw->p_flag);
 68: #else
 69:       fftw->p_forward = fftw_plan_dft_r2c_2d(dim[0], dim[1], (double *)x_array, (fftw_complex *)y_array, fftw->p_flag);
 70: #endif
 71:       break;
 72:     case 3:
 73: #if defined(PETSC_USE_COMPLEX)
 74:       fftw->p_forward = fftw_plan_dft_3d(dim[0], dim[1], dim[2], (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_FORWARD, fftw->p_flag);
 75: #else
 76:       fftw->p_forward = fftw_plan_dft_r2c_3d(dim[0], dim[1], dim[2], (double *)x_array, (fftw_complex *)y_array, fftw->p_flag);
 77: #endif
 78:       break;
 79:     default:
 80: #if defined(PETSC_USE_COMPLEX)
 81:       iodims = fftw->iodims;
 82:   #if defined(PETSC_USE_64BIT_INDICES)
 83:       if (ndim) {
 84:         iodims[ndim - 1].n  = (ptrdiff_t)dim[ndim - 1];
 85:         iodims[ndim - 1].is = iodims[ndim - 1].os = 1;
 86:         for (PetscInt i = ndim - 2; i >= 0; --i) {
 87:           iodims[i].n  = (ptrdiff_t)dim[i];
 88:           iodims[i].is = iodims[i].os = iodims[i + 1].is * iodims[i + 1].n;
 89:         }
 90:       }
 91:       fftw->p_forward = fftw_plan_guru64_dft((int)ndim, (fftw_iodim64 *)iodims, 0, NULL, (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_FORWARD, fftw->p_flag);
 92:   #else
 93:       if (ndim) {
 94:         iodims[ndim - 1].n  = (int)dim[ndim - 1];
 95:         iodims[ndim - 1].is = iodims[ndim - 1].os = 1;
 96:         for (PetscInt i = ndim - 2; i >= 0; --i) {
 97:           iodims[i].n  = (int)dim[i];
 98:           iodims[i].is = iodims[i].os = iodims[i + 1].is * iodims[i + 1].n;
 99:         }
100:       }
101:       fftw->p_forward = fftw_plan_guru_dft((int)ndim, (fftw_iodim *)iodims, 0, NULL, (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_FORWARD, fftw->p_flag);
102:   #endif

104: #else
105:       fftw->p_forward = fftw_plan_dft_r2c(ndim, (int *)dim, (double *)x_array, (fftw_complex *)y_array, fftw->p_flag);
106: #endif
107:       break;
108:     }
109:     if (fftw->p_flag != FFTW_ESTIMATE) {
110:       /* The data in the in/out arrays is overwritten so need a dummy array for computation, see FFTW manual sec2.1 or sec4 */
111:       PetscCall(VecRestoreArrayRead(xx, &x_array));
112:       PetscCall(VecDestroy(&xx));
113:       PetscCall(VecGetArrayRead(x, &x_array));
114:     } else {
115:       fftw->finarray  = (PetscScalar *)x_array;
116:       fftw->foutarray = y_array;
117:     }
118:   }

120:   if (fftw->finarray != x_array || fftw->foutarray != y_array) { /* use existing plan on new arrays */
121: #if defined(PETSC_USE_COMPLEX)
122:     fftw_execute_dft(fftw->p_forward, (fftw_complex *)x_array, (fftw_complex *)y_array);
123: #else
124:     fftw_execute_dft_r2c(fftw->p_forward, (double *)x_array, (fftw_complex *)y_array);
125: #endif
126:   } else {
127:     fftw_execute(fftw->p_forward);
128:   }
129:   PetscCall(VecRestoreArray(y, &y_array));
130:   PetscCall(VecRestoreArrayRead(x, &x_array));
131:   PetscFunctionReturn(PETSC_SUCCESS);
132: }

134: static PetscErrorCode MatMultTranspose_SeqFFTW(Mat A, Vec x, Vec y)
135: {
136:   Mat_FFT           *fft  = (Mat_FFT *)A->data;
137:   Mat_FFTW          *fftw = (Mat_FFTW *)fft->data;
138:   const PetscScalar *x_array;
139:   PetscScalar       *y_array;
140:   PetscInt           ndim = fft->ndim, *dim = fft->dim;
141:   Vec                xx;
142: #if defined(PETSC_USE_COMPLEX)
143:   #if defined(PETSC_USE_64BIT_INDICES)
144:   fftw_iodim64 *iodims = fftw->iodims;
145:   #else
146:   fftw_iodim *iodims = fftw->iodims;
147:   #endif
148: #endif

150:   PetscFunctionBegin;
151:   if (!fftw->p_backward && fftw->p_flag != FFTW_ESTIMATE) {
152:     /* The data in the in/out arrays is overwritten so need a dummy array for computation, see FFTW manual sec2.1 or sec4 */
153:     PetscCall(VecDuplicate(x, &xx));
154:     PetscCall(VecGetArrayRead(xx, &x_array));
155:   } else {
156:     PetscCall(VecGetArrayRead(x, &x_array));
157:   }
158:   PetscCall(VecGetArray(y, &y_array));
159:   if (!fftw->p_backward) { /* create a plan, then execute it */
160:     switch (ndim) {
161:     case 1:
162: #if defined(PETSC_USE_COMPLEX)
163:       fftw->p_backward = fftw_plan_dft_1d(dim[0], (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_BACKWARD, fftw->p_flag);
164: #else
165:       fftw->p_backward = fftw_plan_dft_c2r_1d(dim[0], (fftw_complex *)x_array, (double *)y_array, fftw->p_flag);
166: #endif
167:       break;
168:     case 2:
169: #if defined(PETSC_USE_COMPLEX)
170:       fftw->p_backward = fftw_plan_dft_2d(dim[0], dim[1], (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_BACKWARD, fftw->p_flag);
171: #else
172:       fftw->p_backward = fftw_plan_dft_c2r_2d(dim[0], dim[1], (fftw_complex *)x_array, (double *)y_array, fftw->p_flag);
173: #endif
174:       break;
175:     case 3:
176: #if defined(PETSC_USE_COMPLEX)
177:       fftw->p_backward = fftw_plan_dft_3d(dim[0], dim[1], dim[2], (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_BACKWARD, fftw->p_flag);
178: #else
179:       fftw->p_backward = fftw_plan_dft_c2r_3d(dim[0], dim[1], dim[2], (fftw_complex *)x_array, (double *)y_array, fftw->p_flag);
180: #endif
181:       break;
182:     default:
183: #if defined(PETSC_USE_COMPLEX)
184:   #if defined(PETSC_USE_64BIT_INDICES)
185:       fftw->p_backward = fftw_plan_guru64_dft((int)ndim, (fftw_iodim64 *)iodims, 0, NULL, (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_BACKWARD, fftw->p_flag);
186:   #else
187:       fftw->p_backward = fftw_plan_guru_dft((int)ndim, iodims, 0, NULL, (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_BACKWARD, fftw->p_flag);
188:   #endif
189: #else
190:       fftw->p_backward = fftw_plan_dft_c2r((int)ndim, (int *)dim, (fftw_complex *)x_array, (double *)y_array, fftw->p_flag);
191: #endif
192:       break;
193:     }
194:     if (fftw->p_flag != FFTW_ESTIMATE) {
195:       /* The data in the in/out arrays is overwritten so need a dummy array for computation, see FFTW manual sec2.1 or sec4 */
196:       PetscCall(VecRestoreArrayRead(xx, &x_array));
197:       PetscCall(VecDestroy(&xx));
198:       PetscCall(VecGetArrayRead(x, &x_array));
199:     } else {
200:       fftw->binarray  = (PetscScalar *)x_array;
201:       fftw->boutarray = y_array;
202:     }
203:   }
204:   if (fftw->binarray != x_array || fftw->boutarray != y_array) { /* use existing plan on new arrays */
205: #if defined(PETSC_USE_COMPLEX)
206:     fftw_execute_dft(fftw->p_backward, (fftw_complex *)x_array, (fftw_complex *)y_array);
207: #else
208:     fftw_execute_dft_c2r(fftw->p_backward, (fftw_complex *)x_array, (double *)y_array);
209: #endif
210:   } else {
211:     fftw_execute(fftw->p_backward);
212:   }
213:   PetscCall(VecRestoreArray(y, &y_array));
214:   PetscCall(VecRestoreArrayRead(x, &x_array));
215:   PetscFunctionReturn(PETSC_SUCCESS);
216: }

218: #if !PetscDefined(HAVE_MPIUNI)
219: static PetscErrorCode MatMult_MPIFFTW(Mat A, Vec x, Vec y)
220: {
221:   Mat_FFT           *fft  = (Mat_FFT *)A->data;
222:   Mat_FFTW          *fftw = (Mat_FFTW *)fft->data;
223:   const PetscScalar *x_array;
224:   PetscScalar       *y_array;
225:   PetscInt           ndim = fft->ndim, *dim = fft->dim;
226:   MPI_Comm           comm;
227:   Vec                xx;

229:   PetscFunctionBegin;
230:   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
231:   if (!fftw->p_forward && fftw->p_flag != FFTW_ESTIMATE) {
232:     /* The data in the in/out arrays is overwritten so need a dummy array for computation, see FFTW manual sec2.1 or sec4 */
233:     PetscCall(VecDuplicate(x, &xx));
234:     PetscCall(VecGetArrayRead(xx, &x_array));
235:   } else {
236:     PetscCall(VecGetArrayRead(x, &x_array));
237:   }
238:   PetscCall(VecGetArray(y, &y_array));
239:   if (!fftw->p_forward) { /* create a plan, then execute it */
240:     switch (ndim) {
241:     case 1:
242:   #if defined(PETSC_USE_COMPLEX)
243:       fftw->p_forward = fftw_mpi_plan_dft_1d(dim[0], (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_FORWARD, fftw->p_flag);
244:   #else
245:       SETERRQ(comm, PETSC_ERR_SUP, "not support for real numbers yet");
246:   #endif
247:       break;
248:     case 2:
249:   #if defined(PETSC_USE_COMPLEX) /* For complex transforms call fftw_mpi_plan_dft, for real transforms call fftw_mpi_plan_dft_r2c */
250:       fftw->p_forward = fftw_mpi_plan_dft_2d(dim[0], dim[1], (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_FORWARD, fftw->p_flag);
251:   #else
252:       fftw->p_forward = fftw_mpi_plan_dft_r2c_2d(dim[0], dim[1], (double *)x_array, (fftw_complex *)y_array, comm, FFTW_ESTIMATE);
253:   #endif
254:       break;
255:     case 3:
256:   #if defined(PETSC_USE_COMPLEX)
257:       fftw->p_forward = fftw_mpi_plan_dft_3d(dim[0], dim[1], dim[2], (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_FORWARD, fftw->p_flag);
258:   #else
259:       fftw->p_forward = fftw_mpi_plan_dft_r2c_3d(dim[0], dim[1], dim[2], (double *)x_array, (fftw_complex *)y_array, comm, FFTW_ESTIMATE);
260:   #endif
261:       break;
262:     default:
263:   #if defined(PETSC_USE_COMPLEX)
264:       fftw->p_forward = fftw_mpi_plan_dft(fftw->ndim_fftw, fftw->dim_fftw, (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_FORWARD, fftw->p_flag);
265:   #else
266:       fftw->p_forward = fftw_mpi_plan_dft_r2c(fftw->ndim_fftw, fftw->dim_fftw, (double *)x_array, (fftw_complex *)y_array, comm, FFTW_ESTIMATE);
267:   #endif
268:       break;
269:     }
270:     if (fftw->p_flag != FFTW_ESTIMATE) {
271:       /* The data in the in/out arrays is overwritten so need a dummy array for computation, see FFTW manual sec2.1 or sec4 */
272:       PetscCall(VecRestoreArrayRead(xx, &x_array));
273:       PetscCall(VecDestroy(&xx));
274:       PetscCall(VecGetArrayRead(x, &x_array));
275:     } else {
276:       fftw->finarray  = (PetscScalar *)x_array;
277:       fftw->foutarray = y_array;
278:     }
279:   }
280:   if (fftw->finarray != x_array || fftw->foutarray != y_array) { /* use existing plan on new arrays */
281:     fftw_execute_dft(fftw->p_forward, (fftw_complex *)x_array, (fftw_complex *)y_array);
282:   } else {
283:     fftw_execute(fftw->p_forward);
284:   }
285:   PetscCall(VecRestoreArray(y, &y_array));
286:   PetscCall(VecRestoreArrayRead(x, &x_array));
287:   PetscFunctionReturn(PETSC_SUCCESS);
288: }

290: static PetscErrorCode MatMultTranspose_MPIFFTW(Mat A, Vec x, Vec y)
291: {
292:   Mat_FFT           *fft  = (Mat_FFT *)A->data;
293:   Mat_FFTW          *fftw = (Mat_FFTW *)fft->data;
294:   const PetscScalar *x_array;
295:   PetscScalar       *y_array;
296:   PetscInt           ndim = fft->ndim, *dim = fft->dim;
297:   MPI_Comm           comm;
298:   Vec                xx;

300:   PetscFunctionBegin;
301:   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
302:   if (!fftw->p_backward && fftw->p_flag != FFTW_ESTIMATE) {
303:     /* The data in the in/out arrays is overwritten so need a dummy array for computation, see FFTW manual sec2.1 or sec4 */
304:     PetscCall(VecDuplicate(x, &xx));
305:     PetscCall(VecGetArrayRead(xx, &x_array));
306:   } else {
307:     PetscCall(VecGetArrayRead(x, &x_array));
308:   }
309:   PetscCall(VecGetArray(y, &y_array));
310:   if (!fftw->p_backward) { /* create a plan, then execute it */
311:     switch (ndim) {
312:     case 1:
313:   #if defined(PETSC_USE_COMPLEX)
314:       fftw->p_backward = fftw_mpi_plan_dft_1d(dim[0], (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_BACKWARD, fftw->p_flag);
315:   #else
316:       SETERRQ(comm, PETSC_ERR_SUP, "not support for real numbers yet");
317:   #endif
318:       break;
319:     case 2:
320:   #if defined(PETSC_USE_COMPLEX) /* For complex transforms call fftw_mpi_plan_dft with flag FFTW_BACKWARD, for real transforms call fftw_mpi_plan_dft_c2r */
321:       fftw->p_backward = fftw_mpi_plan_dft_2d(dim[0], dim[1], (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_BACKWARD, fftw->p_flag);
322:   #else
323:       fftw->p_backward = fftw_mpi_plan_dft_c2r_2d(dim[0], dim[1], (fftw_complex *)x_array, (double *)y_array, comm, FFTW_ESTIMATE);
324:   #endif
325:       break;
326:     case 3:
327:   #if defined(PETSC_USE_COMPLEX)
328:       fftw->p_backward = fftw_mpi_plan_dft_3d(dim[0], dim[1], dim[2], (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_BACKWARD, fftw->p_flag);
329:   #else
330:       fftw->p_backward = fftw_mpi_plan_dft_c2r_3d(dim[0], dim[1], dim[2], (fftw_complex *)x_array, (double *)y_array, comm, FFTW_ESTIMATE);
331:   #endif
332:       break;
333:     default:
334:   #if defined(PETSC_USE_COMPLEX)
335:       fftw->p_backward = fftw_mpi_plan_dft(fftw->ndim_fftw, fftw->dim_fftw, (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_BACKWARD, fftw->p_flag);
336:   #else
337:       fftw->p_backward = fftw_mpi_plan_dft_c2r(fftw->ndim_fftw, fftw->dim_fftw, (fftw_complex *)x_array, (double *)y_array, comm, FFTW_ESTIMATE);
338:   #endif
339:       break;
340:     }
341:     if (fftw->p_flag != FFTW_ESTIMATE) {
342:       /* The data in the in/out arrays is overwritten so need a dummy array for computation, see FFTW manual sec2.1 or sec4 */
343:       PetscCall(VecRestoreArrayRead(xx, &x_array));
344:       PetscCall(VecDestroy(&xx));
345:       PetscCall(VecGetArrayRead(x, &x_array));
346:     } else {
347:       fftw->binarray  = (PetscScalar *)x_array;
348:       fftw->boutarray = y_array;
349:     }
350:   }
351:   if (fftw->binarray != x_array || fftw->boutarray != y_array) { /* use existing plan on new arrays */
352:     fftw_execute_dft(fftw->p_backward, (fftw_complex *)x_array, (fftw_complex *)y_array);
353:   } else {
354:     fftw_execute(fftw->p_backward);
355:   }
356:   PetscCall(VecRestoreArray(y, &y_array));
357:   PetscCall(VecRestoreArrayRead(x, &x_array));
358:   PetscFunctionReturn(PETSC_SUCCESS);
359: }
360: #endif

362: static PetscErrorCode MatDestroy_FFTW(Mat A)
363: {
364:   Mat_FFT  *fft  = (Mat_FFT *)A->data;
365:   Mat_FFTW *fftw = (Mat_FFTW *)fft->data;

367:   PetscFunctionBegin;
368:   fftw_destroy_plan(fftw->p_forward);
369:   fftw_destroy_plan(fftw->p_backward);
370:   if (fftw->iodims) free(fftw->iodims);
371:   PetscCall(PetscFree(fftw->dim_fftw));
372:   PetscCall(PetscFree(fft->data));
373:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCreateVecsFFTW_C", NULL));
374:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "VecScatterPetscToFFTW_C", NULL));
375:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "VecScatterFFTWToPetsc_C", NULL));
376: #if !PetscDefined(HAVE_MPIUNI)
377:   fftw_mpi_cleanup();
378: #endif
379:   PetscFunctionReturn(PETSC_SUCCESS);
380: }

382: #if !PetscDefined(HAVE_MPIUNI)
383: #include <../src/vec/vec/impls/mpi/pvecimpl.h>
384: static PetscErrorCode VecDestroy_MPIFFTW(Vec v)
385: {
386:   PetscScalar *array;

388:   PetscFunctionBegin;
389:   PetscCall(VecGetArray(v, &array));
390:   fftw_free((fftw_complex *)array);
391:   PetscCall(VecRestoreArray(v, &array));
392:   PetscCall(VecDestroy_MPI(v));
393:   PetscFunctionReturn(PETSC_SUCCESS);
394: }
395: #endif

397: #if !PetscDefined(HAVE_MPIUNI)
398: static PetscErrorCode VecDuplicate_FFTW_fin(Vec fin, Vec *fin_new)
399: {
400:   Mat A;

402:   PetscFunctionBegin;
403:   PetscCall(PetscObjectQuery((PetscObject)fin, "FFTmatrix", (PetscObject *)&A));
404:   PetscCall(MatCreateVecsFFTW_FFTW(A, fin_new, NULL, NULL));
405:   PetscFunctionReturn(PETSC_SUCCESS);
406: }

408: static PetscErrorCode VecDuplicate_FFTW_fout(Vec fout, Vec *fout_new)
409: {
410:   Mat A;

412:   PetscFunctionBegin;
413:   PetscCall(PetscObjectQuery((PetscObject)fout, "FFTmatrix", (PetscObject *)&A));
414:   PetscCall(MatCreateVecsFFTW_FFTW(A, NULL, fout_new, NULL));
415:   PetscFunctionReturn(PETSC_SUCCESS);
416: }

418: static PetscErrorCode VecDuplicate_FFTW_bout(Vec bout, Vec *bout_new)
419: {
420:   Mat A;

422:   PetscFunctionBegin;
423:   PetscCall(PetscObjectQuery((PetscObject)bout, "FFTmatrix", (PetscObject *)&A));
424:   PetscCall(MatCreateVecsFFTW_FFTW(A, NULL, NULL, bout_new));
425:   PetscFunctionReturn(PETSC_SUCCESS);
426: }
427: #endif

429: /*@
430:   MatCreateVecsFFTW - Get vector(s) compatible with the matrix, i.e. with the
431:   parallel layout determined by `MATFFTW`

433:   Collective

435:   Input Parameter:
436: . A - the matrix

438:   Output Parameters:
439: + x - (optional) input vector of forward FFTW
440: . y - (optional) output vector of forward FFTW
441: - z - (optional) output vector of backward FFTW

443:   Options Database Key:
444: . -mat_fftw_plannerflags - set FFTW planner flags

446:   Level: advanced

448:   Notes:
449:   The parallel layout of output of forward FFTW is always same as the input
450:   of the backward FFTW. But parallel layout of the input vector of forward
451:   FFTW might not be same as the output of backward FFTW.

453:   We need to provide enough space while doing parallel real transform.
454:   We need to pad extra zeros at the end of last dimension. For this reason the one needs to
455:   invoke the routine fftw_mpi_local_size_transposed routines. Remember one has to change the
456:   last dimension from n to n/2+1 while invoking this routine. The number of zeros to be padded
457:   depends on if the last dimension is even or odd. If the last dimension is even need to pad two
458:   zeros if it is odd only one zero is needed.

460:   Lastly one needs some scratch space at the end of data set in each process. alloc_local
461:   figures out how much space is needed, i.e. it figures out the data+scratch space for
462:   each processor and returns that.

464:   Developer Notes:
465:   How come `MatCreateVecs()` doesn't produce the correctly padded vectors automatically?

467: .seealso: [](ch_matrices), `Mat`, `MATFFTW`, `MatCreateFFT()`, `MatCreateVecs()`
468: @*/
469: PetscErrorCode MatCreateVecsFFTW(Mat A, Vec *x, Vec *y, Vec *z)
470: {
471:   PetscFunctionBegin;
472:   PetscUseMethod(A, "MatCreateVecsFFTW_C", (Mat, Vec *, Vec *, Vec *), (A, x, y, z));
473:   PetscFunctionReturn(PETSC_SUCCESS);
474: }

476: PetscErrorCode MatCreateVecsFFTW_FFTW(Mat A, Vec *fin, Vec *fout, Vec *bout)
477: {
478:   PetscMPIInt size, rank;
479:   MPI_Comm    comm;
480:   Mat_FFT    *fft = (Mat_FFT *)A->data;

482:   PetscFunctionBegin;
485:   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));

487:   PetscCallMPI(MPI_Comm_size(comm, &size));
488:   PetscCallMPI(MPI_Comm_rank(comm, &rank));
489:   if (size == 1) { /* sequential case */
490: #if defined(PETSC_USE_COMPLEX)
491:     if (fin) PetscCall(VecCreateSeq(PETSC_COMM_SELF, fft->N, fin));
492:     if (fout) PetscCall(VecCreateSeq(PETSC_COMM_SELF, fft->N, fout));
493:     if (bout) PetscCall(VecCreateSeq(PETSC_COMM_SELF, fft->N, bout));
494: #else
495:     if (fin) PetscCall(VecCreateSeq(PETSC_COMM_SELF, fft->n, fin));
496:     if (fout) PetscCall(VecCreateSeq(PETSC_COMM_SELF, fft->n, fout));
497:     if (bout) PetscCall(VecCreateSeq(PETSC_COMM_SELF, fft->n, bout));
498: #endif
499: #if !PetscDefined(HAVE_MPIUNI)
500:   } else { /* parallel cases */
501:     Mat_FFTW     *fftw = (Mat_FFTW *)fft->data;
502:     PetscInt      ndim = fft->ndim, *dim = fft->dim;
503:     ptrdiff_t     alloc_local, local_n0, local_0_start;
504:     ptrdiff_t     local_n1;
505:     fftw_complex *data_fout;
506:     ptrdiff_t     local_1_start;
507:     PetscInt      n1, N1;
508:   #if defined(PETSC_USE_COMPLEX)
509:     fftw_complex *data_fin, *data_bout;
510:   #else
511:     double   *data_finr, *data_boutr;
512:     ptrdiff_t temp;
513:   #endif

515:     switch (ndim) {
516:     case 1:
517:   #if !defined(PETSC_USE_COMPLEX)
518:       SETERRQ(comm, PETSC_ERR_SUP, "FFTW does not allow parallel real 1D transform");
519:   #else
520:       alloc_local = fftw_mpi_local_size_1d(dim[0], comm, FFTW_FORWARD, FFTW_ESTIMATE, &local_n0, &local_0_start, &local_n1, &local_1_start);
521:       if (fin) {
522:         data_fin = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
523:         PetscCall(PetscIntCast(local_n0, &n1));
524:         PetscCall(PetscIntCast(fft->N, &N1));
525:         PetscCall(VecCreateMPIWithArray(comm, 1, n1, N1, (const PetscScalar *)data_fin, fin));
526:         PetscCall(PetscObjectCompose((PetscObject)*fin, "FFTmatrix", (PetscObject)A));
527:         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
528:         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
529:       }
530:       if (fout) {
531:         data_fout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
532:         PetscCall(PetscIntCast(local_n1, &n1));
533:         PetscCall(PetscIntCast(fft->N, &N1));
534:         PetscCall(VecCreateMPIWithArray(comm, 1, n1, N1, (const PetscScalar *)data_fout, fout));
535:         PetscCall(PetscObjectCompose((PetscObject)*fout, "FFTmatrix", (PetscObject)A));
536:         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
537:         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
538:       }
539:       alloc_local = fftw_mpi_local_size_1d(dim[0], comm, FFTW_BACKWARD, FFTW_ESTIMATE, &local_n0, &local_0_start, &local_n1, &local_1_start);
540:       if (bout) {
541:         data_bout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
542:         PetscCall(PetscIntCast(local_n1, &n1));
543:         PetscCall(PetscIntCast(fft->N, &N1));
544:         PetscCall(VecCreateMPIWithArray(comm, 1, n1, N1, (const PetscScalar *)data_bout, bout));
545:         PetscCall(PetscObjectCompose((PetscObject)*bout, "FFTmatrix", (PetscObject)A));
546:         (*bout)->ops->duplicate = VecDuplicate_FFTW_fout;
547:         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
548:       }
549:       break;
550:   #endif
551:     case 2:
552:   #if !defined(PETSC_USE_COMPLEX) /* Note that N1 is no more the product of individual dimensions */
553:       alloc_local = fftw_mpi_local_size_2d_transposed(dim[0], dim[1] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
554:       PetscCall(PetscIntCast(2 * dim[0] * (dim[1] / 2 + 1), &N1));
555:       PetscCall(PetscIntCast(2 * local_n0 * (dim[1] / 2 + 1), &n1));
556:       if (fin) {
557:         data_finr = (double *)fftw_malloc(sizeof(double) * alloc_local * 2);
558:         PetscCall(VecCreateMPIWithArray(comm, 1, (PetscInt)n1, N1, (PetscScalar *)data_finr, fin));
559:         PetscCall(PetscObjectCompose((PetscObject)*fin, "FFTmatrix", (PetscObject)A));
560:         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
561:         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
562:       }
563:       if (fout) {
564:         data_fout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
565:         PetscCall(VecCreateMPIWithArray(comm, 1, (PetscInt)n1, N1, (PetscScalar *)data_fout, fout));
566:         PetscCall(PetscObjectCompose((PetscObject)*fout, "FFTmatrix", (PetscObject)A));
567:         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
568:         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
569:       }
570:       if (bout) {
571:         data_boutr = (double *)fftw_malloc(sizeof(double) * alloc_local * 2);
572:         PetscCall(VecCreateMPIWithArray(comm, 1, (PetscInt)n1, N1, (PetscScalar *)data_boutr, bout));
573:         PetscCall(PetscObjectCompose((PetscObject)*bout, "FFTmatrix", (PetscObject)A));
574:         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
575:         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
576:       }
577:   #else
578:       /* Get local size */
579:       alloc_local = fftw_mpi_local_size_2d(dim[0], dim[1], comm, &local_n0, &local_0_start);
580:       if (fin) {
581:         data_fin = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
582:         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_fin, fin));
583:         PetscCall(PetscObjectCompose((PetscObject)*fin, "FFTmatrix", (PetscObject)A));
584:         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
585:         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
586:       }
587:       if (fout) {
588:         data_fout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
589:         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_fout, fout));
590:         PetscCall(PetscObjectCompose((PetscObject)*fout, "FFTmatrix", (PetscObject)A));
591:         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
592:         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
593:       }
594:       if (bout) {
595:         data_bout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
596:         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_bout, bout));
597:         PetscCall(PetscObjectCompose((PetscObject)*bout, "FFTmatrix", (PetscObject)A));
598:         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
599:         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
600:       }
601:   #endif
602:       break;
603:     case 3:
604:   #if !defined(PETSC_USE_COMPLEX)
605:       alloc_local = fftw_mpi_local_size_3d_transposed(dim[0], dim[1], dim[2] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
606:       PetscCall(PetscIntCast(2 * dim[0] * dim[1] * (dim[2] / 2 + 1), &N1));
607:       PetscCall(PetscIntCast(2 * local_n0 * dim[1] * (dim[2] / 2 + 1), &n1));
608:       if (fin) {
609:         data_finr = (double *)fftw_malloc(sizeof(double) * alloc_local * 2);
610:         PetscCall(VecCreateMPIWithArray(comm, 1, (PetscInt)n1, N1, (PetscScalar *)data_finr, fin));
611:         PetscCall(PetscObjectCompose((PetscObject)*fin, "FFTmatrix", (PetscObject)A));
612:         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
613:         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
614:       }
615:       if (fout) {
616:         data_fout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
617:         PetscCall(VecCreateMPIWithArray(comm, 1, n1, N1, (PetscScalar *)data_fout, fout));
618:         PetscCall(PetscObjectCompose((PetscObject)*fout, "FFTmatrix", (PetscObject)A));
619:         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
620:         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
621:       }
622:       if (bout) {
623:         data_boutr = (double *)fftw_malloc(sizeof(double) * alloc_local * 2);
624:         PetscCall(VecCreateMPIWithArray(comm, 1, (PetscInt)n1, N1, (PetscScalar *)data_boutr, bout));
625:         PetscCall(PetscObjectCompose((PetscObject)*bout, "FFTmatrix", (PetscObject)A));
626:         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
627:         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
628:       }
629:   #else
630:       alloc_local = fftw_mpi_local_size_3d(dim[0], dim[1], dim[2], comm, &local_n0, &local_0_start);
631:       if (fin) {
632:         data_fin = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
633:         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_fin, fin));
634:         PetscCall(PetscObjectCompose((PetscObject)*fin, "FFTmatrix", (PetscObject)A));
635:         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
636:         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
637:       }
638:       if (fout) {
639:         data_fout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
640:         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_fout, fout));
641:         PetscCall(PetscObjectCompose((PetscObject)*fout, "FFTmatrix", (PetscObject)A));
642:         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
643:         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
644:       }
645:       if (bout) {
646:         data_bout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
647:         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_bout, bout));
648:         PetscCall(PetscObjectCompose((PetscObject)*bout, "FFTmatrix", (PetscObject)A));
649:         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
650:         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
651:       }
652:   #endif
653:       break;
654:     default:
655:   #if !defined(PETSC_USE_COMPLEX)
656:       temp                                  = (fftw->dim_fftw)[fftw->ndim_fftw - 1];
657:       (fftw->dim_fftw)[fftw->ndim_fftw - 1] = temp / 2 + 1;
658:       alloc_local                           = fftw_mpi_local_size_transposed(fftw->ndim_fftw, fftw->dim_fftw, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
659:       N1                                    = 2 * fft->N * (PetscInt)(fftw->dim_fftw[fftw->ndim_fftw - 1]) / ((PetscInt)temp);
660:       (fftw->dim_fftw)[fftw->ndim_fftw - 1] = temp;

662:       if (fin) {
663:         data_finr = (double *)fftw_malloc(sizeof(double) * alloc_local * 2);
664:         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, N1, (PetscScalar *)data_finr, fin));
665:         PetscCall(PetscObjectCompose((PetscObject)*fin, "FFTmatrix", (PetscObject)A));
666:         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
667:         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
668:       }
669:       if (fout) {
670:         data_fout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
671:         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, N1, (PetscScalar *)data_fout, fout));
672:         PetscCall(PetscObjectCompose((PetscObject)*fout, "FFTmatrix", (PetscObject)A));
673:         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
674:         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
675:       }
676:       if (bout) {
677:         data_boutr = (double *)fftw_malloc(sizeof(double) * alloc_local * 2);
678:         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, N1, (PetscScalar *)data_boutr, bout));
679:         PetscCall(PetscObjectCompose((PetscObject)*bout, "FFTmatrix", (PetscObject)A));
680:         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
681:         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
682:       }
683:   #else
684:       alloc_local = fftw_mpi_local_size(fftw->ndim_fftw, fftw->dim_fftw, comm, &local_n0, &local_0_start);
685:       if (fin) {
686:         data_fin = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
687:         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_fin, fin));
688:         PetscCall(PetscObjectCompose((PetscObject)*fin, "FFTmatrix", (PetscObject)A));
689:         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
690:         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
691:       }
692:       if (fout) {
693:         data_fout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
694:         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_fout, fout));
695:         PetscCall(PetscObjectCompose((PetscObject)*fout, "FFTmatrix", (PetscObject)A));
696:         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
697:         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
698:       }
699:       if (bout) {
700:         data_bout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
701:         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_bout, bout));
702:         PetscCall(PetscObjectCompose((PetscObject)*bout, "FFTmatrix", (PetscObject)A));
703:         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
704:         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
705:       }
706:   #endif
707:       break;
708:     }
709:     /* fftw vectors have their data array allocated by fftw_malloc, such that v->array=xxx but
710:        v->array_allocated=NULL. A regular replacearray call won't free the memory and only causes
711:        memory leaks. We void these pointers here to avoid accident uses.
712:     */
713:     if (fin) (*fin)->ops->replacearray = NULL;
714:     if (fout) (*fout)->ops->replacearray = NULL;
715:     if (bout) (*bout)->ops->replacearray = NULL;
716: #endif
717:   }
718:   PetscFunctionReturn(PETSC_SUCCESS);
719: }

721: /*@
722:   VecScatterPetscToFFTW - Copies a PETSc vector to the vector that goes into `MATFFTW` calls.

724:   Collective

726:   Input Parameters:
727: + A - FFTW matrix
728: - x - the PETSc vector

730:   Output Parameter:
731: . y - the FFTW vector

733:   Level: intermediate

735:   Note:
736:   For real parallel FFT, FFTW requires insertion of extra space at the end of last dimension. This required even when
737:   one is not doing in-place transform. The last dimension size must be changed to 2*(dim[last]/2+1) to accommodate these extra
738:   zeros. This routine does that job by scattering operation.

740: .seealso: [](ch_matrices), `Mat`, `MATFFTW`, `VecScatterFFTWToPetsc()`, `MatCreateVecsFFTW()`
741: @*/
742: PetscErrorCode VecScatterPetscToFFTW(Mat A, Vec x, Vec y)
743: {
744:   PetscFunctionBegin;
745:   PetscUseMethod(A, "VecScatterPetscToFFTW_C", (Mat, Vec, Vec), (A, x, y));
746:   PetscFunctionReturn(PETSC_SUCCESS);
747: }

749: static PetscErrorCode VecScatterPetscToFFTW_FFTW(Mat A, Vec x, Vec y)
750: {
751:   MPI_Comm    comm;
752:   Mat_FFT    *fft = (Mat_FFT *)A->data;
753:   PetscInt    low;
754:   PetscMPIInt rank, size;
755:   PetscInt    vsize, vsize1;
756:   VecScatter  vecscat;
757:   IS          list1;

759:   PetscFunctionBegin;
760:   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
761:   PetscCallMPI(MPI_Comm_size(comm, &size));
762:   PetscCallMPI(MPI_Comm_rank(comm, &rank));
763:   PetscCall(VecGetOwnershipRange(y, &low, NULL));

765:   if (size == 1) {
766:     PetscCall(VecGetSize(x, &vsize));
767:     PetscCall(VecGetSize(y, &vsize1));
768:     PetscCall(ISCreateStride(PETSC_COMM_SELF, fft->N, 0, 1, &list1));
769:     PetscCall(VecScatterCreate(x, list1, y, list1, &vecscat));
770:     PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
771:     PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
772:     PetscCall(VecScatterDestroy(&vecscat));
773:     PetscCall(ISDestroy(&list1));
774: #if !PetscDefined(HAVE_MPIUNI)
775:   } else {
776:     Mat_FFTW *fftw = (Mat_FFTW *)fft->data;
777:     PetscInt  ndim = fft->ndim, *dim = fft->dim, n1;
778:     ptrdiff_t local_n0, local_0_start;
779:     ptrdiff_t local_n1, local_1_start;
780:     IS        list2;
781:   #if !defined(PETSC_USE_COMPLEX)
782:     PetscInt  i, j, k, partial_dim;
783:     PetscInt *indx1, *indx2, tempindx, tempindx1;
784:     PetscInt  NM;
785:     ptrdiff_t temp;
786:   #else
787:     PetscInt nstart, nlow;
788:   #endif

790:     switch (ndim) {
791:     case 1:
792:   #if defined(PETSC_USE_COMPLEX)
793:       fftw_mpi_local_size_1d(dim[0], comm, FFTW_FORWARD, FFTW_ESTIMATE, &local_n0, &local_0_start, &local_n1, &local_1_start);

795:       PetscCall(PetscIntCast(local_n0, &n1));
796:       PetscCall(PetscIntCast(local_0_start, &nstart));
797:       PetscCall(PetscIntCast(low, &nlow));
798:       PetscCall(ISCreateStride(comm, n1, nstart, 1, &list1));
799:       PetscCall(ISCreateStride(comm, n1, nlow, 1, &list2));
800:       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
801:       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
802:       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
803:       PetscCall(VecScatterDestroy(&vecscat));
804:       PetscCall(ISDestroy(&list1));
805:       PetscCall(ISDestroy(&list2));
806:   #else
807:       SETERRQ(comm, PETSC_ERR_SUP, "FFTW does not support parallel 1D real transform");
808:   #endif
809:       break;
810:     case 2:
811:   #if defined(PETSC_USE_COMPLEX)
812:       fftw_mpi_local_size_2d(dim[0], dim[1], comm, &local_n0, &local_0_start);

814:       PetscCall(PetscIntCast(local_n0 * dim[1], &n1));
815:       PetscCall(PetscIntCast(local_0_start * dim[1], &nstart));
816:       PetscCall(PetscIntCast(low, &nlow));
817:       PetscCall(ISCreateStride(comm, n1, nstart, 1, &list1));
818:       PetscCall(ISCreateStride(comm, n1, nlow, 1, &list2));
819:       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
820:       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
821:       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
822:       PetscCall(VecScatterDestroy(&vecscat));
823:       PetscCall(ISDestroy(&list1));
824:       PetscCall(ISDestroy(&list2));
825:   #else
826:       fftw_mpi_local_size_2d_transposed(dim[0], dim[1] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);

828:       PetscCall(PetscMalloc1((PetscInt)local_n0 * dim[1], &indx1));
829:       PetscCall(PetscMalloc1((PetscInt)local_n0 * dim[1], &indx2));

831:       if (dim[1] % 2 == 0) {
832:         NM = dim[1] + 2;
833:       } else {
834:         NM = dim[1] + 1;
835:       }
836:       for (i = 0; i < local_n0; i++) {
837:         for (j = 0; j < dim[1]; j++) {
838:           tempindx  = i * dim[1] + j;
839:           tempindx1 = i * NM + j;

841:           PetscCall(PetscIntCast(local_0_start * dim[1] + tempindx, &indx1[tempindx]));
842:           indx2[tempindx] = low + tempindx1;
843:         }
844:       }

846:       PetscCall(PetscIntCast(local_n0 * dim[1], &n1));
847:       PetscCall(ISCreateGeneral(comm, n1, indx1, PETSC_COPY_VALUES, &list1));
848:       PetscCall(ISCreateGeneral(comm, n1, indx2, PETSC_COPY_VALUES, &list2));

850:       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
851:       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
852:       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
853:       PetscCall(VecScatterDestroy(&vecscat));
854:       PetscCall(ISDestroy(&list1));
855:       PetscCall(ISDestroy(&list2));
856:       PetscCall(PetscFree(indx1));
857:       PetscCall(PetscFree(indx2));
858:   #endif
859:       break;

861:     case 3:
862:   #if defined(PETSC_USE_COMPLEX)
863:       fftw_mpi_local_size_3d(dim[0], dim[1], dim[2], comm, &local_n0, &local_0_start);

865:       PetscCall(PetscIntCast(local_n0 * dim[1] * dim[2], &n1));
866:       PetscCall(PetscIntCast(local_0_start * dim[1] * dim[2], &nstart));
867:       PetscCall(PetscIntCast(low, &nlow));
868:       PetscCall(ISCreateStride(comm, n1, nstart, 1, &list1));
869:       PetscCall(ISCreateStride(comm, n1, nlow, 1, &list2));
870:       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
871:       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
872:       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
873:       PetscCall(VecScatterDestroy(&vecscat));
874:       PetscCall(ISDestroy(&list1));
875:       PetscCall(ISDestroy(&list2));
876:   #else
877:       /* buggy, needs to be fixed. See src/mat/tests/ex158.c */
878:       SETERRQ(comm, PETSC_ERR_SUP, "FFTW does not support parallel 3D real transform");
879:       fftw_mpi_local_size_3d_transposed(dim[0], dim[1], dim[2] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);

881:       PetscCall(PetscMalloc1((PetscInt)local_n0 * dim[1] * dim[2], &indx1));
882:       PetscCall(PetscMalloc1((PetscInt)local_n0 * dim[1] * dim[2], &indx2));

884:       if (dim[2] % 2 == 0) NM = dim[2] + 2;
885:       else NM = dim[2] + 1;

887:       for (i = 0; i < local_n0; i++) {
888:         for (j = 0; j < dim[1]; j++) {
889:           for (k = 0; k < dim[2]; k++) {
890:             tempindx  = i * dim[1] * dim[2] + j * dim[2] + k;
891:             tempindx1 = i * dim[1] * NM + j * NM + k;

893:             PetscCall(PetscIntCast(local_0_start * dim[1] * dim[2] + tempindx, &indx1[tempindx]));
894:             PetscCall(PetscIntCast(low + tempindx1, &indx2[tempindx]));
895:           }
896:         }
897:       }

899:       PetscCall(ISCreateGeneral(comm, local_n0 * dim[1] * dim[2], indx1, PETSC_COPY_VALUES, &list1));
900:       PetscCall(ISCreateGeneral(comm, local_n0 * dim[1] * dim[2], indx2, PETSC_COPY_VALUES, &list2));
901:       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
902:       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
903:       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
904:       PetscCall(VecScatterDestroy(&vecscat));
905:       PetscCall(ISDestroy(&list1));
906:       PetscCall(ISDestroy(&list2));
907:       PetscCall(PetscFree(indx1));
908:       PetscCall(PetscFree(indx2));
909:   #endif
910:       break;

912:     default:
913:   #if defined(PETSC_USE_COMPLEX)
914:       fftw_mpi_local_size(fftw->ndim_fftw, fftw->dim_fftw, comm, &local_n0, &local_0_start);

916:       PetscCall(PetscIntCast(local_n0 * (fftw->partial_dim), &n1));
917:       PetscCall(PetscIntCast(local_0_start * (fftw->partial_dim), &nstart));
918:       PetscCall(PetscIntCast(low, &nlow));
919:       PetscCall(ISCreateStride(comm, n1, nstart, 1, &list1));
920:       PetscCall(ISCreateStride(comm, n1, nlow, 1, &list2));
921:       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
922:       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
923:       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
924:       PetscCall(VecScatterDestroy(&vecscat));
925:       PetscCall(ISDestroy(&list1));
926:       PetscCall(ISDestroy(&list2));
927:   #else
928:       /* buggy, needs to be fixed. See src/mat/tests/ex158.c */
929:       SETERRQ(comm, PETSC_ERR_SUP, "FFTW does not support parallel DIM>3 real transform");
930:       temp = (fftw->dim_fftw)[fftw->ndim_fftw - 1];

932:       (fftw->dim_fftw)[fftw->ndim_fftw - 1] = temp / 2 + 1;

934:       fftw_mpi_local_size_transposed(fftw->ndim_fftw, fftw->dim_fftw, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);

936:       (fftw->dim_fftw)[fftw->ndim_fftw - 1] = temp;

938:       partial_dim = fftw->partial_dim;

940:       PetscCall(PetscMalloc1((PetscInt)local_n0 * partial_dim, &indx1));
941:       PetscCall(PetscMalloc1((PetscInt)local_n0 * partial_dim, &indx2));

943:       if (dim[ndim - 1] % 2 == 0) NM = 2;
944:       else NM = 1;

946:       j = low;
947:       for (i = 0, k = 1; i < (PetscInt)local_n0 * partial_dim; i++, k++) {
948:         indx1[i] = local_0_start * partial_dim + i;
949:         indx2[i] = j;
950:         if (k % dim[ndim - 1] == 0) j += NM;
951:         j++;
952:       }
953:       PetscCall(ISCreateGeneral(comm, local_n0 * partial_dim, indx1, PETSC_COPY_VALUES, &list1));
954:       PetscCall(ISCreateGeneral(comm, local_n0 * partial_dim, indx2, PETSC_COPY_VALUES, &list2));
955:       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
956:       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
957:       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
958:       PetscCall(VecScatterDestroy(&vecscat));
959:       PetscCall(ISDestroy(&list1));
960:       PetscCall(ISDestroy(&list2));
961:       PetscCall(PetscFree(indx1));
962:       PetscCall(PetscFree(indx2));
963:   #endif
964:       break;
965:     }
966: #endif
967:   }
968:   PetscFunctionReturn(PETSC_SUCCESS);
969: }

971: /*@
972:   VecScatterFFTWToPetsc - Converts `MATFFTW` output vector to a PETSc vector.

974:   Collective

976:   Input Parameters:
977: + A - `MATFFTW` matrix
978: - x - FFTW vector

980:   Output Parameter:
981: . y - PETSc vector

983:   Level: intermediate

985:   Note:
986:   While doing real transform the FFTW output of backward DFT contains extra zeros at the end of last dimension.
987:   `VecScatterFFTWToPetsc()` removes those extra zeros.

989: .seealso: [](ch_matrices), `Mat`, `VecScatterPetscToFFTW()`, `MATFFTW`, `MatCreateVecsFFTW()`
990: @*/
991: PetscErrorCode VecScatterFFTWToPetsc(Mat A, Vec x, Vec y)
992: {
993:   PetscFunctionBegin;
994:   PetscUseMethod(A, "VecScatterFFTWToPetsc_C", (Mat, Vec, Vec), (A, x, y));
995:   PetscFunctionReturn(PETSC_SUCCESS);
996: }

998: static PetscErrorCode VecScatterFFTWToPetsc_FFTW(Mat A, Vec x, Vec y)
999: {
1000:   MPI_Comm    comm;
1001:   Mat_FFT    *fft = (Mat_FFT *)A->data;
1002:   PetscInt    low;
1003:   PetscMPIInt rank, size;
1004:   VecScatter  vecscat;
1005:   IS          list1;

1007:   PetscFunctionBegin;
1008:   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
1009:   PetscCallMPI(MPI_Comm_size(comm, &size));
1010:   PetscCallMPI(MPI_Comm_rank(comm, &rank));
1011:   PetscCall(VecGetOwnershipRange(x, &low, NULL));

1013:   if (size == 1) {
1014:     PetscCall(ISCreateStride(comm, fft->N, 0, 1, &list1));
1015:     PetscCall(VecScatterCreate(x, list1, y, list1, &vecscat));
1016:     PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1017:     PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1018:     PetscCall(VecScatterDestroy(&vecscat));
1019:     PetscCall(ISDestroy(&list1));

1021: #if !PetscDefined(HAVE_MPIUNI)
1022:   } else {
1023:     Mat_FFTW *fftw = (Mat_FFTW *)fft->data;
1024:     PetscInt  ndim = fft->ndim, *dim = fft->dim;
1025:     ptrdiff_t local_n0, local_0_start;
1026:     ptrdiff_t local_n1, local_1_start;
1027:     IS        list2;
1028:   #if !defined(PETSC_USE_COMPLEX)
1029:     PetscInt  i, j, k, partial_dim;
1030:     PetscInt *indx1, *indx2, tempindx, tempindx1;
1031:     PetscInt  NM;
1032:     ptrdiff_t temp;
1033:   #else
1034:     PetscInt nlow, nstart;
1035:   #endif
1036:     PetscInt n1;
1037:     switch (ndim) {
1038:     case 1:
1039:   #if defined(PETSC_USE_COMPLEX)
1040:       fftw_mpi_local_size_1d(dim[0], comm, FFTW_BACKWARD, FFTW_ESTIMATE, &local_n0, &local_0_start, &local_n1, &local_1_start);

1042:       PetscCall(PetscIntCast(local_n1, &n1));
1043:       PetscCall(PetscIntCast(local_1_start, &nstart));
1044:       PetscCall(PetscIntCast(low, &nlow));
1045:       PetscCall(ISCreateStride(comm, n1, nstart, 1, &list1));
1046:       PetscCall(ISCreateStride(comm, n1, nlow, 1, &list2));
1047:       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
1048:       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1049:       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1050:       PetscCall(VecScatterDestroy(&vecscat));
1051:       PetscCall(ISDestroy(&list1));
1052:       PetscCall(ISDestroy(&list2));
1053:   #else
1054:       SETERRQ(comm, PETSC_ERR_SUP, "No support for real parallel 1D FFT");
1055:   #endif
1056:       break;
1057:     case 2:
1058:   #if defined(PETSC_USE_COMPLEX)
1059:       fftw_mpi_local_size_2d(dim[0], dim[1], comm, &local_n0, &local_0_start);

1061:       PetscCall(PetscIntCast(local_n0 * dim[1], &n1));
1062:       PetscCall(PetscIntCast(local_0_start * dim[1], &nstart));
1063:       PetscCall(PetscIntCast(low, &nlow));
1064:       PetscCall(ISCreateStride(comm, n1, nstart, 1, &list1));
1065:       PetscCall(ISCreateStride(comm, n1, nlow, 1, &list2));
1066:       PetscCall(VecScatterCreate(x, list2, y, list1, &vecscat));
1067:       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1068:       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1069:       PetscCall(VecScatterDestroy(&vecscat));
1070:       PetscCall(ISDestroy(&list1));
1071:       PetscCall(ISDestroy(&list2));
1072:   #else
1073:       fftw_mpi_local_size_2d_transposed(dim[0], dim[1] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);

1075:       PetscCall(PetscMalloc1((PetscInt)local_n0 * dim[1], &indx1));
1076:       PetscCall(PetscMalloc1((PetscInt)local_n0 * dim[1], &indx2));

1078:       if (dim[1] % 2 == 0) NM = dim[1] + 2;
1079:       else NM = dim[1] + 1;

1081:       for (i = 0; i < local_n0; i++) {
1082:         for (j = 0; j < dim[1]; j++) {
1083:           tempindx  = i * dim[1] + j;
1084:           tempindx1 = i * NM + j;

1086:           PetscCall(PetscIntCast(local_0_start * dim[1] + tempindx, &indx1[tempindx]));
1087:           PetscCall(PetscIntCast(low + tempindx1, &indx2[tempindx]));
1088:         }
1089:       }

1091:       PetscCall(PetscIntCast(local_n0 * dim[1], &n1));
1092:       PetscCall(ISCreateGeneral(comm, n1, indx1, PETSC_COPY_VALUES, &list1));
1093:       PetscCall(ISCreateGeneral(comm, n1, indx2, PETSC_COPY_VALUES, &list2));

1095:       PetscCall(VecScatterCreate(x, list2, y, list1, &vecscat));
1096:       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1097:       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1098:       PetscCall(VecScatterDestroy(&vecscat));
1099:       PetscCall(ISDestroy(&list1));
1100:       PetscCall(ISDestroy(&list2));
1101:       PetscCall(PetscFree(indx1));
1102:       PetscCall(PetscFree(indx2));
1103:   #endif
1104:       break;
1105:     case 3:
1106:   #if defined(PETSC_USE_COMPLEX)
1107:       fftw_mpi_local_size_3d(dim[0], dim[1], dim[2], comm, &local_n0, &local_0_start);

1109:       PetscCall(PetscIntCast(local_n0 * dim[1] * dim[2], &n1));
1110:       PetscCall(PetscIntCast(local_0_start * dim[1] * dim[2], &nstart));
1111:       PetscCall(PetscIntCast(low, &nlow));
1112:       PetscCall(ISCreateStride(comm, n1, nstart, 1, &list1));
1113:       PetscCall(ISCreateStride(comm, n1, nlow, 1, &list2));
1114:       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
1115:       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1116:       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1117:       PetscCall(VecScatterDestroy(&vecscat));
1118:       PetscCall(ISDestroy(&list1));
1119:       PetscCall(ISDestroy(&list2));
1120:   #else
1121:       fftw_mpi_local_size_3d_transposed(dim[0], dim[1], dim[2] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);

1123:       PetscCall(PetscMalloc1((PetscInt)local_n0 * dim[1] * dim[2], &indx1));
1124:       PetscCall(PetscMalloc1((PetscInt)local_n0 * dim[1] * dim[2], &indx2));

1126:       if (dim[2] % 2 == 0) NM = dim[2] + 2;
1127:       else NM = dim[2] + 1;

1129:       for (i = 0; i < local_n0; i++) {
1130:         for (j = 0; j < dim[1]; j++) {
1131:           for (k = 0; k < dim[2]; k++) {
1132:             tempindx  = i * dim[1] * dim[2] + j * dim[2] + k;
1133:             tempindx1 = i * dim[1] * NM + j * NM + k;

1135:             PetscCall(PetscIntCast(local_0_start * dim[1] * dim[2] + tempindx, &indx1[tempindx]));
1136:             PetscCall(PetscIntCast(low + tempindx1, &indx2[tempindx]));
1137:           }
1138:         }
1139:       }

1141:       PetscCall(PetscIntCast(local_n0 * dim[1] * dim[2], &n1));
1142:       PetscCall(ISCreateGeneral(comm, n1, indx1, PETSC_COPY_VALUES, &list1));
1143:       PetscCall(ISCreateGeneral(comm, n1, indx2, PETSC_COPY_VALUES, &list2));

1145:       PetscCall(VecScatterCreate(x, list2, y, list1, &vecscat));
1146:       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1147:       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1148:       PetscCall(VecScatterDestroy(&vecscat));
1149:       PetscCall(ISDestroy(&list1));
1150:       PetscCall(ISDestroy(&list2));
1151:       PetscCall(PetscFree(indx1));
1152:       PetscCall(PetscFree(indx2));
1153:   #endif
1154:       break;
1155:     default:
1156:   #if defined(PETSC_USE_COMPLEX)
1157:       fftw_mpi_local_size(fftw->ndim_fftw, fftw->dim_fftw, comm, &local_n0, &local_0_start);

1159:       PetscCall(PetscIntCast(local_n0 * (fftw->partial_dim), &n1));
1160:       PetscCall(PetscIntCast(local_0_start * (fftw->partial_dim), &nstart));
1161:       PetscCall(PetscIntCast(low, &nlow));
1162:       PetscCall(ISCreateStride(comm, n1, nstart, 1, &list1));
1163:       PetscCall(ISCreateStride(comm, n1, nlow, 1, &list2));
1164:       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
1165:       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1166:       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1167:       PetscCall(VecScatterDestroy(&vecscat));
1168:       PetscCall(ISDestroy(&list1));
1169:       PetscCall(ISDestroy(&list2));
1170:   #else
1171:       temp = (fftw->dim_fftw)[fftw->ndim_fftw - 1];

1173:       (fftw->dim_fftw)[fftw->ndim_fftw - 1] = temp / 2 + 1;

1175:       fftw_mpi_local_size_transposed(fftw->ndim_fftw, fftw->dim_fftw, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);

1177:       (fftw->dim_fftw)[fftw->ndim_fftw - 1] = temp;

1179:       partial_dim = fftw->partial_dim;

1181:       PetscCall(PetscMalloc1((PetscInt)local_n0 * partial_dim, &indx1));
1182:       PetscCall(PetscMalloc1((PetscInt)local_n0 * partial_dim, &indx2));

1184:       if (dim[ndim - 1] % 2 == 0) NM = 2;
1185:       else NM = 1;

1187:       j = low;
1188:       for (i = 0, k = 1; i < (PetscInt)local_n0 * partial_dim; i++, k++) {
1189:         PetscCall(PetscIntCast(local_0_start * partial_dim + i, &indx1[i]));
1190:         indx2[i] = j;
1191:         if (k % dim[ndim - 1] == 0) j += NM;
1192:         j++;
1193:       }
1194:       PetscCall(PetscIntCast(local_n0 * partial_dim, &n1));
1195:       PetscCall(ISCreateGeneral(comm, n1, indx1, PETSC_COPY_VALUES, &list1));
1196:       PetscCall(ISCreateGeneral(comm, n1, indx2, PETSC_COPY_VALUES, &list2));

1198:       PetscCall(VecScatterCreate(x, list2, y, list1, &vecscat));
1199:       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1200:       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1201:       PetscCall(VecScatterDestroy(&vecscat));
1202:       PetscCall(ISDestroy(&list1));
1203:       PetscCall(ISDestroy(&list2));
1204:       PetscCall(PetscFree(indx1));
1205:       PetscCall(PetscFree(indx2));
1206:   #endif
1207:       break;
1208:     }
1209: #endif
1210:   }
1211:   PetscFunctionReturn(PETSC_SUCCESS);
1212: }

1214: /*MC
1215:   MATFFTW -  "fftw" - Matrix type that provides FFT via the FFTW external package.

1217:   Level: intermediate

1219: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatType`, `MatCreateFFT()`
1220: M*/
1221: PETSC_EXTERN PetscErrorCode MatCreate_FFTW(Mat A)
1222: {
1223:   MPI_Comm    comm;
1224:   Mat_FFT    *fft = (Mat_FFT *)A->data;
1225:   Mat_FFTW   *fftw;
1226:   PetscInt    ndim = fft->ndim, *dim = fft->dim;
1227:   const char *plans[]  = {"FFTW_ESTIMATE", "FFTW_MEASURE", "FFTW_PATIENT", "FFTW_EXHAUSTIVE"};
1228:   unsigned    iplans[] = {FFTW_ESTIMATE, FFTW_MEASURE, FFTW_PATIENT, FFTW_EXHAUSTIVE};
1229:   PetscBool   flg;
1230:   PetscInt    p_flag, partial_dim = 1, ctr;
1231:   PetscMPIInt size, rank;
1232:   ptrdiff_t  *pdim;
1233: #if !defined(PETSC_USE_COMPLEX)
1234:   PetscInt tot_dim;
1235: #endif

1237:   PetscFunctionBegin;
1238:   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
1239:   PetscCallMPI(MPI_Comm_size(comm, &size));
1240:   PetscCallMPI(MPI_Comm_rank(comm, &rank));

1242: #if !PetscDefined(HAVE_MPIUNI)
1243:   fftw_mpi_init();
1244: #endif
1245:   pdim    = (ptrdiff_t *)calloc(ndim, sizeof(ptrdiff_t));
1246:   pdim[0] = dim[0];
1247: #if !defined(PETSC_USE_COMPLEX)
1248:   tot_dim = 2 * dim[0];
1249: #endif
1250:   for (ctr = 1; ctr < ndim; ctr++) {
1251:     partial_dim *= dim[ctr];
1252:     pdim[ctr] = dim[ctr];
1253: #if !defined(PETSC_USE_COMPLEX)
1254:     if (ctr == ndim - 1) tot_dim *= (dim[ctr] / 2 + 1);
1255:     else tot_dim *= dim[ctr];
1256: #endif
1257:   }

1259:   if (size == 1) {
1260: #if defined(PETSC_USE_COMPLEX)
1261:     PetscCall(MatSetSizes(A, fft->N, fft->N, fft->N, fft->N));
1262:     fft->n = fft->N;
1263: #else
1264:     PetscCall(MatSetSizes(A, tot_dim, tot_dim, tot_dim, tot_dim));
1265:     fft->n = tot_dim;
1266: #endif
1267: #if !PetscDefined(HAVE_MPIUNI)
1268:   } else {
1269:     ptrdiff_t local_n0, local_0_start, local_n1, local_1_start;
1270:   #if !defined(PETSC_USE_COMPLEX)
1271:     ptrdiff_t temp;
1272:     PetscInt  N1;
1273:   #else
1274:     PetscInt n1;
1275:   #endif

1277:     switch (ndim) {
1278:     case 1:
1279:   #if !defined(PETSC_USE_COMPLEX)
1280:       SETERRQ(comm, PETSC_ERR_SUP, "FFTW does not support parallel 1D real transform");
1281:   #else
1282:       fftw_mpi_local_size_1d(dim[0], comm, FFTW_FORWARD, FFTW_ESTIMATE, &local_n0, &local_0_start, &local_n1, &local_1_start);
1283:       PetscCall(PetscIntCast(local_n0, &fft->n));
1284:       PetscCall(PetscIntCast(local_n1, &n1));
1285:       PetscCall(MatSetSizes(A, n1, fft->n, fft->N, fft->N));
1286:   #endif
1287:       break;
1288:     case 2:
1289:   #if defined(PETSC_USE_COMPLEX)
1290:       fftw_mpi_local_size_2d(dim[0], dim[1], comm, &local_n0, &local_0_start);
1291:       fft->n = (PetscInt)local_n0 * dim[1];
1292:       PetscCall(MatSetSizes(A, fft->n, fft->n, fft->N, fft->N));
1293:   #else
1294:       fftw_mpi_local_size_2d_transposed(dim[0], dim[1] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);

1296:       fft->n = 2 * (PetscInt)local_n0 * (dim[1] / 2 + 1);
1297:       PetscCall(MatSetSizes(A, fft->n, fft->n, 2 * dim[0] * (dim[1] / 2 + 1), 2 * dim[0] * (dim[1] / 2 + 1)));
1298:   #endif
1299:       break;
1300:     case 3:
1301:   #if defined(PETSC_USE_COMPLEX)
1302:       fftw_mpi_local_size_3d(dim[0], dim[1], dim[2], comm, &local_n0, &local_0_start);

1304:       fft->n = (PetscInt)local_n0 * dim[1] * dim[2];
1305:       PetscCall(MatSetSizes(A, fft->n, fft->n, fft->N, fft->N));
1306:   #else
1307:       fftw_mpi_local_size_3d_transposed(dim[0], dim[1], dim[2] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);

1309:       fft->n = 2 * (PetscInt)local_n0 * dim[1] * (dim[2] / 2 + 1);
1310:       PetscCall(MatSetSizes(A, fft->n, fft->n, 2 * dim[0] * dim[1] * (dim[2] / 2 + 1), 2 * dim[0] * dim[1] * (dim[2] / 2 + 1)));
1311:   #endif
1312:       break;
1313:     default:
1314:   #if defined(PETSC_USE_COMPLEX)
1315:       fftw_mpi_local_size(ndim, pdim, comm, &local_n0, &local_0_start);

1317:       PetscCall(PetscIntCast(local_n0 * partial_dim, &fft->n));
1318:       PetscCall(MatSetSizes(A, fft->n, fft->n, fft->N, fft->N));
1319:   #else
1320:       temp = pdim[ndim - 1];

1322:       pdim[ndim - 1] = temp / 2 + 1;

1324:       fftw_mpi_local_size_transposed(ndim, pdim, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);

1326:       PetscCall(PetscIntCast(2 * (PetscInt)local_n0 * partial_dim * pdim[ndim - 1] / temp, &fft->n));
1327:       N1 = 2 * fft->N * (PetscInt)pdim[ndim - 1] / ((PetscInt)temp);

1329:       pdim[ndim - 1] = temp;

1331:       PetscCall(MatSetSizes(A, fft->n, fft->n, N1, N1));
1332:   #endif
1333:       break;
1334:     }
1335: #endif
1336:   }
1337:   free(pdim);
1338:   PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATFFTW));
1339:   PetscCall(PetscNew(&fftw));
1340:   fft->data = (void *)fftw;

1342:   fftw->ndim_fftw   = ndim; /* This is dimension of fft */
1343:   fftw->partial_dim = partial_dim;

1345:   PetscCall(PetscMalloc1(ndim, &fftw->dim_fftw));
1346:   if (size == 1) {
1347: #if defined(PETSC_USE_64BIT_INDICES)
1348:     fftw->iodims = (fftw_iodim64 *)malloc(sizeof(fftw_iodim64) * ndim);
1349: #else
1350:     fftw->iodims = (fftw_iodim *)malloc(sizeof(fftw_iodim) * ndim);
1351: #endif
1352:   }

1354:   for (ctr = 0; ctr < ndim; ctr++) (fftw->dim_fftw)[ctr] = dim[ctr];

1356:   fftw->p_forward  = NULL;
1357:   fftw->p_backward = NULL;
1358:   fftw->p_flag     = FFTW_ESTIMATE;

1360:   if (size == 1) {
1361:     A->ops->mult          = MatMult_SeqFFTW;
1362:     A->ops->multtranspose = MatMultTranspose_SeqFFTW;
1363: #if !PetscDefined(HAVE_MPIUNI)
1364:   } else {
1365:     A->ops->mult          = MatMult_MPIFFTW;
1366:     A->ops->multtranspose = MatMultTranspose_MPIFFTW;
1367: #endif
1368:   }
1369:   fft->matdestroy = MatDestroy_FFTW;
1370:   A->assembled    = PETSC_TRUE;
1371:   A->preallocated = PETSC_TRUE;

1373:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCreateVecsFFTW_C", MatCreateVecsFFTW_FFTW));
1374:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "VecScatterPetscToFFTW_C", VecScatterPetscToFFTW_FFTW));
1375:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "VecScatterFFTWToPetsc_C", VecScatterFFTWToPetsc_FFTW));

1377:   /* get runtime options */
1378:   PetscOptionsBegin(PetscObjectComm((PetscObject)A), ((PetscObject)A)->prefix, "FFTW Options", "Mat");
1379:   PetscCall(PetscOptionsEList("-mat_fftw_plannerflags", "Planner Flags", "None", plans, 4, plans[0], &p_flag, &flg));
1380:   if (flg) fftw->p_flag = iplans[p_flag];
1381:   PetscOptionsEnd();
1382:   PetscFunctionReturn(PETSC_SUCCESS);
1383: }