Actual source code: fftw.c


  2: /*
  3:     Provides an interface to the FFTW package.
  4:     Testing examples can be found in ~src/mat/tests
  5: */

  7: #include <../src/mat/impls/fft/fft.h>
  8: EXTERN_C_BEGIN
  9: #if !PetscDefined(HAVE_MPIUNI)
 10:   #include <fftw3-mpi.h>
 11: #else
 12:   #include <fftw3.h>
 13: #endif
 14: EXTERN_C_END

 16: typedef struct {
 17:   ptrdiff_t ndim_fftw, *dim_fftw;
 18: #if defined(PETSC_USE_64BIT_INDICES)
 19:   fftw_iodim64 *iodims;
 20: #else
 21:   fftw_iodim *iodims;
 22: #endif
 23:   PetscInt     partial_dim;
 24:   fftw_plan    p_forward, p_backward;
  25:   unsigned     p_flag;                                      /* planner flags: FFTW_ESTIMATE, FFTW_MEASURE, FFTW_PATIENT, FFTW_EXHAUSTIVE */
  26:   PetscScalar *finarray, *foutarray, *binarray, *boutarray; /* keep track of the arrays because an FFTW plan must be
  27:                                                                executed with the arrays with which it was created */
 28: } Mat_FFTW;

 30: extern PetscErrorCode MatMult_SeqFFTW(Mat, Vec, Vec);
 31: extern PetscErrorCode MatMultTranspose_SeqFFTW(Mat, Vec, Vec);
 32: extern PetscErrorCode MatDestroy_FFTW(Mat);
 33: extern PetscErrorCode MatCreateVecsFFTW_FFTW(Mat, Vec *, Vec *, Vec *);
 34: #if !PetscDefined(HAVE_MPIUNI)
 35: extern PetscErrorCode MatMult_MPIFFTW(Mat, Vec, Vec);
 36: extern PetscErrorCode MatMultTranspose_MPIFFTW(Mat, Vec, Vec);
 37: extern PetscErrorCode VecDestroy_MPIFFTW(Vec);
 38: #endif

 40: /*
  41:    MatMult_SeqFFTW performs the serial forward DFT
  42:    Input parameters:
 43:      A - the matrix
 44:      x - the vector on which FDFT will be performed

 46:    Output parameter:
 47:      y - vector that stores result of FDFT
 48: */
 49: PetscErrorCode MatMult_SeqFFTW(Mat A, Vec x, Vec y)
 50: {
 51:   Mat_FFT           *fft  = (Mat_FFT *)A->data;
 52:   Mat_FFTW          *fftw = (Mat_FFTW *)fft->data;
 53:   const PetscScalar *x_array;
 54:   PetscScalar       *y_array;
 55: #if defined(PETSC_USE_COMPLEX)
 56:   #if defined(PETSC_USE_64BIT_INDICES)
 57:   fftw_iodim64 *iodims;
 58:   #else
 59:   fftw_iodim *iodims;
 60:   #endif
 61:   PetscInt i;
 62: #endif
 63:   PetscInt ndim = fft->ndim, *dim = fft->dim;

 65:   VecGetArrayRead(x, &x_array);
 66:   VecGetArray(y, &y_array);
 67:   if (!fftw->p_forward) { /* create a plan, then execute it */
 68:     switch (ndim) {
 69:     case 1:
 70: #if defined(PETSC_USE_COMPLEX)
 71:       fftw->p_forward = fftw_plan_dft_1d(dim[0], (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_FORWARD, fftw->p_flag);
 72: #else
 73:       fftw->p_forward = fftw_plan_dft_r2c_1d(dim[0], (double *)x_array, (fftw_complex *)y_array, fftw->p_flag);
 74: #endif
 75:       break;
 76:     case 2:
 77: #if defined(PETSC_USE_COMPLEX)
 78:       fftw->p_forward = fftw_plan_dft_2d(dim[0], dim[1], (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_FORWARD, fftw->p_flag);
 79: #else
 80:       fftw->p_forward = fftw_plan_dft_r2c_2d(dim[0], dim[1], (double *)x_array, (fftw_complex *)y_array, fftw->p_flag);
 81: #endif
 82:       break;
 83:     case 3:
 84: #if defined(PETSC_USE_COMPLEX)
 85:       fftw->p_forward = fftw_plan_dft_3d(dim[0], dim[1], dim[2], (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_FORWARD, fftw->p_flag);
 86: #else
 87:       fftw->p_forward = fftw_plan_dft_r2c_3d(dim[0], dim[1], dim[2], (double *)x_array, (fftw_complex *)y_array, fftw->p_flag);
 88: #endif
 89:       break;
 90:     default:
 91: #if defined(PETSC_USE_COMPLEX)
 92:       iodims = fftw->iodims;
 93:   #if defined(PETSC_USE_64BIT_INDICES)
 94:       if (ndim) {
 95:         iodims[ndim - 1].n  = (ptrdiff_t)dim[ndim - 1];
 96:         iodims[ndim - 1].is = iodims[ndim - 1].os = 1;
 97:         for (i = ndim - 2; i >= 0; --i) {
 98:           iodims[i].n  = (ptrdiff_t)dim[i];
 99:           iodims[i].is = iodims[i].os = iodims[i + 1].is * iodims[i + 1].n;
100:         }
101:       }
102:       fftw->p_forward = fftw_plan_guru64_dft((int)ndim, (fftw_iodim64 *)iodims, 0, NULL, (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_FORWARD, fftw->p_flag);
103:   #else
104:       if (ndim) {
105:         iodims[ndim - 1].n  = (int)dim[ndim - 1];
106:         iodims[ndim - 1].is = iodims[ndim - 1].os = 1;
107:         for (i = ndim - 2; i >= 0; --i) {
108:           iodims[i].n  = (int)dim[i];
109:           iodims[i].is = iodims[i].os = iodims[i + 1].is * iodims[i + 1].n;
110:         }
111:       }
112:       fftw->p_forward = fftw_plan_guru_dft((int)ndim, (fftw_iodim *)iodims, 0, NULL, (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_FORWARD, fftw->p_flag);
113:   #endif

115: #else
116:       fftw->p_forward = fftw_plan_dft_r2c(ndim, (int *)dim, (double *)x_array, (fftw_complex *)y_array, fftw->p_flag);
117: #endif
118:       break;
119:     }
120:     fftw->finarray  = (PetscScalar *)x_array;
121:     fftw->foutarray = y_array;
 122:     /* Warning: if (fftw->p_flag != FFTW_ESTIMATE) the data in the in/out arrays is overwritten during planning,
 123:                 so planning should be done before x is initialized! See FFTW manual Section 2.1 or Section 4 */
124:     fftw_execute(fftw->p_forward);
125:   } else {                                                         /* use existing plan */
126:     if (fftw->finarray != x_array || fftw->foutarray != y_array) { /* use existing plan on new arrays */
127: #if defined(PETSC_USE_COMPLEX)
128:       fftw_execute_dft(fftw->p_forward, (fftw_complex *)x_array, (fftw_complex *)y_array);
129: #else
130:       fftw_execute_dft_r2c(fftw->p_forward, (double *)x_array, (fftw_complex *)y_array);
131: #endif
132:     } else {
133:       fftw_execute(fftw->p_forward);
134:     }
135:   }
136:   VecRestoreArray(y, &y_array);
137:   VecRestoreArrayRead(x, &x_array);
138:   return 0;
139: }

 141: /* MatMultTranspose_SeqFFTW performs the serial backward DFT
 142:    Input parameters:
143:      A - the matrix
144:      x - the vector on which BDFT will be performed

146:    Output parameter:
147:      y - vector that stores result of BDFT
148: */

150: PetscErrorCode MatMultTranspose_SeqFFTW(Mat A, Vec x, Vec y)
151: {
152:   Mat_FFT           *fft  = (Mat_FFT *)A->data;
153:   Mat_FFTW          *fftw = (Mat_FFTW *)fft->data;
154:   const PetscScalar *x_array;
155:   PetscScalar       *y_array;
156:   PetscInt           ndim = fft->ndim, *dim = fft->dim;
157: #if defined(PETSC_USE_COMPLEX)
158:   #if defined(PETSC_USE_64BIT_INDICES)
159:   fftw_iodim64 *iodims = fftw->iodims;
160:   #else
161:   fftw_iodim *iodims = fftw->iodims;
162:   #endif
163: #endif

165:   VecGetArrayRead(x, &x_array);
166:   VecGetArray(y, &y_array);
167:   if (!fftw->p_backward) { /* create a plan, then execute it */
168:     switch (ndim) {
169:     case 1:
170: #if defined(PETSC_USE_COMPLEX)
171:       fftw->p_backward = fftw_plan_dft_1d(dim[0], (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_BACKWARD, fftw->p_flag);
172: #else
173:       fftw->p_backward = fftw_plan_dft_c2r_1d(dim[0], (fftw_complex *)x_array, (double *)y_array, fftw->p_flag);
174: #endif
175:       break;
176:     case 2:
177: #if defined(PETSC_USE_COMPLEX)
178:       fftw->p_backward = fftw_plan_dft_2d(dim[0], dim[1], (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_BACKWARD, fftw->p_flag);
179: #else
180:       fftw->p_backward = fftw_plan_dft_c2r_2d(dim[0], dim[1], (fftw_complex *)x_array, (double *)y_array, fftw->p_flag);
181: #endif
182:       break;
183:     case 3:
184: #if defined(PETSC_USE_COMPLEX)
185:       fftw->p_backward = fftw_plan_dft_3d(dim[0], dim[1], dim[2], (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_BACKWARD, fftw->p_flag);
186: #else
187:       fftw->p_backward = fftw_plan_dft_c2r_3d(dim[0], dim[1], dim[2], (fftw_complex *)x_array, (double *)y_array, fftw->p_flag);
188: #endif
189:       break;
190:     default:
191: #if defined(PETSC_USE_COMPLEX)
192:   #if defined(PETSC_USE_64BIT_INDICES)
193:       fftw->p_backward = fftw_plan_guru64_dft((int)ndim, (fftw_iodim64 *)iodims, 0, NULL, (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_BACKWARD, fftw->p_flag);
194:   #else
195:       fftw->p_backward = fftw_plan_guru_dft((int)ndim, iodims, 0, NULL, (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_BACKWARD, fftw->p_flag);
196:   #endif
197: #else
198:       fftw->p_backward = fftw_plan_dft_c2r((int)ndim, (int *)dim, (fftw_complex *)x_array, (double *)y_array, fftw->p_flag);
199: #endif
200:       break;
201:     }
202:     fftw->binarray  = (PetscScalar *)x_array;
203:     fftw->boutarray = y_array;
204:     fftw_execute(fftw->p_backward);
205:   } else {                                                         /* use existing plan */
206:     if (fftw->binarray != x_array || fftw->boutarray != y_array) { /* use existing plan on new arrays */
207: #if defined(PETSC_USE_COMPLEX)
208:       fftw_execute_dft(fftw->p_backward, (fftw_complex *)x_array, (fftw_complex *)y_array);
209: #else
210:       fftw_execute_dft_c2r(fftw->p_backward, (fftw_complex *)x_array, (double *)y_array);
211: #endif
212:     } else {
213:       fftw_execute(fftw->p_backward);
214:     }
215:   }
216:   VecRestoreArray(y, &y_array);
217:   VecRestoreArrayRead(x, &x_array);
218:   return 0;
219: }
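
To make the behavior of the two routines above concrete, here is a minimal round-trip sketch (not part of this file), assuming a complex-scalar PETSc build and a hypothetical 1-D size N = 16. Because FFTW transforms are unnormalized, a forward followed by a backward transform returns the input scaled by N:

    static PetscErrorCode RoundTripExample(void)
    {
      Mat      A;
      Vec      x, y, z;
      PetscInt dim[1] = {16}, N = 16;

      MatCreateFFT(PETSC_COMM_SELF, 1, dim, MATFFTW, &A); /* builds a MATFFTW matrix */
      MatCreateVecsFFTW(A, &x, &y, &z);                   /* layout-compatible vectors */
      VecSet(x, 1.0);
      MatMult(A, x, y);          /* forward DFT */
      MatMultTranspose(A, y, z); /* backward DFT: z = N*x */
      VecScale(z, 1.0 / N);      /* undo FFTW's scaling so z equals x */
      VecDestroy(&x);
      VecDestroy(&y);
      VecDestroy(&z);
      MatDestroy(&A);
      return 0;
    }
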

221: #if !PetscDefined(HAVE_MPIUNI)
 222: /* MatMult_MPIFFTW performs the parallel forward DFT
 223:    Input parameters:
224:      A - the matrix
225:      x - the vector on which FDFT will be performed

227:    Output parameter:
 228:      y - vector that stores result of FDFT
229: */
230: PetscErrorCode MatMult_MPIFFTW(Mat A, Vec x, Vec y)
231: {
232:   Mat_FFT           *fft  = (Mat_FFT *)A->data;
233:   Mat_FFTW          *fftw = (Mat_FFTW *)fft->data;
234:   const PetscScalar *x_array;
235:   PetscScalar       *y_array;
236:   PetscInt           ndim = fft->ndim, *dim = fft->dim;
237:   MPI_Comm           comm;

239:   PetscObjectGetComm((PetscObject)A, &comm);
240:   VecGetArrayRead(x, &x_array);
241:   VecGetArray(y, &y_array);
242:   if (!fftw->p_forward) { /* create a plan, then execute it */
243:     switch (ndim) {
244:     case 1:
245:   #if defined(PETSC_USE_COMPLEX)
246:       fftw->p_forward = fftw_mpi_plan_dft_1d(dim[0], (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_FORWARD, fftw->p_flag);
247:   #else
 248:       SETERRQ(comm, PETSC_ERR_SUP, "FFTW does not support parallel 1D real transform");
249:   #endif
250:       break;
251:     case 2:
252:   #if defined(PETSC_USE_COMPLEX) /* For complex transforms call fftw_mpi_plan_dft, for real transforms call fftw_mpi_plan_dft_r2c */
253:       fftw->p_forward = fftw_mpi_plan_dft_2d(dim[0], dim[1], (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_FORWARD, fftw->p_flag);
254:   #else
255:       fftw->p_forward = fftw_mpi_plan_dft_r2c_2d(dim[0], dim[1], (double *)x_array, (fftw_complex *)y_array, comm, FFTW_ESTIMATE);
256:   #endif
257:       break;
258:     case 3:
259:   #if defined(PETSC_USE_COMPLEX)
260:       fftw->p_forward = fftw_mpi_plan_dft_3d(dim[0], dim[1], dim[2], (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_FORWARD, fftw->p_flag);
261:   #else
262:       fftw->p_forward = fftw_mpi_plan_dft_r2c_3d(dim[0], dim[1], dim[2], (double *)x_array, (fftw_complex *)y_array, comm, FFTW_ESTIMATE);
263:   #endif
264:       break;
265:     default:
266:   #if defined(PETSC_USE_COMPLEX)
267:       fftw->p_forward = fftw_mpi_plan_dft(fftw->ndim_fftw, fftw->dim_fftw, (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_FORWARD, fftw->p_flag);
268:   #else
269:       fftw->p_forward = fftw_mpi_plan_dft_r2c(fftw->ndim_fftw, fftw->dim_fftw, (double *)x_array, (fftw_complex *)y_array, comm, FFTW_ESTIMATE);
270:   #endif
271:       break;
272:     }
273:     fftw->finarray  = (PetscScalar *)x_array;
274:     fftw->foutarray = y_array;
 275:     /* Warning: if (fftw->p_flag != FFTW_ESTIMATE) the data in the in/out arrays is overwritten during planning,
 276:                 so planning should be done before x is initialized! See FFTW manual Section 2.1 or Section 4 */
277:     fftw_execute(fftw->p_forward);
278:   } else {                                                         /* use existing plan */
279:     if (fftw->finarray != x_array || fftw->foutarray != y_array) { /* use existing plan on new arrays */
280:       fftw_execute_dft(fftw->p_forward, (fftw_complex *)x_array, (fftw_complex *)y_array);
281:     } else {
282:       fftw_execute(fftw->p_forward);
283:     }
284:   }
285:   VecRestoreArray(y, &y_array);
286:   VecRestoreArrayRead(x, &x_array);
287:   return 0;
288: }

290: /*
 291:    MatMultTranspose_MPIFFTW performs the parallel backward DFT
 292:    Input parameters:
293:      A - the matrix
294:      x - the vector on which BDFT will be performed

296:    Output parameter:
297:      y - vector that stores result of BDFT
298: */
299: PetscErrorCode MatMultTranspose_MPIFFTW(Mat A, Vec x, Vec y)
300: {
301:   Mat_FFT           *fft  = (Mat_FFT *)A->data;
302:   Mat_FFTW          *fftw = (Mat_FFTW *)fft->data;
303:   const PetscScalar *x_array;
304:   PetscScalar       *y_array;
305:   PetscInt           ndim = fft->ndim, *dim = fft->dim;
306:   MPI_Comm           comm;

308:   PetscObjectGetComm((PetscObject)A, &comm);
309:   VecGetArrayRead(x, &x_array);
310:   VecGetArray(y, &y_array);
311:   if (!fftw->p_backward) { /* create a plan, then execute it */
312:     switch (ndim) {
313:     case 1:
314:   #if defined(PETSC_USE_COMPLEX)
315:       fftw->p_backward = fftw_mpi_plan_dft_1d(dim[0], (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_BACKWARD, fftw->p_flag);
316:   #else
 317:       SETERRQ(comm, PETSC_ERR_SUP, "FFTW does not support parallel 1D real transform");
318:   #endif
319:       break;
320:     case 2:
321:   #if defined(PETSC_USE_COMPLEX) /* For complex transforms call fftw_mpi_plan_dft with flag FFTW_BACKWARD, for real transforms call fftw_mpi_plan_dft_c2r */
322:       fftw->p_backward = fftw_mpi_plan_dft_2d(dim[0], dim[1], (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_BACKWARD, fftw->p_flag);
323:   #else
324:       fftw->p_backward = fftw_mpi_plan_dft_c2r_2d(dim[0], dim[1], (fftw_complex *)x_array, (double *)y_array, comm, FFTW_ESTIMATE);
325:   #endif
326:       break;
327:     case 3:
328:   #if defined(PETSC_USE_COMPLEX)
329:       fftw->p_backward = fftw_mpi_plan_dft_3d(dim[0], dim[1], dim[2], (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_BACKWARD, fftw->p_flag);
330:   #else
331:       fftw->p_backward = fftw_mpi_plan_dft_c2r_3d(dim[0], dim[1], dim[2], (fftw_complex *)x_array, (double *)y_array, comm, FFTW_ESTIMATE);
332:   #endif
333:       break;
334:     default:
335:   #if defined(PETSC_USE_COMPLEX)
336:       fftw->p_backward = fftw_mpi_plan_dft(fftw->ndim_fftw, fftw->dim_fftw, (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_BACKWARD, fftw->p_flag);
337:   #else
338:       fftw->p_backward = fftw_mpi_plan_dft_c2r(fftw->ndim_fftw, fftw->dim_fftw, (fftw_complex *)x_array, (double *)y_array, comm, FFTW_ESTIMATE);
339:   #endif
340:       break;
341:     }
342:     fftw->binarray  = (PetscScalar *)x_array;
343:     fftw->boutarray = y_array;
344:     fftw_execute(fftw->p_backward);
345:   } else {                                                         /* use existing plan */
346:     if (fftw->binarray != x_array || fftw->boutarray != y_array) { /* use existing plan on new arrays */
347:       fftw_execute_dft(fftw->p_backward, (fftw_complex *)x_array, (fftw_complex *)y_array);
348:     } else {
349:       fftw_execute(fftw->p_backward);
350:     }
351:   }
352:   VecRestoreArray(y, &y_array);
353:   VecRestoreArrayRead(x, &x_array);
354:   return 0;
355: }
356: #endif

358: PetscErrorCode MatDestroy_FFTW(Mat A)
359: {
360:   Mat_FFT  *fft  = (Mat_FFT *)A->data;
361:   Mat_FFTW *fftw = (Mat_FFTW *)fft->data;

 363:   if (fftw->p_forward) fftw_destroy_plan(fftw->p_forward);
 364:   if (fftw->p_backward) fftw_destroy_plan(fftw->p_backward);
365:   if (fftw->iodims) free(fftw->iodims);
366:   PetscFree(fftw->dim_fftw);
367:   PetscFree(fft->data);
368:   PetscObjectComposeFunction((PetscObject)A, "MatCreateVecsFFTW_C", NULL);
369:   PetscObjectComposeFunction((PetscObject)A, "VecScatterPetscToFFTW_C", NULL);
370:   PetscObjectComposeFunction((PetscObject)A, "VecScatterFFTWToPetsc_C", NULL);
371: #if !PetscDefined(HAVE_MPIUNI)
372:   fftw_mpi_cleanup();
373: #endif
374:   return 0;
375: }

377: #if !PetscDefined(HAVE_MPIUNI)
378: #include <../src/vec/vec/impls/mpi/pvecimpl.h>
379: PetscErrorCode VecDestroy_MPIFFTW(Vec v)
380: {
381:   PetscScalar *array;

383:   VecGetArray(v, &array);
384:   fftw_free((fftw_complex *)array);
385:   VecRestoreArray(v, &array);
386:   VecDestroy_MPI(v);
387:   return 0;
388: }
389: #endif

391: #if !PetscDefined(HAVE_MPIUNI)
392: static PetscErrorCode VecDuplicate_FFTW_fin(Vec fin, Vec *fin_new)
393: {
394:   Mat A;

396:   PetscObjectQuery((PetscObject)fin, "FFTmatrix", (PetscObject *)&A);
397:   MatCreateVecsFFTW_FFTW(A, fin_new, NULL, NULL);
398:   return 0;
399: }

401: static PetscErrorCode VecDuplicate_FFTW_fout(Vec fout, Vec *fout_new)
402: {
403:   Mat A;

405:   PetscObjectQuery((PetscObject)fout, "FFTmatrix", (PetscObject *)&A);
406:   MatCreateVecsFFTW_FFTW(A, NULL, fout_new, NULL);
407:   return 0;
408: }

410: static PetscErrorCode VecDuplicate_FFTW_bout(Vec bout, Vec *bout_new)
411: {
412:   Mat A;

414:   PetscObjectQuery((PetscObject)bout, "FFTmatrix", (PetscObject *)&A);
415:   MatCreateVecsFFTW_FFTW(A, NULL, NULL, bout_new);
416:   return 0;
417: }
418: #endif

420: /*@
421:    MatCreateVecsFFTW - Get vector(s) compatible with the matrix, i.e. with the
422:      parallel layout determined by `MATFFTW`

424:    Collective

426:    Input Parameter:
427: .   A - the matrix

429:    Output Parameters:
430: +   x - (optional) input vector of forward FFTW
431: .   y - (optional) output vector of forward FFTW
432: -   z - (optional) output vector of backward FFTW

434:   Level: advanced

436:   Notes:
 437:   The parallel layout of the output of the forward FFTW is always the same as the input
 438:   of the backward FFTW, but the parallel layout of the input vector of the forward
 439:   FFTW might not be the same as the output of the backward FFTW.

 441:   Parallel real transforms need extra space: extra zeros must be padded at the end of
 442:   the last dimension. For this reason one needs to invoke the
 443:   fftw_mpi_local_size_transposed() routine, and remember to change the last dimension
 444:   from n to n/2+1 when invoking it. The number of zeros to be padded depends on
 445:   whether the last dimension is even or odd: an even last dimension needs two padded
 446:   zeros, an odd one needs only one.

 448:   Lastly, one needs some scratch space at the end of the data set on each process.
 449:   alloc_local, as returned by the fftw_mpi_local_size routines, accounts for both the
 450:   data and this scratch space on each process.

452:   Developer Note:
453:   How come `MatCreateVecs()` doesn't produce the correctly padded vectors automatically?

455: .seealso: `MATFFTW`, `MatCreateFFT()`, `MatCreateVecs()`
456: @*/
457: PetscErrorCode MatCreateVecsFFTW(Mat A, Vec *x, Vec *y, Vec *z)
458: {
459:   PetscUseMethod(A, "MatCreateVecsFFTW_C", (Mat, Vec *, Vec *, Vec *), (A, x, y, z));
460:   return 0;
461: }
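
As a small worked instance of the padding rule described in the Notes above (a hypothetical helper, shown only for illustration): a real transform whose last dimension is n stores each row of that dimension in 2*(n/2+1) doubles, so n = 5 gains one padding zero and n = 4 gains two:

    static PetscInt PaddedLastDim(PetscInt n)
    {
      return 2 * (n / 2 + 1); /* n = 5 -> 6 (one padded zero); n = 4 -> 6 (two padded zeros) */
    }
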

463: PetscErrorCode MatCreateVecsFFTW_FFTW(Mat A, Vec *fin, Vec *fout, Vec *bout)
464: {
465:   PetscMPIInt size, rank;
466:   MPI_Comm    comm;
467:   Mat_FFT    *fft = (Mat_FFT *)A->data;

471:   PetscObjectGetComm((PetscObject)A, &comm);

473:   MPI_Comm_size(comm, &size);
474:   MPI_Comm_rank(comm, &rank);
475:   if (size == 1) { /* sequential case */
476: #if defined(PETSC_USE_COMPLEX)
477:     if (fin) VecCreateSeq(PETSC_COMM_SELF, fft->N, fin);
478:     if (fout) VecCreateSeq(PETSC_COMM_SELF, fft->N, fout);
479:     if (bout) VecCreateSeq(PETSC_COMM_SELF, fft->N, bout);
480: #else
481:     if (fin) VecCreateSeq(PETSC_COMM_SELF, fft->n, fin);
482:     if (fout) VecCreateSeq(PETSC_COMM_SELF, fft->n, fout);
483:     if (bout) VecCreateSeq(PETSC_COMM_SELF, fft->n, bout);
484: #endif
485: #if !PetscDefined(HAVE_MPIUNI)
486:   } else { /* parallel cases */
487:     Mat_FFTW     *fftw = (Mat_FFTW *)fft->data;
488:     PetscInt      ndim = fft->ndim, *dim = fft->dim;
489:     ptrdiff_t     alloc_local, local_n0, local_0_start;
490:     ptrdiff_t     local_n1;
491:     fftw_complex *data_fout;
492:     ptrdiff_t     local_1_start;
493:   #if defined(PETSC_USE_COMPLEX)
494:     fftw_complex *data_fin, *data_bout;
495:   #else
496:     double   *data_finr, *data_boutr;
497:     PetscInt  n1, N1;
498:     ptrdiff_t temp;
499:   #endif

501:     switch (ndim) {
502:     case 1:
503:   #if !defined(PETSC_USE_COMPLEX)
504:       SETERRQ(comm, PETSC_ERR_SUP, "FFTW does not allow parallel real 1D transform");
505:   #else
506:       alloc_local = fftw_mpi_local_size_1d(dim[0], comm, FFTW_FORWARD, FFTW_ESTIMATE, &local_n0, &local_0_start, &local_n1, &local_1_start);
507:       if (fin) {
508:         data_fin = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
509:         VecCreateMPIWithArray(comm, 1, local_n0, fft->N, (const PetscScalar *)data_fin, fin);
510:         PetscObjectCompose((PetscObject)*fin, "FFTmatrix", (PetscObject)A);
511:         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
512:         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
513:       }
514:       if (fout) {
515:         data_fout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
516:         VecCreateMPIWithArray(comm, 1, local_n1, fft->N, (const PetscScalar *)data_fout, fout);
517:         PetscObjectCompose((PetscObject)*fout, "FFTmatrix", (PetscObject)A);
518:         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
519:         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
520:       }
521:       alloc_local = fftw_mpi_local_size_1d(dim[0], comm, FFTW_BACKWARD, FFTW_ESTIMATE, &local_n0, &local_0_start, &local_n1, &local_1_start);
522:       if (bout) {
523:         data_bout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
524:         VecCreateMPIWithArray(comm, 1, local_n1, fft->N, (const PetscScalar *)data_bout, bout);
525:         PetscObjectCompose((PetscObject)*bout, "FFTmatrix", (PetscObject)A);
 526:         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
527:         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
528:       }
529:       break;
530:   #endif
531:     case 2:
532:   #if !defined(PETSC_USE_COMPLEX) /* Note that N1 is no more the product of individual dimensions */
533:       alloc_local = fftw_mpi_local_size_2d_transposed(dim[0], dim[1] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
534:       N1          = 2 * dim[0] * (dim[1] / 2 + 1);
535:       n1          = 2 * local_n0 * (dim[1] / 2 + 1);
536:       if (fin) {
537:         data_finr = (double *)fftw_malloc(sizeof(double) * alloc_local * 2);
538:         VecCreateMPIWithArray(comm, 1, (PetscInt)n1, N1, (PetscScalar *)data_finr, fin);
539:         PetscObjectCompose((PetscObject)*fin, "FFTmatrix", (PetscObject)A);
540:         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
541:         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
542:       }
543:       if (fout) {
544:         data_fout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
545:         VecCreateMPIWithArray(comm, 1, (PetscInt)n1, N1, (PetscScalar *)data_fout, fout);
546:         PetscObjectCompose((PetscObject)*fout, "FFTmatrix", (PetscObject)A);
547:         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
548:         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
549:       }
550:       if (bout) {
551:         data_boutr = (double *)fftw_malloc(sizeof(double) * alloc_local * 2);
552:         VecCreateMPIWithArray(comm, 1, (PetscInt)n1, N1, (PetscScalar *)data_boutr, bout);
553:         PetscObjectCompose((PetscObject)*bout, "FFTmatrix", (PetscObject)A);
554:         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
555:         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
556:       }
557:   #else
558:       /* Get local size */
559:       alloc_local = fftw_mpi_local_size_2d(dim[0], dim[1], comm, &local_n0, &local_0_start);
560:       if (fin) {
561:         data_fin = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
562:         VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_fin, fin);
563:         PetscObjectCompose((PetscObject)*fin, "FFTmatrix", (PetscObject)A);
564:         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
565:         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
566:       }
567:       if (fout) {
568:         data_fout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
569:         VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_fout, fout);
570:         PetscObjectCompose((PetscObject)*fout, "FFTmatrix", (PetscObject)A);
571:         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
572:         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
573:       }
574:       if (bout) {
575:         data_bout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
576:         VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_bout, bout);
577:         PetscObjectCompose((PetscObject)*bout, "FFTmatrix", (PetscObject)A);
578:         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
579:         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
580:       }
581:   #endif
582:       break;
583:     case 3:
584:   #if !defined(PETSC_USE_COMPLEX)
585:       alloc_local = fftw_mpi_local_size_3d_transposed(dim[0], dim[1], dim[2] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
586:       N1          = 2 * dim[0] * dim[1] * (dim[2] / 2 + 1);
587:       n1          = 2 * local_n0 * dim[1] * (dim[2] / 2 + 1);
588:       if (fin) {
589:         data_finr = (double *)fftw_malloc(sizeof(double) * alloc_local * 2);
590:         VecCreateMPIWithArray(comm, 1, (PetscInt)n1, N1, (PetscScalar *)data_finr, fin);
591:         PetscObjectCompose((PetscObject)*fin, "FFTmatrix", (PetscObject)A);
592:         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
593:         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
594:       }
595:       if (fout) {
596:         data_fout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
597:         VecCreateMPIWithArray(comm, 1, n1, N1, (PetscScalar *)data_fout, fout);
598:         PetscObjectCompose((PetscObject)*fout, "FFTmatrix", (PetscObject)A);
599:         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
600:         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
601:       }
602:       if (bout) {
603:         data_boutr = (double *)fftw_malloc(sizeof(double) * alloc_local * 2);
604:         VecCreateMPIWithArray(comm, 1, (PetscInt)n1, N1, (PetscScalar *)data_boutr, bout);
605:         PetscObjectCompose((PetscObject)*bout, "FFTmatrix", (PetscObject)A);
606:         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
607:         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
608:       }
609:   #else
610:       alloc_local = fftw_mpi_local_size_3d(dim[0], dim[1], dim[2], comm, &local_n0, &local_0_start);
611:       if (fin) {
612:         data_fin = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
613:         VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_fin, fin);
614:         PetscObjectCompose((PetscObject)*fin, "FFTmatrix", (PetscObject)A);
615:         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
616:         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
617:       }
618:       if (fout) {
619:         data_fout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
620:         VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_fout, fout);
621:         PetscObjectCompose((PetscObject)*fout, "FFTmatrix", (PetscObject)A);
622:         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
623:         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
624:       }
625:       if (bout) {
626:         data_bout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
627:         VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_bout, bout);
628:         PetscObjectCompose((PetscObject)*bout, "FFTmatrix", (PetscObject)A);
629:         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
630:         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
631:       }
632:   #endif
633:       break;
634:     default:
635:   #if !defined(PETSC_USE_COMPLEX)
636:       temp                                  = (fftw->dim_fftw)[fftw->ndim_fftw - 1];
637:       (fftw->dim_fftw)[fftw->ndim_fftw - 1] = temp / 2 + 1;
638:       alloc_local                           = fftw_mpi_local_size_transposed(fftw->ndim_fftw, fftw->dim_fftw, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
639:       N1                                    = 2 * fft->N * (PetscInt)((fftw->dim_fftw)[fftw->ndim_fftw - 1]) / ((PetscInt)temp);
640:       (fftw->dim_fftw)[fftw->ndim_fftw - 1] = temp;

642:       if (fin) {
643:         data_finr = (double *)fftw_malloc(sizeof(double) * alloc_local * 2);
644:         VecCreateMPIWithArray(comm, 1, fft->n, N1, (PetscScalar *)data_finr, fin);
645:         PetscObjectCompose((PetscObject)*fin, "FFTmatrix", (PetscObject)A);
646:         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
647:         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
648:       }
649:       if (fout) {
650:         data_fout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
651:         VecCreateMPIWithArray(comm, 1, fft->n, N1, (PetscScalar *)data_fout, fout);
652:         PetscObjectCompose((PetscObject)*fout, "FFTmatrix", (PetscObject)A);
653:         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
654:         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
655:       }
656:       if (bout) {
657:         data_boutr = (double *)fftw_malloc(sizeof(double) * alloc_local * 2);
658:         VecCreateMPIWithArray(comm, 1, fft->n, N1, (PetscScalar *)data_boutr, bout);
659:         PetscObjectCompose((PetscObject)*bout, "FFTmatrix", (PetscObject)A);
660:         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
661:         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
662:       }
663:   #else
664:       alloc_local = fftw_mpi_local_size(fftw->ndim_fftw, fftw->dim_fftw, comm, &local_n0, &local_0_start);
665:       if (fin) {
666:         data_fin = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
667:         VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_fin, fin);
668:         PetscObjectCompose((PetscObject)*fin, "FFTmatrix", (PetscObject)A);
669:         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
670:         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
671:       }
672:       if (fout) {
673:         data_fout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
674:         VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_fout, fout);
675:         PetscObjectCompose((PetscObject)*fout, "FFTmatrix", (PetscObject)A);
676:         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
677:         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
678:       }
679:       if (bout) {
680:         data_bout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
681:         VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_bout, bout);
682:         PetscObjectCompose((PetscObject)*bout, "FFTmatrix", (PetscObject)A);
683:         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
684:         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
685:       }
686:   #endif
687:       break;
688:     }
 689:     /* fftw vectors have their data array allocated by fftw_malloc, such that v->array=xxx but
 690:        v->array_allocated=NULL. A regular replacearray call would not free that memory and would
 691:        only cause memory leaks. We void these pointers here to avoid accidental use.
 692:     */
693:     if (fin) (*fin)->ops->replacearray = NULL;
694:     if (fout) (*fout)->ops->replacearray = NULL;
695:     if (bout) (*bout)->ops->replacearray = NULL;
696: #endif
697:   }
698:   return 0;
699: }

701: /*@
702:    VecScatterPetscToFFTW - Copies a PETSc vector to the vector that goes into `MATFFTW` calls.

704:    Collective

706:    Input Parameters:
707: +  A - FFTW matrix
708: -  x - the PETSc vector

 710:    Output Parameter:
711: .  y - the FFTW vector

713:    Level: intermediate

715:    Note:
 716:    For parallel real FFTs, FFTW requires insertion of extra space at the end of the last dimension. This is required even when
 717:    one is not doing an in-place transform. The last dimension size must be changed to 2*(dim[last]/2+1) to accommodate these extra
 718:    zeros. This routine does that job with a scatter operation.

720: .seealso: `MATFFTW`, `VecScatterFFTWToPetsc()`, `MatCreateVecsFFTW()`
721: @*/
722: PetscErrorCode VecScatterPetscToFFTW(Mat A, Vec x, Vec y)
723: {
724:   PetscUseMethod(A, "VecScatterPetscToFFTW_C", (Mat, Vec, Vec), (A, x, y));
725:   return 0;
726: }
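
A minimal sketch of the intended call sequence (a hypothetical helper, not part of this file), where x is an ordinary PETSc vector and xin/xout were obtained from MatCreateVecsFFTW():

    static PetscErrorCode ForwardWithScatter(Mat A, Vec x, Vec xin, Vec xout)
    {
      VecScatterPetscToFFTW(A, x, xin); /* copy x and insert the padding zeros */
      MatMult(A, xin, xout);            /* forward DFT on the FFTW layout */
      return 0;
    }
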

728: PetscErrorCode VecScatterPetscToFFTW_FFTW(Mat A, Vec x, Vec y)
729: {
730:   MPI_Comm    comm;
731:   Mat_FFT    *fft = (Mat_FFT *)A->data;
732:   PetscInt    low;
733:   PetscMPIInt rank, size;
734:   PetscInt    vsize, vsize1;
735:   VecScatter  vecscat;
736:   IS          list1;

738:   PetscObjectGetComm((PetscObject)A, &comm);
739:   MPI_Comm_size(comm, &size);
740:   MPI_Comm_rank(comm, &rank);
741:   VecGetOwnershipRange(y, &low, NULL);

743:   if (size == 1) {
744:     VecGetSize(x, &vsize);
745:     VecGetSize(y, &vsize1);
746:     ISCreateStride(PETSC_COMM_SELF, fft->N, 0, 1, &list1);
747:     VecScatterCreate(x, list1, y, list1, &vecscat);
748:     VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD);
749:     VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD);
750:     VecScatterDestroy(&vecscat);
751:     ISDestroy(&list1);
752: #if !PetscDefined(HAVE_MPIUNI)
753:   } else {
754:     Mat_FFTW *fftw = (Mat_FFTW *)fft->data;
755:     PetscInt  ndim = fft->ndim, *dim = fft->dim;
756:     ptrdiff_t local_n0, local_0_start;
757:     ptrdiff_t local_n1, local_1_start;
758:     IS        list2;
759:   #if !defined(PETSC_USE_COMPLEX)
760:     PetscInt  i, j, k, partial_dim;
761:     PetscInt *indx1, *indx2, tempindx, tempindx1;
762:     PetscInt  NM;
763:     ptrdiff_t temp;
764:   #endif

766:     switch (ndim) {
767:     case 1:
768:   #if defined(PETSC_USE_COMPLEX)
769:       fftw_mpi_local_size_1d(dim[0], comm, FFTW_FORWARD, FFTW_ESTIMATE, &local_n0, &local_0_start, &local_n1, &local_1_start);

771:       ISCreateStride(comm, local_n0, local_0_start, 1, &list1);
772:       ISCreateStride(comm, local_n0, low, 1, &list2);
773:       VecScatterCreate(x, list1, y, list2, &vecscat);
774:       VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD);
775:       VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD);
776:       VecScatterDestroy(&vecscat);
777:       ISDestroy(&list1);
778:       ISDestroy(&list2);
779:   #else
780:       SETERRQ(comm, PETSC_ERR_SUP, "FFTW does not support parallel 1D real transform");
781:   #endif
782:       break;
783:     case 2:
784:   #if defined(PETSC_USE_COMPLEX)
785:       fftw_mpi_local_size_2d(dim[0], dim[1], comm, &local_n0, &local_0_start);

787:       ISCreateStride(comm, local_n0 * dim[1], local_0_start * dim[1], 1, &list1);
788:       ISCreateStride(comm, local_n0 * dim[1], low, 1, &list2);
789:       VecScatterCreate(x, list1, y, list2, &vecscat);
790:       VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD);
791:       VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD);
792:       VecScatterDestroy(&vecscat);
793:       ISDestroy(&list1);
794:       ISDestroy(&list2);
795:   #else
796:       fftw_mpi_local_size_2d_transposed(dim[0], dim[1] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);

798:       PetscMalloc1(((PetscInt)local_n0) * dim[1], &indx1);
799:       PetscMalloc1(((PetscInt)local_n0) * dim[1], &indx2);

801:       if (dim[1] % 2 == 0) {
802:         NM = dim[1] + 2;
803:       } else {
804:         NM = dim[1] + 1;
805:       }
806:       for (i = 0; i < local_n0; i++) {
807:         for (j = 0; j < dim[1]; j++) {
808:           tempindx  = i * dim[1] + j;
809:           tempindx1 = i * NM + j;

811:           indx1[tempindx] = local_0_start * dim[1] + tempindx;
812:           indx2[tempindx] = low + tempindx1;
813:         }
814:       }

816:       ISCreateGeneral(comm, local_n0 * dim[1], indx1, PETSC_COPY_VALUES, &list1);
817:       ISCreateGeneral(comm, local_n0 * dim[1], indx2, PETSC_COPY_VALUES, &list2);

819:       VecScatterCreate(x, list1, y, list2, &vecscat);
820:       VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD);
821:       VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD);
822:       VecScatterDestroy(&vecscat);
823:       ISDestroy(&list1);
824:       ISDestroy(&list2);
825:       PetscFree(indx1);
826:       PetscFree(indx2);
827:   #endif
828:       break;

830:     case 3:
831:   #if defined(PETSC_USE_COMPLEX)
832:       fftw_mpi_local_size_3d(dim[0], dim[1], dim[2], comm, &local_n0, &local_0_start);

834:       ISCreateStride(comm, local_n0 * dim[1] * dim[2], local_0_start * dim[1] * dim[2], 1, &list1);
835:       ISCreateStride(comm, local_n0 * dim[1] * dim[2], low, 1, &list2);
836:       VecScatterCreate(x, list1, y, list2, &vecscat);
837:       VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD);
838:       VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD);
839:       VecScatterDestroy(&vecscat);
840:       ISDestroy(&list1);
841:       ISDestroy(&list2);
842:   #else
843:       /* buggy, needs to be fixed. See src/mat/tests/ex158.c */
844:       SETERRQ(comm, PETSC_ERR_SUP, "FFTW does not support parallel 3D real transform");
845:       fftw_mpi_local_size_3d_transposed(dim[0], dim[1], dim[2] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);

847:       PetscMalloc1(((PetscInt)local_n0) * dim[1] * dim[2], &indx1);
848:       PetscMalloc1(((PetscInt)local_n0) * dim[1] * dim[2], &indx2);

850:       if (dim[2] % 2 == 0) NM = dim[2] + 2;
851:       else NM = dim[2] + 1;

853:       for (i = 0; i < local_n0; i++) {
854:         for (j = 0; j < dim[1]; j++) {
855:           for (k = 0; k < dim[2]; k++) {
856:             tempindx  = i * dim[1] * dim[2] + j * dim[2] + k;
857:             tempindx1 = i * dim[1] * NM + j * NM + k;

859:             indx1[tempindx] = local_0_start * dim[1] * dim[2] + tempindx;
860:             indx2[tempindx] = low + tempindx1;
861:           }
862:         }
863:       }

865:       ISCreateGeneral(comm, local_n0 * dim[1] * dim[2], indx1, PETSC_COPY_VALUES, &list1);
866:       ISCreateGeneral(comm, local_n0 * dim[1] * dim[2], indx2, PETSC_COPY_VALUES, &list2);
867:       VecScatterCreate(x, list1, y, list2, &vecscat);
868:       VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD);
869:       VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD);
870:       VecScatterDestroy(&vecscat);
871:       ISDestroy(&list1);
872:       ISDestroy(&list2);
873:       PetscFree(indx1);
874:       PetscFree(indx2);
875:   #endif
876:       break;

878:     default:
879:   #if defined(PETSC_USE_COMPLEX)
880:       fftw_mpi_local_size(fftw->ndim_fftw, fftw->dim_fftw, comm, &local_n0, &local_0_start);

882:       ISCreateStride(comm, local_n0 * (fftw->partial_dim), local_0_start * (fftw->partial_dim), 1, &list1);
883:       ISCreateStride(comm, local_n0 * (fftw->partial_dim), low, 1, &list2);
884:       VecScatterCreate(x, list1, y, list2, &vecscat);
885:       VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD);
886:       VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD);
887:       VecScatterDestroy(&vecscat);
888:       ISDestroy(&list1);
889:       ISDestroy(&list2);
890:   #else
891:       /* buggy, needs to be fixed. See src/mat/tests/ex158.c */
892:       SETERRQ(comm, PETSC_ERR_SUP, "FFTW does not support parallel DIM>3 real transform");
893:       temp = (fftw->dim_fftw)[fftw->ndim_fftw - 1];

895:       (fftw->dim_fftw)[fftw->ndim_fftw - 1] = temp / 2 + 1;

897:       fftw_mpi_local_size_transposed(fftw->ndim_fftw, fftw->dim_fftw, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);

899:       (fftw->dim_fftw)[fftw->ndim_fftw - 1] = temp;

901:       partial_dim = fftw->partial_dim;

903:       PetscMalloc1(((PetscInt)local_n0) * partial_dim, &indx1);
904:       PetscMalloc1(((PetscInt)local_n0) * partial_dim, &indx2);

906:       if (dim[ndim - 1] % 2 == 0) NM = 2;
907:       else NM = 1;

909:       j = low;
910:       for (i = 0, k = 1; i < ((PetscInt)local_n0) * partial_dim; i++, k++) {
911:         indx1[i] = local_0_start * partial_dim + i;
912:         indx2[i] = j;
913:         if (k % dim[ndim - 1] == 0) j += NM;
914:         j++;
915:       }
916:       ISCreateGeneral(comm, local_n0 * partial_dim, indx1, PETSC_COPY_VALUES, &list1);
917:       ISCreateGeneral(comm, local_n0 * partial_dim, indx2, PETSC_COPY_VALUES, &list2);
918:       VecScatterCreate(x, list1, y, list2, &vecscat);
919:       VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD);
920:       VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD);
921:       VecScatterDestroy(&vecscat);
922:       ISDestroy(&list1);
923:       ISDestroy(&list2);
924:       PetscFree(indx1);
925:       PetscFree(indx2);
926:   #endif
927:       break;
928:     }
929: #endif
930:   }
931:   return 0;
932: }

934: /*@
 935:    VecScatterFFTWToPetsc - Converts a `MATFFTW` output vector to a PETSc vector.

937:    Collective

939:     Input Parameters:
940: +   A - `MATFFTW` matrix
941: -   x - FFTW vector

 943:    Output Parameter:
944: .  y - PETSc vector

946:    Level: intermediate

948:    Note:
 949:    For real transforms, the FFTW output of the backward DFT contains extra zeros at the end of the last dimension.
 950:    `VecScatterFFTWToPetsc()` removes those extra zeros.

952: .seealso: `VecScatterPetscToFFTW()`, `MATFFTW`, `MatCreateVecsFFTW()`
953: @*/
954: PetscErrorCode VecScatterFFTWToPetsc(Mat A, Vec x, Vec y)
955: {
956:   PetscUseMethod(A, "VecScatterFFTWToPetsc_C", (Mat, Vec, Vec), (A, x, y));
957:   return 0;
958: }
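
Combining the two scatter routines, a hedged sketch of the complete parallel round trip (a hypothetical helper; xin, xout, and xback come from MatCreateVecsFFTW(), y has the layout of x, and N is the total transform size):

    static PetscErrorCode FFTRoundTrip(Mat A, Vec x, Vec xin, Vec xout, Vec xback, Vec y, PetscInt N)
    {
      VecScatterPetscToFFTW(A, x, xin);   /* PETSc layout -> padded FFTW layout */
      MatMult(A, xin, xout);              /* forward DFT */
      MatMultTranspose(A, xout, xback);   /* backward DFT; result is scaled by N */
      VecScatterFFTWToPetsc(A, xback, y); /* strip the padding zeros */
      VecScale(y, 1.0 / N);               /* recover the original x in y */
      return 0;
    }
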

960: PetscErrorCode VecScatterFFTWToPetsc_FFTW(Mat A, Vec x, Vec y)
961: {
962:   MPI_Comm    comm;
963:   Mat_FFT    *fft = (Mat_FFT *)A->data;
964:   PetscInt    low;
965:   PetscMPIInt rank, size;
966:   VecScatter  vecscat;
967:   IS          list1;

969:   PetscObjectGetComm((PetscObject)A, &comm);
970:   MPI_Comm_size(comm, &size);
971:   MPI_Comm_rank(comm, &rank);
972:   VecGetOwnershipRange(x, &low, NULL);

974:   if (size == 1) {
975:     ISCreateStride(comm, fft->N, 0, 1, &list1);
976:     VecScatterCreate(x, list1, y, list1, &vecscat);
977:     VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD);
978:     VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD);
979:     VecScatterDestroy(&vecscat);
980:     ISDestroy(&list1);

982: #if !PetscDefined(HAVE_MPIUNI)
983:   } else {
984:     Mat_FFTW *fftw = (Mat_FFTW *)fft->data;
985:     PetscInt  ndim = fft->ndim, *dim = fft->dim;
986:     ptrdiff_t local_n0, local_0_start;
987:     ptrdiff_t local_n1, local_1_start;
988:     IS        list2;
989:   #if !defined(PETSC_USE_COMPLEX)
990:     PetscInt  i, j, k, partial_dim;
991:     PetscInt *indx1, *indx2, tempindx, tempindx1;
992:     PetscInt  NM;
993:     ptrdiff_t temp;
994:   #endif
995:     switch (ndim) {
996:     case 1:
997:   #if defined(PETSC_USE_COMPLEX)
998:       fftw_mpi_local_size_1d(dim[0], comm, FFTW_BACKWARD, FFTW_ESTIMATE, &local_n0, &local_0_start, &local_n1, &local_1_start);

1000:       ISCreateStride(comm, local_n1, local_1_start, 1, &list1);
1001:       ISCreateStride(comm, local_n1, low, 1, &list2);
1002:       VecScatterCreate(x, list1, y, list2, &vecscat);
1003:       VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD);
1004:       VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD);
1005:       VecScatterDestroy(&vecscat);
1006:       ISDestroy(&list1);
1007:       ISDestroy(&list2);
1008:   #else
1009:       SETERRQ(comm, PETSC_ERR_SUP, "No support for real parallel 1D FFT");
1010:   #endif
1011:       break;
1012:     case 2:
1013:   #if defined(PETSC_USE_COMPLEX)
1014:       fftw_mpi_local_size_2d(dim[0], dim[1], comm, &local_n0, &local_0_start);

1016:       ISCreateStride(comm, local_n0 * dim[1], local_0_start * dim[1], 1, &list1);
1017:       ISCreateStride(comm, local_n0 * dim[1], low, 1, &list2);
1018:       VecScatterCreate(x, list2, y, list1, &vecscat);
1019:       VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD);
1020:       VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD);
1021:       VecScatterDestroy(&vecscat);
1022:       ISDestroy(&list1);
1023:       ISDestroy(&list2);
1024:   #else
1025:       fftw_mpi_local_size_2d_transposed(dim[0], dim[1] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);

1027:       PetscMalloc1(((PetscInt)local_n0) * dim[1], &indx1);
1028:       PetscMalloc1(((PetscInt)local_n0) * dim[1], &indx2);

1030:       if (dim[1] % 2 == 0) NM = dim[1] + 2;
1031:       else NM = dim[1] + 1;

1033:       for (i = 0; i < local_n0; i++) {
1034:         for (j = 0; j < dim[1]; j++) {
1035:           tempindx  = i * dim[1] + j;
1036:           tempindx1 = i * NM + j;

1038:           indx1[tempindx] = local_0_start * dim[1] + tempindx;
1039:           indx2[tempindx] = low + tempindx1;
1040:         }
1041:       }

1043:       ISCreateGeneral(comm, local_n0 * dim[1], indx1, PETSC_COPY_VALUES, &list1);
1044:       ISCreateGeneral(comm, local_n0 * dim[1], indx2, PETSC_COPY_VALUES, &list2);

1046:       VecScatterCreate(x, list2, y, list1, &vecscat);
1047:       VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD);
1048:       VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD);
1049:       VecScatterDestroy(&vecscat);
1050:       ISDestroy(&list1);
1051:       ISDestroy(&list2);
1052:       PetscFree(indx1);
1053:       PetscFree(indx2);
1054:   #endif
1055:       break;
1056:     case 3:
1057:   #if defined(PETSC_USE_COMPLEX)
1058:       fftw_mpi_local_size_3d(dim[0], dim[1], dim[2], comm, &local_n0, &local_0_start);

1060:       ISCreateStride(comm, local_n0 * dim[1] * dim[2], local_0_start * dim[1] * dim[2], 1, &list1);
1061:       ISCreateStride(comm, local_n0 * dim[1] * dim[2], low, 1, &list2);
1062:       VecScatterCreate(x, list1, y, list2, &vecscat);
1063:       VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD);
1064:       VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD);
1065:       VecScatterDestroy(&vecscat);
1066:       ISDestroy(&list1);
1067:       ISDestroy(&list2);
1068:   #else
1069:       fftw_mpi_local_size_3d_transposed(dim[0], dim[1], dim[2] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);

1071:       PetscMalloc1(((PetscInt)local_n0) * dim[1] * dim[2], &indx1);
1072:       PetscMalloc1(((PetscInt)local_n0) * dim[1] * dim[2], &indx2);

1074:       if (dim[2] % 2 == 0) NM = dim[2] + 2;
1075:       else NM = dim[2] + 1;

1077:       for (i = 0; i < local_n0; i++) {
1078:         for (j = 0; j < dim[1]; j++) {
1079:           for (k = 0; k < dim[2]; k++) {
1080:             tempindx  = i * dim[1] * dim[2] + j * dim[2] + k;
1081:             tempindx1 = i * dim[1] * NM + j * NM + k;

1083:             indx1[tempindx] = local_0_start * dim[1] * dim[2] + tempindx;
1084:             indx2[tempindx] = low + tempindx1;
1085:           }
1086:         }
1087:       }

1089:       ISCreateGeneral(comm, local_n0 * dim[1] * dim[2], indx1, PETSC_COPY_VALUES, &list1);
1090:       ISCreateGeneral(comm, local_n0 * dim[1] * dim[2], indx2, PETSC_COPY_VALUES, &list2);

1092:       VecScatterCreate(x, list2, y, list1, &vecscat);
1093:       VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD);
1094:       VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD);
1095:       VecScatterDestroy(&vecscat);
1096:       ISDestroy(&list1);
1097:       ISDestroy(&list2);
1098:       PetscFree(indx1);
1099:       PetscFree(indx2);
1100:   #endif
1101:       break;
1102:     default:
1103:   #if defined(PETSC_USE_COMPLEX)
1104:       fftw_mpi_local_size(fftw->ndim_fftw, fftw->dim_fftw, comm, &local_n0, &local_0_start);

1106:       ISCreateStride(comm, local_n0 * (fftw->partial_dim), local_0_start * (fftw->partial_dim), 1, &list1);
1107:       ISCreateStride(comm, local_n0 * (fftw->partial_dim), low, 1, &list2);
1108:       VecScatterCreate(x, list1, y, list2, &vecscat);
1109:       VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD);
1110:       VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD);
1111:       VecScatterDestroy(&vecscat);
1112:       ISDestroy(&list1);
1113:       ISDestroy(&list2);
1114:   #else
1115:       temp = (fftw->dim_fftw)[fftw->ndim_fftw - 1];

1117:       (fftw->dim_fftw)[fftw->ndim_fftw - 1] = temp / 2 + 1;

1119:       fftw_mpi_local_size_transposed(fftw->ndim_fftw, fftw->dim_fftw, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);

1121:       (fftw->dim_fftw)[fftw->ndim_fftw - 1] = temp;

1123:       partial_dim = fftw->partial_dim;

1125:       PetscMalloc1(((PetscInt)local_n0) * partial_dim, &indx1);
1126:       PetscMalloc1(((PetscInt)local_n0) * partial_dim, &indx2);

1128:       if (dim[ndim - 1] % 2 == 0) NM = 2;
1129:       else NM = 1;

1131:       j = low;
1132:       for (i = 0, k = 1; i < ((PetscInt)local_n0) * partial_dim; i++, k++) {
1133:         indx1[i] = local_0_start * partial_dim + i;
1134:         indx2[i] = j;
1135:         if (k % dim[ndim - 1] == 0) j += NM;
1136:         j++;
1137:       }
1138:       ISCreateGeneral(comm, local_n0 * partial_dim, indx1, PETSC_COPY_VALUES, &list1);
1139:       ISCreateGeneral(comm, local_n0 * partial_dim, indx2, PETSC_COPY_VALUES, &list2);

1141:       VecScatterCreate(x, list2, y, list1, &vecscat);
1142:       VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD);
1143:       VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD);
1144:       VecScatterDestroy(&vecscat);
1145:       ISDestroy(&list1);
1146:       ISDestroy(&list2);
1147:       PetscFree(indx1);
1148:       PetscFree(indx2);
1149:   #endif
1150:       break;
1151:     }
1152: #endif
1153:   }
1154:   return 0;
1155: }

1157: /*
1158:     MatCreate_FFTW - Creates a matrix object that provides FFT via the external package FFTW

1160:   Options Database Key:
1161: . -mat_fftw_plannerflags - set the FFTW planner flag; one of FFTW_ESTIMATE, FFTW_MEASURE, FFTW_PATIENT, FFTW_EXHAUSTIVE

1163:    Level: intermediate

1165: */
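
For example, the planner flag can be selected programmatically before the matrix is created (a sketch; the same choice can be made on the command line with -mat_fftw_plannerflags FFTW_MEASURE):

    PetscOptionsSetValue(NULL, "-mat_fftw_plannerflags", "FFTW_MEASURE"); /* consulted when MatCreate_FFTW() reads its options */
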
1166: PETSC_EXTERN PetscErrorCode MatCreate_FFTW(Mat A)
1167: {
1168:   MPI_Comm    comm;
1169:   Mat_FFT    *fft = (Mat_FFT *)A->data;
1170:   Mat_FFTW   *fftw;
1171:   PetscInt    ndim = fft->ndim, *dim = fft->dim;
1172:   const char *plans[]  = {"FFTW_ESTIMATE", "FFTW_MEASURE", "FFTW_PATIENT", "FFTW_EXHAUSTIVE"};
1173:   unsigned    iplans[] = {FFTW_ESTIMATE, FFTW_MEASURE, FFTW_PATIENT, FFTW_EXHAUSTIVE};
1174:   PetscBool   flg;
1175:   PetscInt    p_flag, partial_dim = 1, ctr;
1176:   PetscMPIInt size, rank;
1177:   ptrdiff_t  *pdim;
1178: #if !defined(PETSC_USE_COMPLEX)
1179:   PetscInt tot_dim;
1180: #endif

1182:   PetscObjectGetComm((PetscObject)A, &comm);
1183:   MPI_Comm_size(comm, &size);
1184:   MPI_Comm_rank(comm, &rank);

1186: #if !PetscDefined(HAVE_MPIUNI)
1187:   fftw_mpi_init();
1188: #endif
1189:   pdim    = (ptrdiff_t *)calloc(ndim, sizeof(ptrdiff_t));
1190:   pdim[0] = dim[0];
1191: #if !defined(PETSC_USE_COMPLEX)
1192:   tot_dim = 2 * dim[0];
1193: #endif
1194:   for (ctr = 1; ctr < ndim; ctr++) {
1195:     partial_dim *= dim[ctr];
1196:     pdim[ctr] = dim[ctr];
1197: #if !defined(PETSC_USE_COMPLEX)
1198:     if (ctr == ndim - 1) tot_dim *= (dim[ctr] / 2 + 1);
1199:     else tot_dim *= dim[ctr];
1200: #endif
1201:   }

1203:   if (size == 1) {
1204: #if defined(PETSC_USE_COMPLEX)
1205:     MatSetSizes(A, fft->N, fft->N, fft->N, fft->N);
1206:     fft->n = fft->N;
1207: #else
1208:     MatSetSizes(A, tot_dim, tot_dim, tot_dim, tot_dim);
1209:     fft->n       = tot_dim;
1210: #endif
1211: #if !PetscDefined(HAVE_MPIUNI)
1212:   } else {
1213:     ptrdiff_t local_n0, local_0_start, local_n1, local_1_start;
1214:   #if !defined(PETSC_USE_COMPLEX)
1215:     ptrdiff_t temp;
1216:     PetscInt  N1;
1217:   #endif

1219:     switch (ndim) {
1220:     case 1:
1221:   #if !defined(PETSC_USE_COMPLEX)
1222:       SETERRQ(comm, PETSC_ERR_SUP, "FFTW does not support parallel 1D real transform");
1223:   #else
1224:       fftw_mpi_local_size_1d(dim[0], comm, FFTW_FORWARD, FFTW_ESTIMATE, &local_n0, &local_0_start, &local_n1, &local_1_start);
1225:       fft->n = (PetscInt)local_n0;
1226:       MatSetSizes(A, local_n1, fft->n, fft->N, fft->N);
1227:   #endif
1228:       break;
1229:     case 2:
1230:   #if defined(PETSC_USE_COMPLEX)
1231:       fftw_mpi_local_size_2d(dim[0], dim[1], comm, &local_n0, &local_0_start);
1232:       fft->n = (PetscInt)local_n0 * dim[1];
1233:       MatSetSizes(A, fft->n, fft->n, fft->N, fft->N);
1234:   #else
1235:       fftw_mpi_local_size_2d_transposed(dim[0], dim[1] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);

1237:       fft->n = 2 * (PetscInt)local_n0 * (dim[1] / 2 + 1);
1238:       MatSetSizes(A, fft->n, fft->n, 2 * dim[0] * (dim[1] / 2 + 1), 2 * dim[0] * (dim[1] / 2 + 1));
1239:   #endif
1240:       break;
1241:     case 3:
1242:   #if defined(PETSC_USE_COMPLEX)
1243:       fftw_mpi_local_size_3d(dim[0], dim[1], dim[2], comm, &local_n0, &local_0_start);

1245:       fft->n = (PetscInt)local_n0 * dim[1] * dim[2];
1246:       MatSetSizes(A, fft->n, fft->n, fft->N, fft->N);
1247:   #else
1248:       fftw_mpi_local_size_3d_transposed(dim[0], dim[1], dim[2] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);

1250:       fft->n = 2 * (PetscInt)local_n0 * dim[1] * (dim[2] / 2 + 1);
1251:       MatSetSizes(A, fft->n, fft->n, 2 * dim[0] * dim[1] * (dim[2] / 2 + 1), 2 * dim[0] * dim[1] * (dim[2] / 2 + 1));
1252:   #endif
1253:       break;
1254:     default:
1255:   #if defined(PETSC_USE_COMPLEX)
1256:       fftw_mpi_local_size(ndim, pdim, comm, &local_n0, &local_0_start);

1258:       fft->n = (PetscInt)local_n0 * partial_dim;
1259:       MatSetSizes(A, fft->n, fft->n, fft->N, fft->N);
1260:   #else
1261:       temp = pdim[ndim - 1];

1263:       pdim[ndim - 1] = temp / 2 + 1;

1265:       fftw_mpi_local_size_transposed(ndim, pdim, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);

1267:       fft->n = 2 * (PetscInt)local_n0 * partial_dim * pdim[ndim - 1] / temp;
1268:       N1     = 2 * fft->N * (PetscInt)pdim[ndim - 1] / ((PetscInt)temp);

1270:       pdim[ndim - 1] = temp;

1272:       MatSetSizes(A, fft->n, fft->n, N1, N1);
1273:   #endif
1274:       break;
1275:     }
1276: #endif
1277:   }
1278:   free(pdim);
1279:   PetscObjectChangeTypeName((PetscObject)A, MATFFTW);
1280:   PetscNew(&fftw);
1281:   fft->data = (void *)fftw;

1283:   fftw->ndim_fftw   = (ptrdiff_t)ndim; /* the dimension of the FFT */
1284:   fftw->partial_dim = partial_dim;

1286:   PetscMalloc1(ndim, &(fftw->dim_fftw));
1287:   if (size == 1) {
1288: #if defined(PETSC_USE_64BIT_INDICES)
1289:     fftw->iodims = (fftw_iodim64 *)malloc(sizeof(fftw_iodim64) * ndim);
1290: #else
1291:     fftw->iodims = (fftw_iodim *)malloc(sizeof(fftw_iodim) * ndim);
1292: #endif
1293:   }

1295:   for (ctr = 0; ctr < ndim; ctr++) (fftw->dim_fftw)[ctr] = dim[ctr];

1297:   fftw->p_forward  = NULL;
1298:   fftw->p_backward = NULL;
1299:   fftw->p_flag     = FFTW_ESTIMATE;

1301:   if (size == 1) {
1302:     A->ops->mult          = MatMult_SeqFFTW;
1303:     A->ops->multtranspose = MatMultTranspose_SeqFFTW;
1304: #if !PetscDefined(HAVE_MPIUNI)
1305:   } else {
1306:     A->ops->mult          = MatMult_MPIFFTW;
1307:     A->ops->multtranspose = MatMultTranspose_MPIFFTW;
1308: #endif
1309:   }
1310:   fft->matdestroy = MatDestroy_FFTW;
1311:   A->assembled    = PETSC_TRUE;
1312:   A->preallocated = PETSC_TRUE;

1314:   PetscObjectComposeFunction((PetscObject)A, "MatCreateVecsFFTW_C", MatCreateVecsFFTW_FFTW);
1315:   PetscObjectComposeFunction((PetscObject)A, "VecScatterPetscToFFTW_C", VecScatterPetscToFFTW_FFTW);
1316:   PetscObjectComposeFunction((PetscObject)A, "VecScatterFFTWToPetsc_C", VecScatterFFTWToPetsc_FFTW);

1318:   /* get runtime options */
1319:   PetscOptionsBegin(PetscObjectComm((PetscObject)A), ((PetscObject)A)->prefix, "FFTW Options", "Mat");
1320:   PetscOptionsEList("-mat_fftw_plannerflags", "Planner Flags", "None", plans, 4, plans[0], &p_flag, &flg);
1321:   if (flg) fftw->p_flag = iplans[p_flag];
1322:   PetscOptionsEnd();
1323:   return 0;
1324: }