Actual source code: bench_pcsetup.c
1: /*
2: Poisson in 3D. Modeled by the PDE:
4: - delta u(x,y,z) = f(x,y,z)
6: With exact solution:
8: u(x,y,z) = 1.0
10: Exampe usage:
12: Run on GPU (requires respective backends installed):
13: ./bench_pcsetup -vec_type cuda -mat_type aijcusparse
14: ./bench_pcsetup -vec_type hip -mat_type aijhipsparse
15: ./bench_pcsetup -vec_type kokkos -mat_type aijkokkos
17: Test only MatMult:
18: ./bench_pcsetup -matmult
20: Test MatMult over 1000 iterations:
21: ./bench_pcsetup -matmult -its 1000
23: Change size of problem (e.g., use a 128x128x128 grid):
24: ./bench_pcsetup -n 128
25: */
26: static char help[] = "Solves 3D Laplacian with 27-point finite difference stencil.\n";
28: #include <petscksp.h>
30: typedef struct {
31: PetscMPIInt rank, size;
32: PetscInt n; /* global size in each dimension */
33: PetscInt dim; /* global size */
34: PetscInt nnz; /* local nonzeros */
35: PetscBool matmult; /* Do MatMult() only */
36: PetscBool debug; /* Debug flag for PreallocateCOO() */
37: PetscInt its; /* No of matmult_iterations */
38: } AppCtx;
40: static PetscErrorCode PreallocateCOO(Mat A, void *ctx)
41: {
42: AppCtx *user = (AppCtx *)ctx;
43: PetscInt Istart, Iend, n = user->n, n2 = n * n, n1 = n - 1;
44: PetscInt xstart, ystart, zstart, xend, yend, zend, nm2 = n - 2, idx;
45: PetscInt nnz[] = {8, 12, 18, 27}; /* nnz for corner, edge, face, and center. */
48: MatGetOwnershipRange(A, &Istart, &Iend);
49: xstart = Istart % n;
50: ystart = (Istart / n) % n;
51: zstart = Istart / n2;
52: xend = (Iend - 1) % n;
53: yend = ((Iend - 1) / n) % n;
54: zend = (Iend - 1) / n2;
56: /* bottom xy-plane */
57: user->nnz = 0;
58: idx = !zstart ? 0 : 1;
59: if (zstart == zend && (!zstart || zstart == n1)) idx = 0;
60: if (zstart == zend && (zstart && zstart < n1)) idx = 1;
61: if (!xstart && !ystart) // bottom left
62: user->nnz += 4 * nnz[idx] + 4 * nm2 * nnz[idx + 1] + nm2 * nm2 * nnz[idx + 2];
63: else if (xstart && xstart < n1 && !ystart) // bottom
64: user->nnz += 3 * nnz[idx] + (3 * nm2 + n1 - xstart) * nnz[idx + 1] + nm2 * nm2 * nnz[idx + 2];
65: else if (xstart == n1 && !ystart) // bottom right
66: user->nnz += 3 * nnz[idx] + (3 * nm2) * nnz[idx + 1] + nm2 * nm2 * nnz[idx + 2];
67: else if (!xstart && ystart && ystart < n1) // left
68: user->nnz += 2 * nnz[idx] + (nm2 + 2 * (n1 - ystart)) * nnz[idx + 1] + nm2 * (n1 - ystart) * nnz[idx + 2];
69: else if (xstart && xstart < n1 && ystart && ystart < n1) // center
70: user->nnz += 2 * nnz[idx] + (nm2 + (n1 - ystart) + (nm2 - ystart)) * nnz[idx + 1] + (nm2 * (nm2 - ystart) + (n1 - xstart)) * nnz[idx + 2];
71: else if (xstart == n1 && ystart && ystart < n1) // right
72: user->nnz += 2 * nnz[idx] + (nm2 + n1 - ystart + nm2 - ystart) * nnz[idx + 1] + nm2 * (nm2 - ystart) * nnz[idx + 2];
73: else if (ystart == n1 && !xstart) // top left
74: user->nnz += 2 * nnz[idx] + nm2 * nnz[idx + 1];
75: else if (ystart == n1 && xstart && xstart < n1) // top
76: user->nnz += nnz[idx] + (n1 - xstart) * nnz[idx + 1];
77: else if (xstart == n1 && ystart == n1) // top right
78: user->nnz += nnz[idx];
80: /* top and middle plane the same */
81: if (zend == zstart) user->nnz = user->nnz - 4 * nnz[idx] - 4 * nm2 * nnz[idx + 1] - nm2 * nm2 * nnz[idx + 2];
83: /* middle xy-planes */
84: if (zend - zstart > 1) user->nnz = user->nnz + (zend - zstart - 1) * (4 * nnz[1] + 4 * nnz[2] * nm2 + nnz[3] * nm2 * nm2);
86: /* top xy-plane */
87: idx = zend == n1 ? 0 : 1;
88: if (zstart == zend && (!zend || zend == n1)) idx = 0;
89: if (zstart == zend && (zend && zend < n1)) idx = 1;
90: if (!xend && !yend) // bottom left
91: user->nnz += nnz[idx];
92: else if (xend && xend < n1 && !yend) // bottom
93: user->nnz += nnz[idx] + xend * nnz[idx + 1];
94: else if (xend == n1 && !yend) // bottom right
95: user->nnz += 2 * nnz[idx] + nm2 * nnz[idx + 1];
96: else if (!xend && yend && yend < n1) // left
97: user->nnz += 2 * nnz[idx] + (nm2 + yend + yend - 1) * nnz[idx + 1] + nm2 * (yend - 1) * nnz[idx + 2];
98: else if (xend && xend < n1 && yend && yend < n1) // center
99: user->nnz += 2 * nnz[idx] + (nm2 + yend + yend - 1) * nnz[idx + 1] + (nm2 * (yend - 1) + xend) * nnz[idx + 2];
100: else if (xend == n1 && yend && yend < n1) // right
101: user->nnz += 2 * nnz[idx] + (nm2 + 2 * yend) * nnz[idx + 1] + (nm2 * yend) * nnz[idx + 2];
102: else if (!xend && yend == n1) // top left
103: user->nnz += 3 * nnz[idx] + (3 * nm2) * nnz[idx + 1] + (nm2 * nm2) * nnz[idx + 2];
104: else if (xend && xend < n1 && yend == n1) // top
105: user->nnz += 3 * nnz[idx] + (3 * nm2 + xend) * nnz[idx + 1] + (nm2 * nm2) * nnz[idx + 2];
106: else if (xend == n1 && yend == n1) // top right
107: user->nnz += 4 * nnz[idx] + (4 * nm2) * nnz[idx + 1] + (nm2 * nm2) * nnz[idx + 2];
108: if (user->debug)
109: PetscCall(PetscPrintf(PETSC_COMM_SELF, "rank %d: xyzstart = %" PetscInt_FMT ",%" PetscInt_FMT ",%" PetscInt_FMT ", xvzend = %" PetscInt_FMT ",%" PetscInt_FMT ",%" PetscInt_FMT ", nnz = %" PetscInt_FMT "\n", user->rank, xstart, ystart, zstart, xend, yend, zend,
110: user->nnz));
111: return 0;
112: }
114: PetscErrorCode FillCOO(Mat A, void *ctx)
115: {
116: AppCtx *user = (AppCtx *)ctx;
117: PetscInt Ii, Istart, Iend, x, y, z, n = user->n, n2 = n * n, n1 = n - 1;
118: PetscInt count = 0;
119: PetscInt *coo_i, *coo_j;
120: PetscScalar *coo_v;
121: PetscScalar h = 1.0 / (n - 1);
122: PetscScalar vcorn = -1.0 / 13 * h; //-1.0/12.0*h;
123: PetscScalar vedge = -3.0 / 26 * h; //-1.0/6.0*h;
124: PetscScalar vface = -3.0 / 13 * h; //-1.0e-9*h;
125: PetscScalar vcent = 44.0 / 13 * h; //8.0/3.0*h;
128: PetscCalloc3(user->nnz, &coo_i, user->nnz, &coo_j, user->nnz, &coo_v);
129: MatGetOwnershipRange(A, &Istart, &Iend);
130: /* Fill COO arrays */
131: for (Ii = Istart; Ii < Iend; Ii++) {
132: x = Ii % n;
133: y = (Ii / n) % n;
134: z = Ii / n2;
135: if (x > 0 && y > 0 && z > 0) {
136: coo_i[count] = Ii;
137: coo_j[count] = Ii - 1 - n - n2;
138: coo_v[count] = vcorn;
139: count++;
140: }
141: if (x > 0 && y < n1 && z > 0) {
142: coo_i[count] = Ii;
143: coo_j[count] = Ii - 1 + n - n2;
144: coo_v[count] = vcorn;
145: count++;
146: }
147: if (x < n1 && y > 0 && z > 0) {
148: coo_i[count] = Ii;
149: coo_j[count] = Ii + 1 - n - n2;
150: coo_v[count] = vcorn;
151: count++;
152: }
153: if (x < n1 && y < n1 && z > 0) {
154: coo_i[count] = Ii;
155: coo_j[count] = Ii + 1 + n - n2;
156: coo_v[count] = vcorn;
157: count++;
158: }
159: if (x > 0 && y > 0 && z < n1) {
160: coo_i[count] = Ii;
161: coo_j[count] = Ii - 1 - n + n2;
162: coo_v[count] = vcorn;
163: count++;
164: }
165: if (x > 0 && y < n1 && z < n1) {
166: coo_i[count] = Ii;
167: coo_j[count] = Ii - 1 + n + n2;
168: coo_v[count] = vcorn;
169: count++;
170: }
171: if (x < n1 && y > 0 && z < n1) {
172: coo_i[count] = Ii;
173: coo_j[count] = Ii + 1 - n + n2;
174: coo_v[count] = vcorn;
175: count++;
176: }
177: if (x < n1 && y < n1 && z < n1) {
178: coo_i[count] = Ii;
179: coo_j[count] = Ii + 1 + n + n2;
180: coo_v[count] = vcorn;
181: count++;
182: }
183: /* 12 edges */
184: if (x > 0 && y > 0) {
185: coo_i[count] = Ii;
186: coo_j[count] = Ii - 1 - n;
187: coo_v[count] = vedge;
188: count++;
189: }
190: if (x > 0 && y < n1) {
191: coo_i[count] = Ii;
192: coo_j[count] = Ii - 1 + n;
193: coo_v[count] = vedge;
194: count++;
195: }
196: if (x < n1 && y > 0) {
197: coo_i[count] = Ii;
198: coo_j[count] = Ii + 1 - n;
199: coo_v[count] = vedge;
200: count++;
201: }
202: if (x < n1 && y < n1) {
203: coo_i[count] = Ii;
204: coo_j[count] = Ii + 1 + n;
205: coo_v[count] = vedge;
206: count++;
207: }
208: if (x > 0 && z > 0) {
209: coo_i[count] = Ii;
210: coo_j[count] = Ii - 1 - n2;
211: coo_v[count] = vedge;
212: count++;
213: }
214: if (x > 0 && z < n1) {
215: coo_i[count] = Ii;
216: coo_j[count] = Ii - 1 + n2;
217: coo_v[count] = vedge;
218: count++;
219: }
220: if (x < n1 && z > 0) {
221: coo_i[count] = Ii;
222: coo_j[count] = Ii + 1 - n2;
223: coo_v[count] = vedge;
224: count++;
225: }
226: if (x < n1 && z < n1) {
227: coo_i[count] = Ii;
228: coo_j[count] = Ii + 1 + n2;
229: coo_v[count] = vedge;
230: count++;
231: }
232: if (y > 0 && z > 0) {
233: coo_i[count] = Ii;
234: coo_j[count] = Ii - n - n2;
235: coo_v[count] = vedge;
236: count++;
237: }
238: if (y > 0 && z < n1) {
239: coo_i[count] = Ii;
240: coo_j[count] = Ii - n + n2;
241: coo_v[count] = vedge;
242: count++;
243: }
244: if (y < n1 && z > 0) {
245: coo_i[count] = Ii;
246: coo_j[count] = Ii + n - n2;
247: coo_v[count] = vedge;
248: count++;
249: }
250: if (y < n1 && z < n1) {
251: coo_i[count] = Ii;
252: coo_j[count] = Ii + n + n2;
253: coo_v[count] = vedge;
254: count++;
255: }
256: /* 6 faces */
257: if (x > 0) {
258: coo_i[count] = Ii;
259: coo_j[count] = Ii - 1;
260: coo_v[count] = vface;
261: count++;
262: }
263: if (x < n1) {
264: coo_i[count] = Ii;
265: coo_j[count] = Ii + 1;
266: coo_v[count] = vface;
267: count++;
268: }
269: if (y > 0) {
270: coo_i[count] = Ii;
271: coo_j[count] = Ii - n;
272: coo_v[count] = vface;
273: count++;
274: }
275: if (y < n1) {
276: coo_i[count] = Ii;
277: coo_j[count] = Ii + n;
278: coo_v[count] = vface;
279: count++;
280: }
281: if (z > 0) {
282: coo_i[count] = Ii;
283: coo_j[count] = Ii - n2;
284: coo_v[count] = vface;
285: count++;
286: }
287: if (z < n1) {
288: coo_i[count] = Ii;
289: coo_j[count] = Ii + n2;
290: coo_v[count] = vface;
291: count++;
292: }
293: /* Center */
294: coo_i[count] = Ii;
295: coo_j[count] = Ii;
296: coo_v[count] = vcent;
297: count++;
298: }
300: MatSetPreallocationCOO(A, user->nnz, coo_i, coo_j);
301: MatSetValuesCOO(A, coo_v, INSERT_VALUES);
302: PetscFree3(coo_i, coo_j, coo_v);
303: return 0;
304: }
306: int main(int argc, char **argv)
307: {
308: AppCtx user; /* Application context */
309: Vec x, b, u; /* approx solution, RHS, and exact solution */
310: Mat A; /* linear system matrix */
311: KSP ksp; /* linear solver context */
312: PC pc; /* Preconditioner */
313: PetscReal norm; /* Error norm */
314: PetscInt ksp_its; /* Number of KSP iterations */
315: PetscInt global_nnz = 0; /* Total number of nonzeros */
316: PetscLogDouble time_start, time_mid1, time_mid2, time_end, time_avg, floprate;
317: PetscBool printTiming = PETSC_TRUE; /* If run in CI, do not print timing result */
318: PETSC_UNUSED PetscLogStage stage;
320: PetscInitialize(&argc, &argv, (char *)0, help);
321: MPI_Comm_size(PETSC_COMM_WORLD, &user.size);
322: MPI_Comm_rank(PETSC_COMM_WORLD, &user.rank);
324: user.n = 100; /* Default grid points per dimension */
325: user.matmult = PETSC_FALSE; /* Test MatMult only */
326: user.its = 100; /* Default no. of iterations for MatMult test */
327: user.debug = PETSC_FALSE; /* Debug PreallocateCOO() */
328: PetscOptionsGetInt(NULL, NULL, "-n", &user.n, NULL);
329: PetscOptionsGetInt(NULL, NULL, "-its", &user.its, NULL);
330: PetscOptionsGetBool(NULL, NULL, "-matmult", &user.matmult, NULL);
331: PetscOptionsGetBool(NULL, NULL, "-debug", &user.debug, NULL);
332: PetscOptionsGetBool(NULL, NULL, "-print_timing", &printTiming, NULL);
334: user.dim = user.n * user.n * user.n;
335: global_nnz = 64 + 27 * (user.n - 2) * (user.n - 2) * (user.n - 2) + 108 * (user.n - 2) * (user.n - 2) + 144 * (user.n - 2);
338: PetscPrintf(PETSC_COMM_WORLD, "===========================================\n");
339: if (user.matmult) PetscPrintf(PETSC_COMM_WORLD, "Test: MatMult performance - Poisson\n");
340: else PetscPrintf(PETSC_COMM_WORLD, "Test: KSP performance - Poisson\n");
341: PetscPrintf(PETSC_COMM_WORLD, "\tInput matrix: 27-pt finite difference stencil\n");
342: PetscPrintf(PETSC_COMM_WORLD, "\t-n %" PetscInt_FMT "\n", user.n);
343: if (user.matmult) PetscPrintf(PETSC_COMM_WORLD, "\t-its %" PetscInt_FMT "\n", user.its);
344: PetscPrintf(PETSC_COMM_WORLD, "\tDoFs = %" PetscInt_FMT "\n", user.dim);
345: PetscPrintf(PETSC_COMM_WORLD, "\tNumber of nonzeros = %" PetscInt_FMT "\n", global_nnz);
347: /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
348: /* Create the Vecs and Mat */
349: /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
350: PetscPrintf(PETSC_COMM_WORLD, "\nStep0 - creating Vecs and Mat...\n");
351: PetscLogStageRegister("Step0 - Vecs and Mat", &stage);
352: PetscLogStagePush(stage);
353: VecCreate(PETSC_COMM_WORLD, &u);
354: VecSetSizes(u, PETSC_DECIDE, user.dim);
355: VecSetFromOptions(u);
356: VecDuplicate(u, &b);
357: if (!user.matmult) VecDuplicate(b, &x);
358: VecSet(u, 1.0); /* Exact solution */
359: MatCreate(PETSC_COMM_WORLD, &A);
360: MatSetSizes(A, PETSC_DECIDE, PETSC_DECIDE, user.dim, user.dim);
361: MatSetFromOptions(A);
362: MatSetUp(A);
363: PreallocateCOO(A, &user); /* Determine local number of nonzeros */
364: FillCOO(A, &user); /* Fill COO Matrix */
365: MatMult(A, u, b); /* Compute RHS based on exact solution */
366: PetscLogStagePop();
368: if (user.matmult) {
369: /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
370: /* MatMult */
371: /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
372: PetscPrintf(PETSC_COMM_WORLD, "Step1 - running MatMult() %" PetscInt_FMT " times...\n", user.its);
373: PetscLogStageRegister("SpMV", &stage);
374: PetscLogStagePush(stage);
375: PetscTime(&time_start);
376: for (int i = 0; i < user.its; i++) MatMult(A, u, b);
377: PetscTime(&time_end);
378: PetscLogStagePop();
379: /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
380: /* Calculate Performance metrics */
381: /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
382: time_avg = (time_end - time_start) / ((PetscLogDouble)user.its);
383: floprate = 2 * global_nnz / time_avg * 1e-9;
384: if (printTiming) PetscPrintf(PETSC_COMM_WORLD, "\n%-15s%-7.5f seconds\n", "Average time:", time_avg);
385: if (printTiming) PetscPrintf(PETSC_COMM_WORLD, "%-15s%-9.3e Gflops/sec\n", "FOM:", floprate);
386: PetscPrintf(PETSC_COMM_WORLD, "===========================================\n");
387: } else {
388: /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
389: /* Solve the linear system of equations */
390: /* Measure only time of PCSetUp() and KSPSolve() */
391: /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
392: PetscPrintf(PETSC_COMM_WORLD, "Step1 - running PCSetUp()...\n");
393: PetscLogStageRegister("Step1 - PCSetUp", &stage);
394: PetscLogStagePush(stage);
395: KSPCreate(PETSC_COMM_WORLD, &ksp);
396: KSPSetOperators(ksp, A, A);
397: KSPSetFromOptions(ksp);
398: KSPGetPC(ksp, &pc);
399: PetscTime(&time_start);
400: PCSetUp(pc);
401: PetscTime(&time_mid1);
402: PetscLogStagePop();
403: PetscPrintf(PETSC_COMM_WORLD, "Step2 - running KSPSolve()...\n");
404: PetscLogStageRegister("Step2 - KSPSolve", &stage);
405: PetscLogStagePush(stage);
406: PetscTime(&time_mid2);
407: KSPSolve(ksp, b, x);
408: PetscTime(&time_end);
409: PetscLogStagePop();
411: /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
412: /* Calculate error norm */
413: /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
414: PetscLogStageRegister("Error norm", &stage);
415: PetscLogStagePush(stage);
416: VecAXPY(x, -1.0, u);
417: VecNorm(x, NORM_2, &norm);
418: PetscLogStagePop();
421: /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
422: /* Summary */
423: /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
424: KSPGetIterationNumber(ksp, &ksp_its);
425: if (printTiming) PetscPrintf(PETSC_COMM_WORLD, "\n%-15s%-1.3e\n", "Error norm:", (double)norm);
426: if (printTiming) PetscPrintf(PETSC_COMM_WORLD, "%-15s%-3" PetscInt_FMT "\n", "KSP iters:", ksp_its);
427: if (printTiming) PetscPrintf(PETSC_COMM_WORLD, "%-15s%-7.5f seconds\n", "PCSetUp:", time_mid1 - time_start);
428: if (printTiming) PetscPrintf(PETSC_COMM_WORLD, "%-15s%-7.5f seconds\n", "KSPSolve:", time_end - time_mid2);
429: if (printTiming) PetscPrintf(PETSC_COMM_WORLD, "%-15s%-7.5f seconds\n", "Total Solve:", time_end - time_start);
430: if (printTiming) PetscPrintf(PETSC_COMM_WORLD, "%-15s%-1.3e DoFs/sec\n", "FOM:", user.dim / (time_end - time_start));
431: PetscPrintf(PETSC_COMM_WORLD, "===========================================\n");
432: }
433: /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
434: /* Free up memory */
435: /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
436: if (!user.matmult) {
437: KSPDestroy(&ksp);
438: VecDestroy(&x);
439: }
440: VecDestroy(&u);
441: VecDestroy(&b);
442: MatDestroy(&A);
443: PetscFinalize();
444: return 0;
445: }
447: /*TEST
449: testset:
450: args: -print_timing false -matmult -its 10 -n 8
451: nsize: {{1 3}}
452: output_file: output/bench_pcsetup_matmult.out
454: test:
455: suffix: matmult
457: test:
458: suffix: hip_matmult
459: requires: hip
460: args: -vec_type hip -mat_type aijhipsparse
462: test:
463: suffix: cuda_matmult
464: requires: cuda
465: args: -vec_type cuda -mat_type aijcusparse
467: test:
468: suffix: kok_matmult
469: requires: kokkos_kernels
470: args: -vec_type kokkos -mat_type aijkokkos
472: testset:
473: args: -print_timing false -its 10 -n 8
474: nsize: {{1 3}}
475: output_file: output/bench_pcsetup_ksp.out
477: test:
478: suffix: ksp
480: test:
481: suffix: hip_ksp
482: requires: hip
483: args: -vec_type hip -mat_type aijhipsparse
485: test:
486: suffix: cuda_ksp
487: requires: cuda
488: args: -vec_type cuda -mat_type aijcusparse
490: test:
491: suffix: kok_ksp
492: requires: kokkos_kernels
493: args: -vec_type kokkos -mat_type aijkokkos
494: TEST*/