Actual source code: feopencl.c

  1: #include <petsc/private/petscfeimpl.h>

  3: #if defined(PETSC_HAVE_OPENCL)

  5: static PetscErrorCode PetscFEDestroy_OpenCL(PetscFE fem)
  6: {
  7:   PetscFE_OpenCL *ocl = (PetscFE_OpenCL *) fem->data;
  8:   PetscErrorCode  ierr;

 11:   clReleaseCommandQueue(ocl->queue_id);
 12:   ocl->queue_id = 0;
 13:   clReleaseContext(ocl->ctx_id);
 14:   ocl->ctx_id = 0;
 15:   PetscFree(ocl);
 16:   return(0);
 17: }

 19: #define CHKERRSTR(err) do {CHKERRQ(err); string_tail += count; if (string_tail == end_of_buffer) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB,"Buffer overflow");} while (0)
 20: enum {LAPLACIAN = 0, ELASTICITY = 1};

 22: /* NOTE: This is now broken for vector problems. Must redo loops to respect vector basis elements */
 23: /* dim     Number of spatial dimensions:          2                   */
 24: /* N_b     Number of basis functions:             generated           */
 25: /* N_{bt}  Number of total basis functions:       N_b * N_{comp}      */
 26: /* N_q     Number of quadrature points:           generated           */
 27: /* N_{bs}  Number of block cells                  LCM(N_b, N_q)       */
 28: /* N_{bst} Number of block cell components        LCM(N_{bt}, N_q)    */
 29: /* N_{bl}  Number of concurrent blocks            generated           */
 30: /* N_t     Number of threads:                     N_{bl} * N_{bs}     */
 31: /* N_{cbc} Number of concurrent basis      cells: N_{bl} * N_q        */
 32: /* N_{cqc} Number of concurrent quadrature cells: N_{bl} * N_b        */
 33: /* N_{sbc} Number of serial     basis      cells: N_{bs} / N_q        */
 34: /* N_{sqc} Number of serial     quadrature cells: N_{bs} / N_b        */
 35: /* N_{cb}  Number of serial cell batches:         input               */
 36: /* N_c     Number of total cells:                 N_{cb}*N_{t}/N_{comp} */
 37: static PetscErrorCode PetscFEOpenCLGenerateIntegrationCode(PetscFE fem, char **string_buffer, PetscInt buffer_length, PetscBool useAux, PetscInt N_bl)
 38: {
 39:   PetscFE_OpenCL *ocl = (PetscFE_OpenCL *) fem->data;
 40:   PetscQuadrature q;
 41:   char           *string_tail   = *string_buffer;
 42:   char           *end_of_buffer = *string_buffer + buffer_length;
 43:   char            float_str[]   = "float", double_str[]  = "double";
 44:   char           *numeric_str   = &(float_str[0]);
 45:   PetscInt        op            = ocl->op;
 46:   PetscBool       useField      = PETSC_FALSE;
 47:   PetscBool       useFieldDer   = PETSC_TRUE;
 48:   PetscBool       useFieldAux   = useAux;
 49:   PetscBool       useFieldDerAux= PETSC_FALSE;
 50:   PetscBool       useF0         = PETSC_TRUE;
 51:   PetscBool       useF1         = PETSC_TRUE;
 52:   const PetscReal *points, *weights;
 53:   PetscTabulation T;
 54:   PetscInt        dim, qNc, N_b, N_c, N_q, N_t, p, d, b, c;
 55:   size_t          count;
 56:   PetscErrorCode  ierr;

 59:   PetscFEGetSpatialDimension(fem, &dim);
 60:   PetscFEGetDimension(fem, &N_b);
 61:   PetscFEGetNumComponents(fem, &N_c);
 62:   PetscFEGetQuadrature(fem, &q);
 63:   PetscQuadratureGetData(q, NULL, &qNc, &N_q, &points, &weights);
 64:   if (qNc != 1) SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_SUP, "Only supports scalar quadrature, not %D components\n", qNc);
 65:   N_t  = N_b * N_c * N_q * N_bl;
 66:   /* Enable device extension for double precision */
 67:   if (ocl->realType == PETSC_DOUBLE) {
 68:     PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
 69: "#if defined(cl_khr_fp64)\n"
 70: "#  pragma OPENCL EXTENSION cl_khr_fp64: enable\n"
 71: "#elif defined(cl_amd_fp64)\n"
 72: "#  pragma OPENCL EXTENSION cl_amd_fp64: enable\n"
 73: "#endif\n",
 74:                               &count);CHKERRSTR(ierr);
 75:     numeric_str  = &(double_str[0]);
 76:   }
 77:   /* Kernel API */
 78:   PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
 79: "\n"
 80: "__kernel void integrateElementQuadrature(int N_cb, __global %s *coefficients, __global %s *coefficientsAux, __global %s *jacobianInverses, __global %s *jacobianDeterminants, __global %s *elemVec)\n"
 81: "{\n",
 82:                        &count, numeric_str, numeric_str, numeric_str, numeric_str, numeric_str);CHKERRSTR(ierr);
 83:   /* Quadrature */
 84:   PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
 85: "  /* Quadrature points\n"
 86: "   - (x1,y1,x2,y2,...) */\n"
 87: "  const %s points[%d] = {\n",
 88:                        &count, numeric_str, N_q*dim);CHKERRSTR(ierr);
 89:   for (p = 0; p < N_q; ++p) {
 90:     for (d = 0; d < dim; ++d) {
 91:       PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "%g,\n", &count, points[p*dim+d]);CHKERRSTR(ierr);
 92:     }
 93:   }
 94:   PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "};\n", &count);CHKERRSTR(ierr);
 95:   PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
 96: "  /* Quadrature weights\n"
 97: "   - (v1,v2,...) */\n"
 98: "  const %s weights[%d] = {\n",
 99:                        &count, numeric_str, N_q);CHKERRSTR(ierr);
100:   for (p = 0; p < N_q; ++p) {
101:     PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "%g,\n", &count, weights[p]);CHKERRSTR(ierr);
102:   }
103:   PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "};\n", &count);CHKERRSTR(ierr);
104:   /* Basis Functions */
105:   PetscFEGetCellTabulation(fem, 1, &T);
106:   PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
107: "  /* Nodal basis function evaluations\n"
108: "    - basis component is fastest varying, the basis function, then point */\n"
109: "  const %s Basis[%d] = {\n",
110:                        &count, numeric_str, N_q*N_b*N_c);CHKERRSTR(ierr);
111:   for (p = 0; p < N_q; ++p) {
112:     for (b = 0; b < N_b; ++b) {
113:       for (c = 0; c < N_c; ++c) {
114:         PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "%g,\n", &count, T->T[0][(p*N_b + b)*N_c + c]);CHKERRSTR(ierr);
115:       }
116:     }
117:   }
118:   PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "};\n", &count);CHKERRSTR(ierr);
119:   PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
120: "\n"
121: "  /* Nodal basis function derivative evaluations,\n"
122: "      - derivative direction is fastest varying, then basis component, then basis function, then point */\n"
123: "  const %s%d BasisDerivatives[%d] = {\n",
124:                        &count, numeric_str, dim, N_q*N_b*N_c);CHKERRSTR(ierr);
125:   for (p = 0; p < N_q; ++p) {
126:     for (b = 0; b < N_b; ++b) {
127:       for (c = 0; c < N_c; ++c) {
128:         PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "(%s%d)(", &count, numeric_str, dim);CHKERRSTR(ierr);
129:         for (d = 0; d < dim; ++d) {
130:           if (d > 0) {
131:             PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, ", %g", &count, T->T[1][((p*N_b + b)*dim + d)*N_c + c]);CHKERRSTR(ierr);
132:           } else {
133:             PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "%g", &count, T->T[1][((p*N_b + b)*dim + d)*N_c + c]);CHKERRSTR(ierr);
134:           }
135:         }
136:         PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "),\n", &count);CHKERRSTR(ierr);
137:       }
138:     }
139:   }
140:   PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "};\n", &count);CHKERRSTR(ierr);
141:   /* Sizes */
142:   PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
143: "  const int dim    = %d;                           // The spatial dimension\n"
144: "  const int N_bl   = %d;                           // The number of concurrent blocks\n"
145: "  const int N_b    = %d;                           // The number of basis functions\n"
146: "  const int N_comp = %d;                           // The number of basis function components\n"
147: "  const int N_bt   = N_b*N_comp;                    // The total number of scalar basis functions\n"
148: "  const int N_q    = %d;                           // The number of quadrature points\n"
149: "  const int N_bst  = N_bt*N_q;                      // The block size, LCM(N_b*N_comp, N_q), Notice that a block is not processed simultaneously\n"
150: "  const int N_t    = N_bst*N_bl;                    // The number of threads, N_bst * N_bl\n"
151: "  const int N_bc   = N_t/N_comp;                    // The number of cells per batch (N_b*N_q*N_bl)\n"
152: "  const int N_sbc  = N_bst / (N_q * N_comp);\n"
153: "  const int N_sqc  = N_bst / N_bt;\n"
154: "  /*const int N_c    = N_cb * N_bc;*/\n"
155: "\n"
156: "  /* Calculated indices */\n"
157: "  /*const int tidx    = get_local_id(0) + get_local_size(0)*get_local_id(1);*/\n"
158: "  const int tidx    = get_local_id(0);\n"
159: "  const int blidx   = tidx / N_bst;                  // Block number for this thread\n"
160: "  const int bidx    = tidx %% N_bt;                   // Basis function mapped to this thread\n"
161: "  const int cidx    = tidx %% N_comp;                 // Basis component mapped to this thread\n"
162: "  const int qidx    = tidx %% N_q;                    // Quadrature point mapped to this thread\n"
163: "  const int blbidx  = tidx %% N_q + blidx*N_q;        // Cell mapped to this thread in the basis phase\n"
164: "  const int blqidx  = tidx %% N_b + blidx*N_b;        // Cell mapped to this thread in the quadrature phase\n"
165: "  const int gidx    = get_group_id(1)*get_num_groups(0) + get_group_id(0);\n"
166: "  const int Goffset = gidx*N_cb*N_bc;\n",
167:                             &count, dim, N_bl, N_b, N_c, N_q);CHKERRSTR(ierr);
168:   /* Local memory */
169:   PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
170: "\n"
171: "  /* Quadrature data */\n"
172: "  %s                w;                   // $w_q$, Quadrature weight at $x_q$\n"
173: "  __local %s         phi_i[%d];    //[N_bt*N_q];  // $\\phi_i(x_q)$, Value of the basis function $i$ at $x_q$\n"
174: "  __local %s%d       phiDer_i[%d]; //[N_bt*N_q];  // $\\frac{\\partial\\phi_i(x_q)}{\\partial x_d}$, Value of the derivative of basis function $i$ in direction $x_d$ at $x_q$\n"
175: "  /* Geometric data */\n"
176: "  __local %s        detJ[%d]; //[N_t];           // $|J(x_q)|$, Jacobian determinant at $x_q$\n"
177: "  __local %s        invJ[%d];//[N_t*dim*dim];   // $J^{-1}(x_q)$, Jacobian inverse at $x_q$\n",
178:                             &count, numeric_str, numeric_str, N_b*N_c*N_q, numeric_str, dim, N_b*N_c*N_q, numeric_str, N_t,
179:                             numeric_str, N_t*dim*dim, numeric_str, N_t*N_b*N_c);CHKERRSTR(ierr);
180:   PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
181: "  /* FEM data */\n"
182: "  __local %s        u_i[%d]; //[N_t*N_bt];       // Coefficients $u_i$ of the field $u|_{\\mathcal{T}} = \\sum_i u_i \\phi_i$\n",
183:                             &count, numeric_str, N_t*N_b*N_c);CHKERRSTR(ierr);
184:   if (useAux) {
185:     PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
186: "  __local %s        a_i[%d]; //[N_t];            // Coefficients $a_i$ of the auxiliary field $a|_{\\mathcal{T}} = \\sum_i a_i \\phi^R_i$\n",
187:                             &count, numeric_str, N_t);CHKERRSTR(ierr);
188:   }
189:   if (useF0) {
190:     PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
191: "  /* Intermediate calculations */\n"
192: "  __local %s         f_0[%d]; //[N_t*N_sqc];      // $f_0(u(x_q), \\nabla u(x_q)) |J(x_q)| w_q$\n",
193:                               &count, numeric_str, N_t*N_q);CHKERRSTR(ierr);
194:   }
195:   if (useF1) {
196:     PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
197: "  __local %s%d       f_1[%d]; //[N_t*N_sqc];      // $f_1(u(x_q), \\nabla u(x_q)) |J(x_q)| w_q$\n",
198:                               &count, numeric_str, dim, N_t*N_q);CHKERRSTR(ierr);
199:   }
200:   /* TODO: If using elasticity, put in mu/lambda coefficients */
201:   PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
202: "  /* Output data */\n"
203: "  %s                e_i;                 // Coefficient $e_i$ of the residual\n\n",
204:                             &count, numeric_str);CHKERRSTR(ierr);
205:   /* One-time loads */
206:   PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
207: "  /* These should be generated inline */\n"
208: "  /* Load quadrature weights */\n"
209: "  w = weights[qidx];\n"
210: "  /* Load basis tabulation \\phi_i for this cell */\n"
211: "  if (tidx < N_bt*N_q) {\n"
212: "    phi_i[tidx]    = Basis[tidx];\n"
213: "    phiDer_i[tidx] = BasisDerivatives[tidx];\n"
214: "  }\n\n",
215:                        &count);CHKERRSTR(ierr);
216:   /* Batch loads */
217:   PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
218: "  for (int batch = 0; batch < N_cb; ++batch) {\n"
219: "    /* Load geometry */\n"
220: "    detJ[tidx] = jacobianDeterminants[Goffset+batch*N_bc+tidx];\n"
221: "    for (int n = 0; n < dim*dim; ++n) {\n"
222: "      const int offset = n*N_t;\n"
223: "      invJ[offset+tidx] = jacobianInverses[(Goffset+batch*N_bc)*dim*dim+offset+tidx];\n"
224: "    }\n"
225: "    /* Load coefficients u_i for this cell */\n"
226: "    for (int n = 0; n < N_bt; ++n) {\n"
227: "      const int offset = n*N_t;\n"
228: "      u_i[offset+tidx] = coefficients[(Goffset*N_bt)+batch*N_t*N_b+offset+tidx];\n"
229: "    }\n",
230:                        &count);CHKERRSTR(ierr);
231:   if (useAux) {
232:   PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
233: "    /* Load coefficients a_i for this cell */\n"
234: "    /* TODO: This should not be N_t here, it should be N_bc*N_comp_aux */\n"
235: "    a_i[tidx] = coefficientsAux[Goffset+batch*N_t+tidx];\n",
236:                             &count);CHKERRSTR(ierr);
237:   }
238:   /* Quadrature phase */
239:   PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
240: "    barrier(CLK_LOCAL_MEM_FENCE);\n"
241: "\n"
242: "    /* Map coefficients to values at quadrature points */\n"
243: "    for (int c = 0; c < N_sqc; ++c) {\n"
244: "      const int cell          = c*N_bl*N_b + blqidx;\n"
245: "      const int fidx          = (cell*N_q + qidx)*N_comp + cidx;\n",
246:                        &count);CHKERRSTR(ierr);
247:   if (useField) {
248:     PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
249: "      %s  u[%d]; //[N_comp];     // $u(x_q)$, Value of the field at $x_q$\n",
250:                               &count, numeric_str, N_c);CHKERRSTR(ierr);
251:   }
252:   if (useFieldDer) {
253:     PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
254: "      %s%d   gradU[%d]; //[N_comp]; // $\\nabla u(x_q)$, Value of the field gradient at $x_q$\n",
255:                               &count, numeric_str, dim, N_c);CHKERRSTR(ierr);
256:   }
257:   if (useFieldAux) {
258:     PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
259: "      %s  a[%d]; //[1];     // $a(x_q)$, Value of the auxiliary fields at $x_q$\n",
260:                               &count, numeric_str, 1);CHKERRSTR(ierr);
261:   }
262:   if (useFieldDerAux) {
263:     PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
264: "      %s%d   gradA[%d]; //[1]; // $\\nabla a(x_q)$, Value of the auxiliary field gradient at $x_q$\n",
265:                               &count, numeric_str, dim, 1);CHKERRSTR(ierr);
266:   }
267:   PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
268: "\n"
269: "      for (int comp = 0; comp < N_comp; ++comp) {\n",
270:                             &count);CHKERRSTR(ierr);
271:   if (useField) {PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "        u[comp] = 0.0;\n", &count);CHKERRSTR(ierr);}
272:   if (useFieldDer) {
273:     switch (dim) {
274:     case 1:
275:       PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "        gradU[comp].x = 0.0;\n", &count);CHKERRSTR(ierr);break;
276:     case 2:
277:       PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "        gradU[comp].x = 0.0; gradU[comp].y = 0.0;\n", &count);CHKERRSTR(ierr);break;
278:     case 3:
279:       PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "        gradU[comp].x = 0.0; gradU[comp].y = 0.0; gradU[comp].z = 0.0;\n", &count);CHKERRSTR(ierr);break;
280:     }
281:   }
282:   PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
283: "      }\n",
284:                             &count);CHKERRSTR(ierr);
285:   if (useFieldAux) {
286:     PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "      a[0] = 0.0;\n", &count);CHKERRSTR(ierr);
287:   }
288:   if (useFieldDerAux) {
289:     switch (dim) {
290:     case 1:
291:       PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "      gradA[0].x = 0.0;\n", &count);CHKERRSTR(ierr);break;
292:     case 2:
293:       PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "      gradA[0].x = 0.0; gradA[0].y = 0.0;\n", &count);CHKERRSTR(ierr);break;
294:     case 3:
295:       PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "      gradA[0].x = 0.0; gradA[0].y = 0.0; gradA[0].z = 0.0;\n", &count);CHKERRSTR(ierr);break;
296:     }
297:   }
298:   PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
299: "      /* Get field and derivatives at this quadrature point */\n"
300: "      for (int i = 0; i < N_b; ++i) {\n"
301: "        for (int comp = 0; comp < N_comp; ++comp) {\n"
302: "          const int b    = i*N_comp+comp;\n"
303: "          const int pidx = qidx*N_bt + b;\n"
304: "          const int uidx = cell*N_bt + b;\n"
305: "          %s%d   realSpaceDer;\n\n",
306:                             &count, numeric_str, dim);CHKERRSTR(ierr);
307:   if (useField) {PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,"          u[comp] += u_i[uidx]*phi_i[pidx];\n", &count);CHKERRSTR(ierr);}
308:   if (useFieldDer) {
309:     switch (dim) {
310:     case 2:
311:       PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
312: "          realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y;\n"
313: "          gradU[comp].x += u_i[uidx]*realSpaceDer.x;\n"
314: "          realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y;\n"
315: "          gradU[comp].y += u_i[uidx]*realSpaceDer.y;\n",
316:                            &count);CHKERRSTR(ierr);break;
317:     case 3:
318:       PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
319: "          realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+0]*phiDer_i[pidx].z;\n"
320: "          gradU[comp].x += u_i[uidx]*realSpaceDer.x;\n"
321: "          realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+1]*phiDer_i[pidx].z;\n"
322: "          gradU[comp].y += u_i[uidx]*realSpaceDer.y;\n"
323: "          realSpaceDer.z = invJ[cell*dim*dim+0*dim+2]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+2]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+2]*phiDer_i[pidx].z;\n"
324: "          gradU[comp].z += u_i[uidx]*realSpaceDer.z;\n",
325:                            &count);CHKERRSTR(ierr);break;
326:     }
327:   }
328:   PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
329: "        }\n"
330: "      }\n",
331:                             &count);CHKERRSTR(ierr);
332:   if (useFieldAux) {
333:     PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,"          a[0] += a_i[cell];\n", &count);CHKERRSTR(ierr);
334:   }
335:   /* Calculate residual at quadrature points: Should be generated by an weak form egine */
336:   PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
337: "      /* Process values at quadrature points */\n",
338:                             &count);CHKERRSTR(ierr);
339:   switch (op) {
340:   case LAPLACIAN:
341:     if (useF0) {PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "      f_0[fidx] = 4.0;\n", &count);CHKERRSTR(ierr);}
342:     if (useF1) {
343:       if (useAux) {PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "      f_1[fidx] = a[0]*gradU[cidx];\n", &count);CHKERRSTR(ierr);}
344:       else        {PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "      f_1[fidx] = gradU[cidx];\n", &count);CHKERRSTR(ierr);}
345:     }
346:     break;
347:   case ELASTICITY:
348:     if (useF0) {PetscSNPrintfCount(string_tail, end_of_buffer - string_tail, "      f_0[fidx] = 4.0;\n", &count);CHKERRSTR(ierr);}
349:     if (useF1) {
350:     switch (dim) {
351:     case 2:
352:       PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
353: "      switch (cidx) {\n"
354: "      case 0:\n"
355: "        f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y) + mu*(gradU[0].x + gradU[0].x);\n"
356: "        f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y) + mu*(gradU[0].y + gradU[1].x);\n"
357: "        break;\n"
358: "      case 1:\n"
359: "        f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y) + mu*(gradU[1].x + gradU[0].y);\n"
360: "        f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y) + mu*(gradU[1].y + gradU[1].y);\n"
361: "      }\n",
362:                            &count);CHKERRSTR(ierr);break;
363:     case 3:
364:       PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
365: "      switch (cidx) {\n"
366: "      case 0:\n"
367: "        f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[0].x + gradU[0].x);\n"
368: "        f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[0].y + gradU[1].x);\n"
369: "        f_1[fidx].z = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[0].z + gradU[2].x);\n"
370: "        break;\n"
371: "      case 1:\n"
372: "        f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[1].x + gradU[0].y);\n"
373: "        f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[1].y + gradU[1].y);\n"
374: "        f_1[fidx].z = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[1].y + gradU[2].y);\n"
375: "        break;\n"
376: "      case 2:\n"
377: "        f_1[fidx].x = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[2].x + gradU[0].z);\n"
378: "        f_1[fidx].y = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[2].y + gradU[1].z);\n"
379: "        f_1[fidx].z = lambda*(gradU[0].x + gradU[1].y + gradU[2].z) + mu*(gradU[2].y + gradU[2].z);\n"
380: "      }\n",
381:                            &count);CHKERRSTR(ierr);break;
382:     }}
383:     break;
384:   default:
385:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_SUP, "PDE operator %d is not supported", op);
386:   }
387:   if (useF0) {PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,"      f_0[fidx] *= detJ[cell]*w;\n", &count);CHKERRSTR(ierr);}
388:   if (useF1) {
389:     switch (dim) {
390:     case 1:
391:       PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,"      f_1[fidx].x *= detJ[cell]*w;\n", &count);CHKERRSTR(ierr);break;
392:     case 2:
393:       PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,"      f_1[fidx].x *= detJ[cell]*w; f_1[fidx].y *= detJ[cell]*w;\n", &count);CHKERRSTR(ierr);break;
394:     case 3:
395:       PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,"      f_1[fidx].x *= detJ[cell]*w; f_1[fidx].y *= detJ[cell]*w; f_1[fidx].z *= detJ[cell]*w;\n", &count);CHKERRSTR(ierr);break;
396:     }
397:   }
398:   /* Thread transpose */
399:   PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
400: "    }\n\n"
401: "    /* ==== TRANSPOSE THREADS ==== */\n"
402: "    barrier(CLK_LOCAL_MEM_FENCE);\n\n",
403:                        &count);CHKERRSTR(ierr);
404:   /* Basis phase */
405:   PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
406: "    /* Map values at quadrature points to coefficients */\n"
407: "    for (int c = 0; c < N_sbc; ++c) {\n"
408: "      const int cell = c*N_bl*N_q + blbidx; /* Cell number in batch */\n"
409: "\n"
410: "      e_i = 0.0;\n"
411: "      for (int q = 0; q < N_q; ++q) {\n"
412: "        const int pidx = q*N_bt + bidx;\n"
413: "        const int fidx = (cell*N_q + q)*N_comp + cidx;\n"
414: "        %s%d   realSpaceDer;\n\n",
415:                        &count, numeric_str, dim);CHKERRSTR(ierr);

417:   if (useF0) {PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,"        e_i += phi_i[pidx]*f_0[fidx];\n", &count);CHKERRSTR(ierr);}
418:   if (useF1) {
419:     switch (dim) {
420:     case 2:
421:       PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
422: "        realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y;\n"
423: "        e_i           += realSpaceDer.x*f_1[fidx].x;\n"
424: "        realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y;\n"
425: "        e_i           += realSpaceDer.y*f_1[fidx].y;\n",
426:                            &count);CHKERRSTR(ierr);break;
427:     case 3:
428:       PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
429: "        realSpaceDer.x = invJ[cell*dim*dim+0*dim+0]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+0]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+0]*phiDer_i[pidx].z;\n"
430: "        e_i           += realSpaceDer.x*f_1[fidx].x;\n"
431: "        realSpaceDer.y = invJ[cell*dim*dim+0*dim+1]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+1]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+1]*phiDer_i[pidx].z;\n"
432: "        e_i           += realSpaceDer.y*f_1[fidx].y;\n"
433: "        realSpaceDer.z = invJ[cell*dim*dim+0*dim+2]*phiDer_i[pidx].x + invJ[cell*dim*dim+1*dim+2]*phiDer_i[pidx].y + invJ[cell*dim*dim+2*dim+2]*phiDer_i[pidx].z;\n"
434: "        e_i           += realSpaceDer.z*f_1[fidx].z;\n",
435:                            &count);CHKERRSTR(ierr);break;
436:     }
437:   }
438:   PetscSNPrintfCount(string_tail, end_of_buffer - string_tail,
439: "      }\n"
440: "      /* Write element vector for N_{cbc} cells at a time */\n"
441: "      elemVec[(Goffset + batch*N_bc + c*N_bl*N_q)*N_bt + tidx] = e_i;\n"
442: "    }\n"
443: "    /* ==== Could do one write per batch ==== */\n"
444: "  }\n"
445: "  return;\n"
446: "}\n",
447:                        &count);CHKERRSTR(ierr);
448:   return(0);
449: }

451: static PetscErrorCode PetscFEOpenCLGetIntegrationKernel(PetscFE fem, PetscBool useAux, cl_program *ocl_prog, cl_kernel *ocl_kernel)
452: {
453:   PetscFE_OpenCL *ocl = (PetscFE_OpenCL *) fem->data;
454:   PetscInt        dim, N_bl;
455:   PetscBool       flg;
456:   char           *buffer;
457:   size_t          len;
458:   char            errMsg[8192];
459:   cl_int          err;
460:   PetscErrorCode  ierr;

463:   PetscFEGetSpatialDimension(fem, &dim);
464:   PetscMalloc1(8192, &buffer);
465:   PetscFEGetTileSizes(fem, NULL, &N_bl, NULL, NULL);
466:   PetscFEOpenCLGenerateIntegrationCode(fem, &buffer, 8192, useAux, N_bl);
467:   PetscOptionsHasName(((PetscObject)fem)->options,((PetscObject)fem)->prefix, "-petscfe_opencl_kernel_print", &flg);
468:   if (flg) {PetscPrintf(PetscObjectComm((PetscObject) fem), "OpenCL FE Integration Kernel:\n%s\n", buffer);}
469:   PetscStrlen(buffer,&len);
470:   *ocl_prog = clCreateProgramWithSource(ocl->ctx_id, 1, (const char **) &buffer, &len, &err);CHKERRQ(err);
471:   err = clBuildProgram(*ocl_prog, 0, NULL, NULL, NULL, NULL);
472:   if (err != CL_SUCCESS) {
473:     err = clGetProgramBuildInfo(*ocl_prog, ocl->dev_id, CL_PROGRAM_BUILD_LOG, 8192*sizeof(char), &errMsg, NULL);
474:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Build failed! Log:\n %s", errMsg);
475:   }
476:   PetscFree(buffer);
477:   *ocl_kernel = clCreateKernel(*ocl_prog, "integrateElementQuadrature", &ierr);
478:   return(0);
479: }

481: static PetscErrorCode PetscFEOpenCLCalculateGrid(PetscFE fem, PetscInt N, PetscInt blockSize, size_t *x, size_t *y, size_t *z)
482: {
483:   const PetscInt Nblocks = N/blockSize;

486:   if (N % blockSize) SETERRQ2(PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid block size %d for %d elements", blockSize, N);
487:   *z = 1;
488:   *y = 1;
489:   for (*x = (size_t) (PetscSqrtReal(Nblocks) + 0.5); *x > 0; --*x) {
490:     *y = Nblocks / *x;
491:     if (*x * *y == (size_t)Nblocks) break;
492:   }
493:   if (*x * *y != (size_t)Nblocks) SETERRQ2(PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Could not find partition for %D with block size %D", N, blockSize);
494:   return(0);
495: }

497: static PetscErrorCode PetscFEOpenCLLogResidual(PetscFE fem, PetscLogDouble time, PetscLogDouble flops)
498: {
499:   PetscFE_OpenCL   *ocl = (PetscFE_OpenCL *) fem->data;
500:   PetscStageLog     stageLog;
501:   PetscEventPerfLog eventLog = NULL;
502:   int               stage;
503:   PetscErrorCode    ierr;

506:   PetscLogGetStageLog(&stageLog);
507:   PetscStageLogGetCurrent(stageLog, &stage);
508:   PetscStageLogGetEventPerfLog(stageLog, stage, &eventLog);
509:     /* Log performance info */
510:   eventLog->eventInfo[ocl->residualEvent].count++;
511:   eventLog->eventInfo[ocl->residualEvent].time  += time;
512:   eventLog->eventInfo[ocl->residualEvent].flops += flops;
513:   return(0);
514: }

516: static PetscErrorCode PetscFEIntegrateResidual_OpenCL(PetscDS prob, PetscFormKey key, PetscInt Ne, PetscFEGeom *cgeom,
517:                                                       const PetscScalar coefficients[], const PetscScalar coefficients_t[], PetscDS probAux, const PetscScalar coefficientsAux[], PetscReal t, PetscScalar elemVec[])
518: {
519:   /* Nbc = batchSize */
520:   PetscFE           fem;
521:   PetscFE_OpenCL   *ocl;
522:   PetscPointFunc    f0_func;
523:   PetscPointFunc    f1_func;
524:   PetscQuadrature   q;
525:   PetscInt          dim, qNc;
526:   PetscInt          N_b;    /* The number of basis functions */
527:   PetscInt          N_comp; /* The number of basis function components */
528:   PetscInt          N_bt;   /* The total number of scalar basis functions */
529:   PetscInt          N_q;    /* The number of quadrature points */
530:   PetscInt          N_bst;  /* The block size, LCM(N_bt, N_q), Notice that a block is not process simultaneously */
531:   PetscInt          N_t;    /* The number of threads, N_bst * N_bl */
532:   PetscInt          N_bl;   /* The number of blocks */
533:   PetscInt          N_bc;   /* The batch size, N_bl*N_q*N_b */
534:   PetscInt          N_cb;   /* The number of batches */
535:   const PetscInt    field = key.field;
536:   PetscInt          numFlops, f0Flops = 0, f1Flops = 0;
537:   PetscBool         useAux      = probAux ? PETSC_TRUE : PETSC_FALSE;
538:   PetscBool         useField    = PETSC_FALSE;
539:   PetscBool         useFieldDer = PETSC_TRUE;
540:   PetscBool         useF0       = PETSC_TRUE;
541:   PetscBool         useF1       = PETSC_TRUE;
542:   /* OpenCL variables */
543:   cl_program        ocl_prog;
544:   cl_kernel         ocl_kernel;
545:   cl_event          ocl_ev;         /* The event for tracking kernel execution */
546:   cl_ulong          ns_start;       /* Nanoseconds counter on GPU at kernel start */
547:   cl_ulong          ns_end;         /* Nanoseconds counter on GPU at kernel stop */
548:   cl_mem            o_jacobianInverses, o_jacobianDeterminants;
549:   cl_mem            o_coefficients, o_coefficientsAux, o_elemVec;
550:   float            *f_coeff = NULL, *f_coeffAux = NULL, *f_invJ = NULL, *f_detJ = NULL;
551:   double           *d_coeff = NULL, *d_coeffAux = NULL, *d_invJ = NULL, *d_detJ = NULL;
552:   PetscReal        *r_invJ = NULL, *r_detJ = NULL;
553:   void             *oclCoeff, *oclCoeffAux, *oclInvJ, *oclDetJ;
554:   size_t            local_work_size[3], global_work_size[3];
555:   size_t            realSize, x, y, z;
556:   const PetscReal   *points, *weights;
557:   PetscErrorCode    ierr;

560:   PetscDSGetDiscretization(prob, field, (PetscObject *) &fem);
561:   ocl  = (PetscFE_OpenCL *) fem->data;
562:   if (!Ne) {PetscFEOpenCLLogResidual(fem, 0.0, 0.0); return(0);}
563:   PetscFEGetSpatialDimension(fem, &dim);
564:   PetscFEGetQuadrature(fem, &q);
565:   PetscQuadratureGetData(q, NULL, &qNc, &N_q, &points, &weights);
566:   if (qNc != 1) SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_SUP, "Only supports scalar quadrature, not %D components\n", qNc);
567:   PetscFEGetDimension(fem, &N_b);
568:   PetscFEGetNumComponents(fem, &N_comp);
569:   PetscDSGetResidual(prob, field, &f0_func, &f1_func);
570:   PetscFEGetTileSizes(fem, NULL, &N_bl, &N_bc, &N_cb);
571:   N_bt  = N_b*N_comp;
572:   N_bst = N_bt*N_q;
573:   N_t   = N_bst*N_bl;
574:   if (N_bc*N_comp != N_t) SETERRQ3(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Number of threads %d should be %d * %d", N_t, N_bc, N_comp);
575:   /* Calculate layout */
576:   if (Ne % (N_cb*N_bc)) { /* Remainder cells */
577:     PetscFEIntegrateResidual_Basic(prob, key, Ne, cgeom, coefficients, coefficients_t, probAux, coefficientsAux, t, elemVec);
578:     return(0);
579:   }
580:   PetscFEOpenCLCalculateGrid(fem, Ne, N_cb*N_bc, &x, &y, &z);
581:   local_work_size[0]  = N_bc*N_comp;
582:   local_work_size[1]  = 1;
583:   local_work_size[2]  = 1;
584:   global_work_size[0] = x * local_work_size[0];
585:   global_work_size[1] = y * local_work_size[1];
586:   global_work_size[2] = z * local_work_size[2];
587:   PetscInfo7(fem, "GPU layout grid(%d,%d,%d) block(%d,%d,%d) with %d batches\n", x, y, z, local_work_size[0], local_work_size[1], local_work_size[2], N_cb);
588:   PetscInfo2(fem, " N_t: %d, N_cb: %d\n", N_t, N_cb);
589:   /* Generate code */
590:   if (probAux) {
591:     PetscSpace P;
592:     PetscInt   NfAux, order, f;

594:     PetscDSGetNumFields(probAux, &NfAux);
595:     for (f = 0; f < NfAux; ++f) {
596:       PetscFE feAux;

598:       PetscDSGetDiscretization(probAux, f, (PetscObject *) &feAux);
599:       PetscFEGetBasisSpace(feAux, &P);
600:       PetscSpaceGetDegree(P, &order, NULL);
601:       if (order > 0) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Can only handle P0 coefficient fields");
602:     }
603:   }
604:   PetscFEOpenCLGetIntegrationKernel(fem, useAux, &ocl_prog, &ocl_kernel);
605:   /* Create buffers on the device and send data over */
606:   PetscDataTypeGetSize(ocl->realType, &realSize);
607:   if (cgeom->numPoints > 1) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Only support affine geometry for OpenCL integration right now");
608:   if (sizeof(PetscReal) != realSize) {
609:     switch (ocl->realType) {
610:     case PETSC_FLOAT:
611:     {
612:       PetscInt c, b, d;

614:       PetscMalloc4(Ne*N_bt,&f_coeff,Ne,&f_coeffAux,Ne*dim*dim,&f_invJ,Ne,&f_detJ);
615:       for (c = 0; c < Ne; ++c) {
616:         f_detJ[c] = (float) cgeom->detJ[c];
617:         for (d = 0; d < dim*dim; ++d) {
618:           f_invJ[c*dim*dim+d] = (float) cgeom->invJ[c * dim * dim + d];
619:         }
620:         for (b = 0; b < N_bt; ++b) {
621:           f_coeff[c*N_bt+b] = (float) coefficients[c*N_bt+b];
622:         }
623:       }
624:       if (coefficientsAux) { /* Assume P0 */
625:         for (c = 0; c < Ne; ++c) {
626:           f_coeffAux[c] = (float) coefficientsAux[c];
627:         }
628:       }
629:       oclCoeff      = (void *) f_coeff;
630:       if (coefficientsAux) {
631:         oclCoeffAux = (void *) f_coeffAux;
632:       } else {
633:         oclCoeffAux = NULL;
634:       }
635:       oclInvJ       = (void *) f_invJ;
636:       oclDetJ       = (void *) f_detJ;
637:     }
638:     break;
639:     case PETSC_DOUBLE:
640:     {
641:       PetscInt c, b, d;

643:       PetscMalloc4(Ne*N_bt,&d_coeff,Ne,&d_coeffAux,Ne*dim*dim,&d_invJ,Ne,&d_detJ);
644:       for (c = 0; c < Ne; ++c) {
645:         d_detJ[c] = (double) cgeom->detJ[c];
646:         for (d = 0; d < dim*dim; ++d) {
647:           d_invJ[c*dim*dim+d] = (double) cgeom->invJ[c * dim * dim + d];
648:         }
649:         for (b = 0; b < N_bt; ++b) {
650:           d_coeff[c*N_bt+b] = (double) coefficients[c*N_bt+b];
651:         }
652:       }
653:       if (coefficientsAux) { /* Assume P0 */
654:         for (c = 0; c < Ne; ++c) {
655:           d_coeffAux[c] = (double) coefficientsAux[c];
656:         }
657:       }
658:       oclCoeff      = (void *) d_coeff;
659:       if (coefficientsAux) {
660:         oclCoeffAux = (void *) d_coeffAux;
661:       } else {
662:         oclCoeffAux = NULL;
663:       }
664:       oclInvJ       = (void *) d_invJ;
665:       oclDetJ       = (void *) d_detJ;
666:     }
667:     break;
668:     default:
669:       SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Unsupported PETSc type %d", ocl->realType);
670:     }
671:   } else {
672:     PetscInt c, d;

674:     PetscMalloc2(Ne*dim*dim,&r_invJ,Ne,&r_detJ);
675:     for (c = 0; c < Ne; ++c) {
676:       r_detJ[c] = cgeom->detJ[c];
677:       for (d = 0; d < dim*dim; ++d) {
678:         r_invJ[c*dim*dim+d] = cgeom->invJ[c * dim * dim + d];
679:       }
680:     }
681:     oclCoeff    = (void *) coefficients;
682:     oclCoeffAux = (void *) coefficientsAux;
683:     oclInvJ     = (void *) r_invJ;
684:     oclDetJ     = (void *) r_detJ;
685:   }
686:   o_coefficients         = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Ne*N_bt    * realSize, oclCoeff,    &ierr);
687:   if (coefficientsAux) {
688:     o_coefficientsAux    = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Ne         * realSize, oclCoeffAux, &ierr);
689:   } else {
690:     o_coefficientsAux    = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY,                        Ne         * realSize, oclCoeffAux, &ierr);
691:   }
692:   o_jacobianInverses     = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Ne*dim*dim * realSize, oclInvJ,     &ierr);
693:   o_jacobianDeterminants = clCreateBuffer(ocl->ctx_id, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Ne         * realSize, oclDetJ,     &ierr);
694:   o_elemVec              = clCreateBuffer(ocl->ctx_id, CL_MEM_WRITE_ONLY,                       Ne*N_bt    * realSize, NULL,        &ierr);
695:   /* Kernel launch */
696:   clSetKernelArg(ocl_kernel, 0, sizeof(cl_int), (void*) &N_cb);
697:   clSetKernelArg(ocl_kernel, 1, sizeof(cl_mem), (void*) &o_coefficients);
698:   clSetKernelArg(ocl_kernel, 2, sizeof(cl_mem), (void*) &o_coefficientsAux);
699:   clSetKernelArg(ocl_kernel, 3, sizeof(cl_mem), (void*) &o_jacobianInverses);
700:   clSetKernelArg(ocl_kernel, 4, sizeof(cl_mem), (void*) &o_jacobianDeterminants);
701:   clSetKernelArg(ocl_kernel, 5, sizeof(cl_mem), (void*) &o_elemVec);
702:   clEnqueueNDRangeKernel(ocl->queue_id, ocl_kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &ocl_ev);
703:   /* Read data back from device */
704:   if (sizeof(PetscReal) != realSize) {
705:     switch (ocl->realType) {
706:     case PETSC_FLOAT:
707:     {
708:       float   *elem;
709:       PetscInt c, b;

711:       PetscFree4(f_coeff,f_coeffAux,f_invJ,f_detJ);
712:       PetscMalloc1(Ne*N_bt, &elem);
713:       clEnqueueReadBuffer(ocl->queue_id, o_elemVec, CL_TRUE, 0, Ne*N_bt * realSize, elem, 0, NULL, NULL);
714:       for (c = 0; c < Ne; ++c) {
715:         for (b = 0; b < N_bt; ++b) {
716:           elemVec[c*N_bt+b] = (PetscScalar) elem[c*N_bt+b];
717:         }
718:       }
719:       PetscFree(elem);
720:     }
721:     break;
722:     case PETSC_DOUBLE:
723:     {
724:       double  *elem;
725:       PetscInt c, b;

727:       PetscFree4(d_coeff,d_coeffAux,d_invJ,d_detJ);
728:       PetscMalloc1(Ne*N_bt, &elem);
729:       clEnqueueReadBuffer(ocl->queue_id, o_elemVec, CL_TRUE, 0, Ne*N_bt * realSize, elem, 0, NULL, NULL);
730:       for (c = 0; c < Ne; ++c) {
731:         for (b = 0; b < N_bt; ++b) {
732:           elemVec[c*N_bt+b] = (PetscScalar) elem[c*N_bt+b];
733:         }
734:       }
735:       PetscFree(elem);
736:     }
737:     break;
738:     default:
739:       SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Unsupported PETSc type %d", ocl->realType);
740:     }
741:   } else {
742:     PetscFree2(r_invJ,r_detJ);
743:     clEnqueueReadBuffer(ocl->queue_id, o_elemVec, CL_TRUE, 0, Ne*N_bt * realSize, elemVec, 0, NULL, NULL);
744:   }
745:   /* Log performance */
746:   clGetEventProfilingInfo(ocl_ev, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &ns_start, NULL);
747:   clGetEventProfilingInfo(ocl_ev, CL_PROFILING_COMMAND_END,   sizeof(cl_ulong), &ns_end,   NULL);
748:   f0Flops = 0;
749:   switch (ocl->op) {
750:   case LAPLACIAN:
751:     f1Flops = useAux ? dim : 0;break;
752:   case ELASTICITY:
753:     f1Flops = 2*dim*dim;break;
754:   }
755:   numFlops = Ne*(
756:     N_q*(
757:       N_b*N_comp*((useField ? 2 : 0) + (useFieldDer ? 2*dim*(dim + 1) : 0))
758:       /*+
759:        N_ba*N_compa*((useFieldAux ? 2 : 0) + (useFieldDerAux ? 2*dim*(dim + 1) : 0))*/
760:       +
761:       N_comp*((useF0 ? f0Flops + 2 : 0) + (useF1 ? f1Flops + 2*dim : 0)))
762:     +
763:     N_b*((useF0 ? 2 : 0) + (useF1 ? 2*dim*(dim + 1) : 0)));
764:   PetscFEOpenCLLogResidual(fem, (ns_end - ns_start)*1.0e-9, numFlops);
765:   /* Cleanup */
766:   clReleaseMemObject(o_coefficients);
767:   clReleaseMemObject(o_coefficientsAux);
768:   clReleaseMemObject(o_jacobianInverses);
769:   clReleaseMemObject(o_jacobianDeterminants);
770:   clReleaseMemObject(o_elemVec);
771:   clReleaseKernel(ocl_kernel);
772:   clReleaseProgram(ocl_prog);
773:   return(0);
774: }

776: PETSC_INTERN PetscErrorCode PetscFESetUp_Basic(PetscFE);
777: PETSC_INTERN PetscErrorCode PetscFECreateTabulation_Basic(PetscFE, PetscInt, const PetscReal [], PetscInt, PetscTabulation);

779: static PetscErrorCode PetscFEInitialize_OpenCL(PetscFE fem)
780: {
782:   fem->ops->setfromoptions          = NULL;
783:   fem->ops->setup                   = PetscFESetUp_Basic;
784:   fem->ops->view                    = NULL;
785:   fem->ops->destroy                 = PetscFEDestroy_OpenCL;
786:   fem->ops->getdimension            = PetscFEGetDimension_Basic;
787:   fem->ops->createtabulation        = PetscFECreateTabulation_Basic;
788:   fem->ops->integrateresidual       = PetscFEIntegrateResidual_OpenCL;
789:   fem->ops->integratebdresidual     = NULL/* PetscFEIntegrateBdResidual_OpenCL */;
790:   fem->ops->integratejacobianaction = NULL/* PetscFEIntegrateJacobianAction_OpenCL */;
791:   fem->ops->integratejacobian       = PetscFEIntegrateJacobian_Basic;
792:   return(0);
793: }

795: /*MC
796:   PETSCFEOPENCL = "opencl" - A PetscFE object that integrates using a vectorized OpenCL implementation

798:   Level: intermediate

800: .seealso: PetscFEType, PetscFECreate(), PetscFESetType()
801: M*/

803: PETSC_EXTERN PetscErrorCode PetscFECreate_OpenCL(PetscFE fem)
804: {
805:   PetscFE_OpenCL *ocl;
806:   cl_uint         num_platforms;
807:   cl_platform_id  platform_ids[42];
808:   cl_uint         num_devices;
809:   cl_device_id    device_ids[42];
810:   cl_int          err;
811:   PetscErrorCode  ierr;

815:   PetscNewLog(fem,&ocl);
816:   fem->data = ocl;

818:   /* Init Platform */
819:   clGetPlatformIDs(42, platform_ids, &num_platforms);
820:   if (!num_platforms) SETERRQ(PetscObjectComm((PetscObject) fem), PETSC_ERR_SUP, "No OpenCL platform found.");
821:   ocl->pf_id = platform_ids[0];
822:   /* Init Device */
823:   clGetDeviceIDs(ocl->pf_id, CL_DEVICE_TYPE_ALL, 42, device_ids, &num_devices);
824:   if (!num_devices) SETERRQ(PetscObjectComm((PetscObject) fem), PETSC_ERR_SUP, "No OpenCL device found.");
825:   ocl->dev_id = device_ids[0];
826:   /* Create context with one command queue */
827:   ocl->ctx_id   = clCreateContext(0, 1, &(ocl->dev_id), NULL, NULL, &err);CHKERRQ(err);
828:   ocl->queue_id = clCreateCommandQueue(ocl->ctx_id, ocl->dev_id, CL_QUEUE_PROFILING_ENABLE, &err);CHKERRQ(err);
829:   /* Types */
830:   ocl->realType = PETSC_FLOAT;
831:   /* Register events */
832:   PetscLogEventRegister("OpenCL FEResidual", PETSCFE_CLASSID, &ocl->residualEvent);
833:   /* Equation handling */
834:   ocl->op = LAPLACIAN;

836:   PetscFEInitialize_OpenCL(fem);
837:   return(0);
838: }

840: /*@
841:   PetscFEOpenCLSetRealType - Set the scalar type for running on the accelerator

843:   Input Parameters:
844: + fem      - The PetscFE
845: - realType - The scalar type

847:   Level: developer

849: .seealso: PetscFEOpenCLGetRealType()
850: @*/
851: PetscErrorCode PetscFEOpenCLSetRealType(PetscFE fem, PetscDataType realType)
852: {
853:   PetscFE_OpenCL *ocl = (PetscFE_OpenCL *) fem->data;

857:   ocl->realType = realType;
858:   return(0);
859: }

861: /*@
862:   PetscFEOpenCLGetRealType - Get the scalar type for running on the accelerator

864:   Input Parameter:
865: . fem      - The PetscFE

867:   Output Parameter:
868: . realType - The scalar type

870:   Level: developer

872: .seealso: PetscFEOpenCLSetRealType()
873: @*/
874: PetscErrorCode PetscFEOpenCLGetRealType(PetscFE fem, PetscDataType *realType)
875: {
876:   PetscFE_OpenCL *ocl = (PetscFE_OpenCL *) fem->data;

881:   *realType = ocl->realType;
882:   return(0);
883: }

885: #endif /* PETSC_HAVE_OPENCL */