Actual source code: sellcuda.cu

  1: #include <petscdevice_cuda.h>
  2: #include <petsc/private/cupmatomics.hpp>
  3: #include <../src/mat/impls/sell/seq/sell.h>

  5: #define SLICE_HEIGHT 16

  7: typedef struct {
  8:   PetscInt   maxallocmat;
  9:   PetscInt   totalentries;
 10:   PetscInt  *colidx; /* column index array, device pointer */
 11:   MatScalar *val;    /* value array, device pointer */
 12:   PetscInt   totalslices;
 13:   PetscInt  *sliidx; /* slice index array, device pointer */
 14:   PetscInt   nonzerostate;
 15:   PetscInt   kernelchoice;
 16:   PetscInt   blocky;
 17:   PetscInt   chunksperblock;
 18:   PetscInt   totalchunks;
 19:   PetscInt  *chunk_slice_map; /* starting slice for each chunk, device pointer */
 20: } Mat_SeqSELLCUDA;
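
/*
   Layout sketch (illustration only): colidx/val/sliidx mirror the host-side sliced
   ELLPACK storage of MATSEQSELL.  Rows are grouped into slices of `sliceheight`
   consecutive rows, each slice is padded to the width of its longest row, and the
   slice is stored column by column, so consecutive entries of one row sit
   `sliceheight` apart in memory (exactly the stride the kernels below use).
   For sliceheight = 2 and the 3x3 matrix

       [a 0 b]      slice 0 (rows 0-1, width 2):               val = a c b d   colidx = 0 0 2 1
       [c d 0]      slice 1 (row 2 plus one padding row, width 1): val = e *   colidx = 1 *
       [0 e 0]      sliidx = {0, 4, 6}   (* = padding; its contents are whatever the host code wrote)

   sliidx[s] is the offset of slice s in colidx/val, and sliidx[totalslices] is the
   total number of stored entries (totalentries).
*/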

 22: static PetscErrorCode MatSeqSELLCUDA_Destroy(Mat_SeqSELLCUDA **cudastruct)
 23: {
 24:   PetscFunctionBegin;
 25:   if (*cudastruct) {
 26:     if ((*cudastruct)->colidx) PetscCallCUDA(cudaFree((*cudastruct)->colidx));
 27:     if ((*cudastruct)->val) PetscCallCUDA(cudaFree((*cudastruct)->val));
 28:     if ((*cudastruct)->sliidx) PetscCallCUDA(cudaFree((*cudastruct)->sliidx));
 29:     if ((*cudastruct)->chunk_slice_map) PetscCallCUDA(cudaFree((*cudastruct)->chunk_slice_map));
 30:     PetscCall(PetscFree(*cudastruct));
 31:   }
 32:   PetscFunctionReturn(PETSC_SUCCESS);
 33: }

 35: static PetscErrorCode MatSeqSELLCUDACopyToGPU(Mat A)
 36: {
 37:   Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
 38:   Mat_SeqSELL     *a          = (Mat_SeqSELL *)A->data;

 40:   PetscFunctionBegin;
 41:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
 42:     PetscCall(PetscLogEventBegin(MAT_CUDACopyToGPU, A, 0, 0, 0));
 43:     if (A->assembled && A->nonzerostate == cudastruct->nonzerostate) {
 44:       /* copy values only */
 45:       PetscCallCUDA(cudaMemcpy(cudastruct->val, a->val, a->sliidx[a->totalslices] * sizeof(MatScalar), cudaMemcpyHostToDevice));
 46:       PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar))));
 47:     } else {
 48:       if (cudastruct->colidx) PetscCallCUDA(cudaFree(cudastruct->colidx));
 49:       if (cudastruct->val) PetscCallCUDA(cudaFree(cudastruct->val));
 50:       if (cudastruct->sliidx) PetscCallCUDA(cudaFree(cudastruct->sliidx));
 51:       if (cudastruct->chunk_slice_map) PetscCallCUDA(cudaFree(cudastruct->chunk_slice_map));
 52:       cudastruct->maxallocmat  = a->maxallocmat;
 53:       cudastruct->totalentries = a->sliidx[a->totalslices];
 54:       cudastruct->totalslices  = a->totalslices;
 55:       cudastruct->totalchunks  = a->totalchunks;
           cudastruct->nonzerostate = A->nonzerostate;
 56:       PetscCallCUDA(cudaMalloc((void **)&cudastruct->colidx, a->maxallocmat * sizeof(*cudastruct->colidx)));
 57:       PetscCallCUDA(cudaMalloc((void **)&cudastruct->val, a->maxallocmat * sizeof(*cudastruct->val)));
 58:       /* copy only the used entries (a->sliidx[a->totalslices] of them), not the full maxallocmat allocation */
 59:       PetscCallCUDA(cudaMemcpy(cudastruct->colidx, a->colidx, a->sliidx[a->totalslices] * sizeof(*a->colidx), cudaMemcpyHostToDevice));
 60:       PetscCallCUDA(cudaMemcpy(cudastruct->val, a->val, a->sliidx[a->totalslices] * sizeof(*a->val), cudaMemcpyHostToDevice));

 62:       PetscCallCUDA(cudaMalloc((void **)&cudastruct->sliidx, (a->totalslices + 1) * sizeof(*cudastruct->sliidx)));
 63:       PetscCallCUDA(cudaMemcpy(cudastruct->sliidx, a->sliidx, (a->totalslices + 1) * sizeof(*a->sliidx), cudaMemcpyHostToDevice));
 64:       PetscCallCUDA(cudaMalloc((void **)&cudastruct->chunk_slice_map, a->totalchunks * sizeof(*cudastruct->chunk_slice_map)));
 65:       PetscCallCUDA(cudaMemcpy(cudastruct->chunk_slice_map, a->chunk_slice_map, a->totalchunks * sizeof(*a->chunk_slice_map), cudaMemcpyHostToDevice));
 66:       PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar) + sizeof(PetscInt)) + (a->totalslices + 1 + a->totalchunks) * sizeof(PetscInt)));
 67:     }
 68:     PetscCallCUDA(WaitForCUDA());
 69:     PetscCall(PetscLogEventEnd(MAT_CUDACopyToGPU, A, 0, 0, 0));
 70:     A->offloadmask = PETSC_OFFLOAD_BOTH;
 71:   }
 72:   PetscFunctionReturn(PETSC_SUCCESS);
 73: }
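
/*
   Note on the copy logic above: transfers are lazy and keyed on A->offloadmask, so
   nothing is moved while the mask is PETSC_OFFLOAD_GPU or PETSC_OFFLOAD_BOTH.  If only
   the numerical values have changed since the last transfer (A->nonzerostate still
   matches cudastruct->nonzerostate), a single cudaMemcpy of the value array is issued;
   otherwise the device arrays are reallocated and colidx, val, sliidx and
   chunk_slice_map are all copied.  On completion the mask becomes PETSC_OFFLOAD_BOTH.
*/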

 75: static __global__ void matmult_seqsell_basic_kernel(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
 76: {
 77:   PetscInt  i, row, slice_id, row_in_slice;
 78:   MatScalar sum;
 79:   /* one thread per row. */
 80:   row = blockIdx.x * blockDim.x + threadIdx.x;
 81:   if (row < nrows) {
 82:     slice_id     = row / sliceheight;
 83:     row_in_slice = row % sliceheight;
 84:     sum          = 0.0;
 85:     for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += sliceheight) sum += aval[i] * x[acolidx[i]];
 86:     y[row] = sum;
 87:   }
 88: }
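
/*
   Worked example (illustration only): with sliceheight = 16, the thread owning row 37
   computes slice_id = 2 and row_in_slice = 5, and accumulates the products stored at
   sliidx[2]+5, sliidx[2]+21, sliidx[2]+37, ... up to (but excluding) sliidx[3], i.e. it
   walks its own row through the column-major slice with stride 16.
*/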

 90: static __global__ void matmultadd_seqsell_basic_kernel(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
 91: {
 92:   PetscInt  i, row, slice_id, row_in_slice;
 93:   MatScalar sum;
 94:   /* one thread per row. */
 95:   row = blockIdx.x * blockDim.x + threadIdx.x;
 96:   if (row < nrows) {
 97:     slice_id     = row / sliceheight;
 98:     row_in_slice = row % sliceheight;
 99:     sum          = 0.0;
100:     for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += sliceheight) sum += aval[i] * x[acolidx[i]];
101:     z[row] = y[row] + sum;
102:   }
103: }

105: #if !defined(PETSC_USE_COMPLEX)
106: /* use 1 block per slice, suitable for large slice width */
107: template <int BLOCKY>
108: __global__ void matmult_seqsell_tiled_kernel9(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
109: {
110:   __shared__ MatScalar shared[32][BLOCKY];
111:   PetscInt             i, row, slice_id = blockIdx.x;
112:   int                  tid = threadIdx.x + threadIdx.y * 32;
113:   /* transposed index */
114:   int         tidx = tid % BLOCKY;
115:   int         tidy = tid / BLOCKY;
116:   PetscScalar t    = 0.0;

118:   row = slice_id * sliceheight + threadIdx.x % sliceheight;
119:   if (row < nrows) {
120:     for (i = sliidx[slice_id] + threadIdx.x + 32 * threadIdx.y; i < sliidx[slice_id + 1]; i += 32 * BLOCKY) t += aval[i] * x[acolidx[i]];
121:   }
122:   #pragma unroll
123:   for (int offset = 16; offset >= sliceheight; offset /= 2) t += __shfl_down_sync(0xffffffff, t, offset);
124:   /* transpose layout to reduce each row using warp shfl */
125:   if (threadIdx.x < sliceheight) shared[threadIdx.x][threadIdx.y] = t;
126:   __syncthreads();
127:   if (tidy < sliceheight) t = shared[tidy][tidx];
128:   #pragma unroll
129:   for (int offset = BLOCKY / 2; offset > 0; offset /= 2) t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY);
130:   if (tidx == 0 && tidy < sliceheight) shared[0][tidy] = t;
131:   __syncthreads();
132:   if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) y[row] = shared[0][threadIdx.x];
133: }
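
/*
   Reduction sketch for the kernel above (illustration only; sliceheight = 16 and
   BLOCKY = 8 chosen for concreteness, i.e. a 32x8 thread block): the whole block strides
   through the slice, so after the loop every thread holds a partial sum.  The first
   __shfl_down_sync pass (offset 16) folds each warp's 32 partials to 16, one per row of
   the slice, in lanes 0..15.  These 16x8 partials are staged in `shared`, read back
   transposed so that the 8 partials belonging to one row land in consecutive lanes of a
   warp, and folded with width-8 shuffles; the thread with tidx == 0 in each group then
   holds the row total, which warp 0 finally writes to y.
*/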

135: /* use 1 block per slice, suitable for large slice width */
136: template <int BLOCKY>
137: __global__ void matmultadd_seqsell_tiled_kernel9(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
138: {
139:   __shared__ MatScalar shared[32][BLOCKY];
140:   PetscInt             i, row, slice_id = blockIdx.x;
141:   int                  tid = threadIdx.x + threadIdx.y * 32;
142:   /* transposed index */
143:   int         tidx = tid % BLOCKY;
144:   int         tidy = tid / BLOCKY;
145:   PetscScalar t    = 0.0;

147:   row = slice_id * sliceheight + threadIdx.x % sliceheight;
148:   if (row < nrows) {
149:     for (i = sliidx[slice_id] + threadIdx.x + 32 * threadIdx.y; i < sliidx[slice_id + 1]; i += 32 * BLOCKY) t += aval[i] * x[acolidx[i]];
150:   }
151:   #pragma unroll
152:   for (int offset = 16; offset >= sliceheight; offset /= 2) t += __shfl_down_sync(0xffffffff, t, offset);
153:   /* transpose layout to reduce each row using warp shfl */
154:   if (threadIdx.x < sliceheight) shared[threadIdx.x][threadIdx.y] = t;
155:   __syncthreads();
156:   if (tidy < sliceheight) t = shared[tidy][tidx];
157:   #pragma unroll
158:   for (int offset = BLOCKY / 2; offset > 0; offset /= 2) t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY);
159:   if (tidx == 0 && tidy < sliceheight) shared[0][tidy] = t;
160:   __syncthreads();
161:   if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) z[row] = y[row] + shared[0][threadIdx.x];
162: }

164: template <int BLOCKY>
165: __device__ __forceinline__ static bool segment_scan(PetscInt flag[], MatScalar shared[], PetscScalar *val)
166: {
167:   bool head = true;
168:   #pragma unroll
169:   for (int i = 1; i < BLOCKY * 2; i <<= 1) {
170:     int halfwarpid                         = threadIdx.y * 2 + threadIdx.x / 16;
171:     shared[threadIdx.x + threadIdx.y * 32] = 0;
172:     if (halfwarpid >= i && flag[halfwarpid - i] == flag[halfwarpid]) {
173:       shared[threadIdx.x + threadIdx.y * 32] = *val;
174:       if (i == 1) head = false;
175:     }
176:     __syncthreads();
177:     if (halfwarpid < BLOCKY * 2 - i) *val += shared[threadIdx.x + threadIdx.y * 32 + i * 16];
178:     __syncthreads();
179:   }
180:   return head;
181: }
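
/*
   segment_scan in a picture (illustration only): the block is viewed as BLOCKY*2
   half-warps and flag[h] records which slice half-warp h is working on.  With BLOCKY = 2
   and flag = {3, 3, 4, 4}, the doubling passes fold the per-thread partials of
   half-warps 1 and 3 into half-warps 0 and 2 (data is only exchanged between half-warps
   with equal flags), so the first half-warp of each run carries the run's totals and
   returns head = true; the callers let only those threads issue the atomic update of the
   output vector.
*/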

183: /* load-balancing version. Chunksize is equal to the number of threads per block */
184: template <int BLOCKY>
185: __global__ void matmult_seqsell_tiled_kernel8(PetscInt nrows, PetscInt sliceheight, PetscInt chunksperblock, PetscInt totalchunks, const PetscInt *chunk_slice_map, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
186: {
187:   __shared__ MatScalar shared[BLOCKY * 32];
188:   PetscInt             gid, row, start_slice, cid;
189:   PetscScalar          t = 0.0;
190:   AtomicAdd<MatScalar> atomAdd;
191:   /* zero out y */
192:   for (int iter = 0; iter < 1 + (nrows - 1) / (gridDim.x * 32 * BLOCKY); iter++) {
193:     gid = gridDim.x * 32 * BLOCKY * iter + blockIdx.x * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x;
194:     if (gid < nrows) y[gid] = 0.0;
195:   }
196:   for (int iter = 0; iter < chunksperblock; iter++) {
197:     cid = blockIdx.x * chunksperblock + iter; /* chunk id */
198:     if (cid < totalchunks) {
199:       start_slice = chunk_slice_map[cid]; /* starting slice at each iteration */
200:       gid         = cid * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x;
201:       if ((cid + 1) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* this iteration covers more than one slice */
202:         __shared__ PetscInt flag[BLOCKY * 2];
203:         bool                write;
204:         PetscInt            slice_id = start_slice, totalslices = PetscCeilIntMacro(nrows, sliceheight), totalentries = sliidx[totalslices];
205:         /* find out the slice that this element belongs to */
206:         while (gid < totalentries && gid >= sliidx[slice_id + 1]) slice_id++;
207:         if (threadIdx.x % 16 == 0) flag[threadIdx.y * 2 + threadIdx.x / 16] = slice_id;
208:         row = slice_id * sliceheight + threadIdx.x % sliceheight;
209:         if (row < nrows && gid < totalentries) t = aval[gid] * x[acolidx[gid]];
210:         __syncthreads();
211:         write = segment_scan<BLOCKY>(flag, shared, &t);
212:         if (row < nrows && gid < totalentries && write) atomAdd(y[row], t);
213:         t = 0.0;
214:       } else { /* this iteration covers only one slice */
215:         row = start_slice * sliceheight + threadIdx.x % sliceheight;
216:         if (row < nrows) t += aval[gid] * x[acolidx[gid]];
217:         if (iter == chunksperblock - 1 || (cid + 2) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* last iteration or next iteration covers more than one slice */
218:           int tid = threadIdx.x + threadIdx.y * 32, tidx = tid % BLOCKY, tidy = tid / BLOCKY;
219:   /* reduction and write to output vector */
220:   #pragma unroll
221:           for (int offset = 16; offset >= sliceheight; offset /= 2) t += __shfl_down_sync(0xffffffff, t, offset);
222:           /* transpose layout to reduce each row using warp shfl */
223:           if (threadIdx.x < sliceheight) shared[threadIdx.x * BLOCKY + threadIdx.y] = t; /* shared[threadIdx.x][threadIdx.y] = t */
224:           __syncthreads();
225:           if (tidy < sliceheight) t = shared[tidy * BLOCKY + tidx]; /* shared[tidy][tidx] */
226:   #pragma unroll
227:           for (int offset = BLOCKY / 2; offset > 0; offset /= 2) t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY);
228:           if (tidx == 0 && tidy < sliceheight) shared[tidy] = t; /* shared[0][tidy] = t */
229:           __syncthreads();
230:           if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) atomAdd(y[row], shared[threadIdx.x]); /* shared[0][threadIdx.x] */
231:           t = 0.0;
232:         }
233:       }
234:     }
235:   }
236: }
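
/*
   Chunking sketch for the load-balanced kernel above (illustration only): the nonzero
   arrays are cut into chunks of 32*BLOCKY entries, each block processes chunksperblock
   consecutive chunks, and chunk_slice_map[cid] names the first slice chunk cid touches.
   A chunk that stays inside one slice is reduced with the same shuffle/transpose scheme
   as kernel 9 and flushed with an atomicAdd when the slice is about to end (or at the
   block's last chunk); a chunk that straddles slice boundaries falls back to
   segment_scan so that each slice fragment is summed separately.  y is zeroed up front
   because all contributions arrive via atomic adds.  Since every block handles the same
   number of entries, a few very wide slices no longer leave most of the GPU idle.
*/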

238: /* load-balancing version. Chunksize is equal to the number of threads per block */
239: template <int BLOCKY>
240: __global__ void matmultadd_seqsell_tiled_kernel8(PetscInt nrows, PetscInt sliceheight, PetscInt chunksperblock, PetscInt totalchunks, const PetscInt *chunk_slice_map, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
241: {
242:   __shared__ MatScalar shared[BLOCKY * 32];
243:   PetscInt             gid, row, start_slice, cid;
244:   PetscScalar          t = 0.0;
245:   AtomicAdd<MatScalar> atomAdd;
246:   /* copy y to z */
247:   for (int iter = 0; iter < 1 + (nrows - 1) / (gridDim.x * 32 * BLOCKY); iter++) {
248:     gid = gridDim.x * 32 * BLOCKY * iter + blockIdx.x * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x;
249:     if (gid < nrows) z[gid] = y[gid];
250:   }
251:   for (int iter = 0; iter < chunksperblock; iter++) {
252:     cid = blockIdx.x * chunksperblock + iter; /* chunk id */
253:     if (cid < totalchunks) {
254:       start_slice = chunk_slice_map[cid]; /* starting slice at each iteration */
255:       gid         = cid * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x;
256:       if ((cid + 1) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* this iteration covers more than one slice */
257:         __shared__ PetscInt flag[BLOCKY * 2];
258:         bool                write;
259:         PetscInt            slice_id = start_slice, totalslices = PetscCeilIntMacro(nrows, sliceheight), totalentries = sliidx[totalslices];
260:         /* find out the slice that this element belongs to */
261:         while (gid < totalentries && gid >= sliidx[slice_id + 1]) slice_id++;
262:         if (threadIdx.x % 16 == 0) flag[threadIdx.y * 2 + threadIdx.x / 16] = slice_id;
263:         row = slice_id * sliceheight + threadIdx.x % sliceheight;
264:         if (row < nrows && gid < totalentries) t = aval[gid] * x[acolidx[gid]];
265:         __syncthreads();
266:         write = segment_scan<BLOCKY>(flag, shared, &t);
267:         if (row < nrows && gid < totalentries && write) atomAdd(z[row], t);
268:         t = 0.0;
269:       } else { /* this iteration covers only one slice */
270:         row = start_slice * sliceheight + threadIdx.x % sliceheight;
271:         if (row < nrows) t += aval[gid] * x[acolidx[gid]];
272:         if (iter == chunksperblock - 1 || (cid + 2) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* last iteration or next iteration covers more than one slice */
273:           int tid = threadIdx.x + threadIdx.y * 32, tidx = tid % BLOCKY, tidy = tid / BLOCKY;
274:   /* reduction and write to output vector */
275:   #pragma unroll
276:           for (int offset = 16; offset >= sliceheight; offset /= 2) t += __shfl_down_sync(0xffffffff, t, offset);
277:           /* transpose layout to reduce each row using warp shfl */
278:           if (threadIdx.x < sliceheight) shared[threadIdx.x * BLOCKY + threadIdx.y] = t; /* shared[threadIdx.x][threadIdx.y] = t */
279:           __syncthreads();
280:           if (tidy < sliceheight) t = shared[tidy * BLOCKY + tidx]; /* shared[tidy][tidx] */
281:   #pragma unroll
282:           for (int offset = BLOCKY / 2; offset > 0; offset /= 2) t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY);
283:           if (tidx == 0 && tidy < sliceheight) shared[tidy] = t; /* shared[0][tidy] = t */
284:           __syncthreads();
285:           if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) atomAdd(z[row], shared[threadIdx.x]); /* shared[0][threadIdx.x] */
286:           t = 0.0;
287:         }
288:       }
289:     }
290:   }
291: }

293: /* use 1 warp per slice, suitable for small slice width */
294: static __global__ void matmult_seqsell_tiled_kernel7(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
295: {
296:   PetscInt i, row, slice_id;
297:   slice_id = blockIdx.x * blockDim.y + threadIdx.y;
298:   row      = slice_id * sliceheight + threadIdx.x % sliceheight;
299:   PetscScalar t = 0.0;
300:   if (row < nrows) {
301:     for (i = sliidx[slice_id] + threadIdx.x; i < sliidx[slice_id + 1]; i += 32) t += aval[i] * x[acolidx[i]];
302:   }
303:   #pragma unroll
304:   for (int offset = 16; offset >= sliceheight; offset /= 2) t += __shfl_down_sync(0xffffffff, t, offset);
305:   if (row < nrows && threadIdx.x < sliceheight) y[row] = t;
306: }

308: /* use 1 warp per slice, suitable for small slice width */
309: static __global__ void matmultadd_seqsell_tiled_kernel7(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
310: {
311:   PetscInt i, row, slice_id;
312:   slice_id = blockIdx.x * blockDim.y + threadIdx.y;
313:   row      = slice_id * sliceheight + threadIdx.x % sliceheight;
314:   PetscScalar t = 0.0;
315:   if (row < nrows) {
316:     for (i = sliidx[slice_id] + threadIdx.x; i < sliidx[slice_id + 1]; i += 32) t += aval[i] * x[acolidx[i]];
317:   }
318:   #pragma unroll
319:   for (int offset = 16; offset >= sliceheight; offset /= 2) t += __shfl_down_sync(0xffffffff, t, offset);
320:   if (row < nrows && threadIdx.x < sliceheight) z[row] = y[row] + t;
321: }
322: #endif

324: /***********  Kernels 2-6 are tied to slice height 16. They are kept only for performance comparison  **********/

326: static __global__ void matmult_seqsell_tiled_kernel6(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
327: {
328:   __shared__ MatScalar shared[512];
329:   PetscInt             i, row, slice_id, row_in_slice;
330:   /* multiple threads per row. */
331:   row = blockIdx.x * blockDim.x + threadIdx.x;
332:   if (row < nrows) {
333:     slice_id     = row / SLICE_HEIGHT;
334:     row_in_slice = row % SLICE_HEIGHT;

336:     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
337:     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
338:     __syncthreads();
339:     if (threadIdx.y < 16) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x];
340:     __syncthreads();
341:     if (threadIdx.y < 8) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x];
342:     __syncthreads();
343:     if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x];
344:     __syncthreads();
345:     if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x];
346:     __syncthreads();
347:     if (threadIdx.y < 1) {
348:       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
349:       y[row] = shared[threadIdx.x];
350:     }
351:   }
352: }

354: static __global__ void matmult_seqsell_tiled_kernel5(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
355: {
356:   __shared__ MatScalar shared[512];
357:   PetscInt             i, row, slice_id, row_in_slice;
358:   /* multiple threads per row. */
359:   row = blockIdx.x * blockDim.x + threadIdx.x;
360:   if (row < nrows) {
361:     slice_id     = row / SLICE_HEIGHT;
362:     row_in_slice = row % SLICE_HEIGHT;

364:     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
365:     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
366:     __syncthreads();
367:     if (threadIdx.y < 8) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x];
368:     __syncthreads();
369:     if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x];
370:     __syncthreads();
371:     if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x];
372:     __syncthreads();
373:     if (threadIdx.y < 1) {
374:       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
375:       y[row] = shared[threadIdx.x];
376:     }
377:   }
378: }

380: static __global__ void matmult_seqsell_tiled_kernel4(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
381: {
382:   __shared__ MatScalar shared[512];
383:   PetscInt             i, row, slice_id, row_in_slice;
384:   /* multiple threads per row. */
385:   row = blockIdx.x * blockDim.x + threadIdx.x;
386:   if (row < nrows) {
387:     slice_id     = row / SLICE_HEIGHT;
388:     row_in_slice = row % SLICE_HEIGHT;

390:     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
391:     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
392:     __syncthreads();
393:     if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x];
394:     __syncthreads();
395:     if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x];
396:     __syncthreads();
397:     if (threadIdx.y < 1) {
398:       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
399:       y[row] = shared[threadIdx.x];
400:     }
401:   }
402: }

404: static __global__ void matmult_seqsell_tiled_kernel3(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
405: {
406:   __shared__ MatScalar shared[512];
407:   PetscInt             i, row, slice_id, row_in_slice;
408:   /* multiple threads per row. */
409:   row = blockIdx.x * blockDim.x + threadIdx.x;
410:   if (row < nrows) {
411:     slice_id     = row / SLICE_HEIGHT;
412:     row_in_slice = row % SLICE_HEIGHT;

414:     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
415:     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
416:     __syncthreads();
417:     if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x];
418:     __syncthreads();
419:     if (threadIdx.y < 1) {
420:       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
421:       y[row] = shared[threadIdx.x];
422:     }
423:   }
424: }

426: static __global__ void matmult_seqsell_tiled_kernel2(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
427: {
428:   __shared__ MatScalar shared[512];
429:   PetscInt             i, row, slice_id, row_in_slice;
430:   /* multiple threads per row. */
431:   row = blockIdx.x * blockDim.x + threadIdx.x;
432:   if (row < nrows) {
433:     slice_id     = row / SLICE_HEIGHT;
434:     row_in_slice = row % SLICE_HEIGHT;

436:     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
437:     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
438:     __syncthreads();
439:     if (threadIdx.y < 1) {
440:       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
441:       y[row] = shared[threadIdx.x];
442:     }
443:   }
444: }

446: static __global__ void matmultadd_seqsell_tiled_kernel6(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
447: {
448:   __shared__ MatScalar shared[512];
449:   PetscInt             i, row, slice_id, row_in_slice;
450:   /* multiple threads per row. */
451:   row = blockIdx.x * blockDim.x + threadIdx.x;
452:   if (row < nrows) {
453:     slice_id     = row / SLICE_HEIGHT;
454:     row_in_slice = row % SLICE_HEIGHT;

456:     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
457:     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
458:     __syncthreads();
459:     if (threadIdx.y < 16) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x];
460:     __syncthreads();
461:     if (threadIdx.y < 8) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x];
462:     __syncthreads();
463:     if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x];
464:     __syncthreads();
465:     if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x];
466:     __syncthreads();
467:     if (threadIdx.y < 1) {
468:       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
469:       z[row] = y[row] + shared[threadIdx.x];
470:     }
471:   }
472: }

474: static __global__ void matmultadd_seqsell_tiled_kernel5(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
475: {
476:   __shared__ MatScalar shared[512];
477:   PetscInt             i, row, slice_id, row_in_slice;
478:   /* multiple threads per row. */
479:   row = blockIdx.x * blockDim.x + threadIdx.x;
480:   if (row < nrows) {
481:     slice_id     = row / SLICE_HEIGHT;
482:     row_in_slice = row % SLICE_HEIGHT;

484:     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
485:     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
486:     __syncthreads();
487:     if (threadIdx.y < 8) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x];
488:     __syncthreads();
489:     if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x];
490:     __syncthreads();
491:     if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x];
492:     __syncthreads();
493:     if (threadIdx.y < 1) {
494:       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
495:       z[row] = y[row] + shared[threadIdx.x];
496:     }
497:   }
498: }

500: static __global__ void matmultadd_seqsell_tiled_kernel4(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
501: {
502:   __shared__ MatScalar shared[512];
503:   PetscInt             i, row, slice_id, row_in_slice;
504:   /* multiple threads per row. */
505:   row = blockIdx.x * blockDim.x + threadIdx.x;
506:   if (row < nrows) {
507:     slice_id     = row / SLICE_HEIGHT;
508:     row_in_slice = row % SLICE_HEIGHT;

510:     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
511:     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
512:     __syncthreads();
513:     if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x];
514:     __syncthreads();
515:     if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x];
516:     __syncthreads();
517:     if (threadIdx.y < 1) {
518:       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
519:       z[row] = y[row] + shared[threadIdx.x];
520:     }
521:   }
522: }

524: static __global__ void matmultadd_seqsell_tiled_kernel3(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
525: {
526:   __shared__ MatScalar shared[512];
527:   PetscInt             i, row, slice_id, row_in_slice;
528:   /* multiple threads per row. */
529:   row = blockIdx.x * blockDim.x + threadIdx.x;
530:   if (row < nrows) {
531:     slice_id     = row / SLICE_HEIGHT;
532:     row_in_slice = row % SLICE_HEIGHT;

534:     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
535:     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
536:     __syncthreads();
537:     if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x];
538:     __syncthreads();
539:     if (threadIdx.y < 1) {
540:       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
541:       z[row] = y[row] + shared[threadIdx.x];
542:     }
543:   }
544: }

546: static __global__ void matmultadd_seqsell_tiled_kernel2(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
547: {
548:   __shared__ MatScalar shared[512];
549:   PetscInt             i, row, slice_id, row_in_slice;
550:   /* multiple threads per row. */
551:   row = blockIdx.x * blockDim.x + threadIdx.x;
552:   if (row < nrows) {
553:     slice_id     = row / SLICE_HEIGHT;
554:     row_in_slice = row % SLICE_HEIGHT;

556:     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
557:     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
558:     __syncthreads();
559:     if (threadIdx.y < 1) {
560:       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
561:       z[row] = y[row] + shared[threadIdx.x];
562:     }
563:   }
564: }

566: static PetscErrorCode MatMult_SeqSELLCUDA(Mat A, Vec xx, Vec yy)
567: {
568:   Mat_SeqSELL       *a          = (Mat_SeqSELL *)A->data;
569:   Mat_SeqSELLCUDA   *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
570:   PetscScalar       *y;
571:   const PetscScalar *x;
572:   PetscInt           nrows = A->rmap->n, sliceheight = a->sliceheight;
573:   MatScalar         *aval;
574:   PetscInt          *acolidx;
575:   PetscInt          *sliidx;
576:   PetscInt           nblocks, blocksize = 512; /* blocksize must be multiple of SLICE_HEIGHT*32 */
577:   dim3               block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32);
578: #if !defined(PETSC_USE_COMPLEX)
579:   PetscInt  chunksperblock, nchunks, *chunk_slice_map;
580:   PetscReal maxoveravg;
581: #endif

583:   PetscFunctionBegin;
584:   PetscCheck(32 % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires the slice height to be a divisor of 32, but the input matrix has a slice height of %" PetscInt_FMT, sliceheight);
585:   PetscCheck(!(cudastruct->kernelchoice >= 2 && cudastruct->kernelchoice <= 6 && sliceheight != SLICE_HEIGHT), PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Kernel choices {2-6} require the slice height of the matrix to be 16, but the current slice height is %" PetscInt_FMT, sliceheight);
586:   PetscCall(MatSeqSELLCUDACopyToGPU(A));
587:   /* the device arrays in cudastruct are not allocated until MatSeqSELLCUDACopyToGPU() has been called */
588:   aval    = cudastruct->val;
589:   acolidx = cudastruct->colidx;
590:   sliidx  = cudastruct->sliidx;

592:   PetscCall(VecCUDAGetArrayRead(xx, &x));
593:   PetscCall(VecCUDAGetArrayWrite(yy, &y));
594:   PetscCall(PetscLogGpuTimeBegin());

596:   switch (cudastruct->kernelchoice) {
597: #if !defined(PETSC_USE_COMPLEX)
598:   case 9:
599:     nblocks = 1 + (nrows - 1) / sliceheight;
600:     if (cudastruct->blocky == 2) {
601:       matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
602:     } else if (cudastruct->blocky == 4) {
603:       matmult_seqsell_tiled_kernel9<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
604:     } else if (cudastruct->blocky == 8) {
605:       matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
606:     } else if (cudastruct->blocky == 16) {
607:       matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
608:     } else if (cudastruct->blocky == 32) {
609:       matmult_seqsell_tiled_kernel9<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
610:     } else {
611:       matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
612:     }
613:     break;
614:   case 7:
615:     nblocks = 1 + (nrows - 1) / (2 * sliceheight);
616:     if (cudastruct->blocky == 2) {
617:       matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
618:     } else if (cudastruct->blocky == 4) {
619:       matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
620:     } else if (cudastruct->blocky == 8) {
621:       matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
622:     } else if (cudastruct->blocky == 16) {
623:       matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
624:     } else if (cudastruct->blocky == 32) {
625:       matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
626:     } else {
627:       matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
628:     }
629:     break;
630: #endif
631:   case 6:
632:     nblocks = 1 + (nrows - 1) / (blocksize / 32); /* 1 slice per block if blocksize=512 */
633:     matmult_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, acolidx, aval, sliidx, x, y);
634:     break;
635:   case 5:
636:     nblocks = 1 + (nrows - 1) / (blocksize / 16); /* 2 slices per block if blocksize=512*/
637:     matmult_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, acolidx, aval, sliidx, x, y);
638:     break;
639:   case 4:
640:     nblocks = 1 + (nrows - 1) / (blocksize / 8); /* 4 slices per block if blocksize=512 */
641:     matmult_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, acolidx, aval, sliidx, x, y);
642:     break;
643:   case 3:
644:     nblocks = 1 + (nrows - 1) / (blocksize / 4); /* 8 slices per block if blocksize=512 */
645:     matmult_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, acolidx, aval, sliidx, x, y);
646:     break;
647:   case 2: /* 16 slices per block if blocksize=512 */
648:     nblocks = 1 + (nrows - 1) / (blocksize / 2);
649:     matmult_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, acolidx, aval, sliidx, x, y);
650:     break;
651:   case 1: /* 32 slices per block if blocksize=512 */
652:     nblocks = 1 + (nrows - 1) / blocksize;
653:     matmult_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
654:     break;
655: #if !defined(PETSC_USE_COMPLEX)
656:   case 0:
657:     maxoveravg = a->maxslicewidth / a->avgslicewidth;
658:     if (maxoveravg > 12.0 && maxoveravg / nrows > 0.001) { /* empirical threshold: slice widths are very uneven, switch to the load-balanced kernel */
659:       /* each block handles approximately one slice */
660:       PetscInt blocky = a->chunksize / 32;
661:       nchunks         = cudastruct->totalchunks;
662:       chunksperblock  = cudastruct->chunksperblock ? cudastruct->chunksperblock : 1 + (cudastruct->totalentries / cudastruct->totalslices - 1) / a->chunksize;
663:       nblocks         = 1 + (nchunks - 1) / chunksperblock;
664:       chunk_slice_map = cudastruct->chunk_slice_map;
665:       if (blocky == 2) {
666:         matmult_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
667:       } else if (blocky == 4) {
668:         matmult_seqsell_tiled_kernel8<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
669:       } else if (blocky == 8) {
670:         matmult_seqsell_tiled_kernel8<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
671:       } else if (blocky == 16) {
672:         matmult_seqsell_tiled_kernel8<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
673:       } else if (blocky == 32) {
674:         matmult_seqsell_tiled_kernel8<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
675:       } else {
676:         matmult_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
677:       }
678:     } else {
679:       PetscInt avgslicesize = sliceheight * a->avgslicewidth;
680:       if (avgslicesize <= 432) {
681:         if (sliceheight * a->maxslicewidth < 2048 && nrows > 100000) {
682:           nblocks = 1 + (nrows - 1) / (2 * sliceheight); /* two slices per block */
683:           matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
684:         } else {
685:           nblocks = 1 + (nrows - 1) / sliceheight;
686:           matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
687:         }
688:       } else if (avgslicesize <= 2400) {
689:         nblocks = 1 + (nrows - 1) / sliceheight;
690:         matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
691:       } else {
692:         nblocks = 1 + (nrows - 1) / sliceheight;
693:         matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
694:       }
695:     }
696:     break;
697: #endif
698:   default:
699:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported kernel choice %" PetscInt_FMT " for MatMult_SeqSELLCUDA.", cudastruct->kernelchoice);
700:   }
701:   PetscCall(PetscLogGpuTimeEnd());
702:   PetscCall(VecCUDARestoreArrayRead(xx, &x));
703:   PetscCall(VecCUDARestoreArrayWrite(yy, &y));
704:   PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
705:   PetscFunctionReturn(PETSC_SUCCESS);
706: }

708: static PetscErrorCode MatMultAdd_SeqSELLCUDA(Mat A, Vec xx, Vec yy, Vec zz)
709: {
710:   Mat_SeqSELL       *a          = (Mat_SeqSELL *)A->data;
711:   Mat_SeqSELLCUDA   *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
712:   PetscScalar       *z;
713:   const PetscScalar *y, *x;
714:   PetscInt           nrows = A->rmap->n, sliceheight = a->sliceheight;
715:   MatScalar         *aval    = cudastruct->val;
716:   PetscInt          *acolidx = cudastruct->colidx;
717:   PetscInt          *sliidx  = cudastruct->sliidx;
718: #if !defined(PETSC_USE_COMPLEX)
719:   PetscReal maxoveravg;
720:   PetscInt  chunksperblock, nchunks, *chunk_slice_map;
721:   PetscInt  blocky = cudastruct->blocky;
722: #endif

724:   PetscFunctionBegin;
725:   PetscCheck(32 % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires the slice height to be a divisor of 32, but the input matrix has a slice height of %" PetscInt_FMT, sliceheight);
726:   PetscCheck(!(cudastruct->kernelchoice >= 2 && cudastruct->kernelchoice <= 6 && sliceheight != SLICE_HEIGHT), PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Kernel choices {2-6} require the slice height of the matrix to be 16, but the current slice height is %" PetscInt_FMT, sliceheight);
727:   PetscCall(MatSeqSELLCUDACopyToGPU(A));
728:   if (a->nz) {
729:     PetscInt nblocks, blocksize = 512;
730:     dim3     block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32);
731:     PetscCall(VecCUDAGetArrayRead(xx, &x));
732:     PetscCall(VecCUDAGetArrayRead(yy, &y));
733:     PetscCall(VecCUDAGetArrayWrite(zz, &z));
734:     PetscCall(PetscLogGpuTimeBegin());

736:     switch (cudastruct->kernelchoice) {
737: #if !defined(PETSC_USE_COMPLEX)
738:     case 9:
739:       nblocks = 1 + (nrows - 1) / sliceheight;
740:       if (blocky == 2) {
741:         matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
742:       } else if (blocky == 4) {
743:         matmultadd_seqsell_tiled_kernel9<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
744:       } else if (blocky == 8) {
745:         matmultadd_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
746:       } else if (blocky == 16) {
747:         matmultadd_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
748:       } else if (blocky == 32) {
749:         matmultadd_seqsell_tiled_kernel9<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
750:       } else {
751:         matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
752:       }
753:       break;
754:     case 8:
755:       /* each block handles approximately one slice */
756:       nchunks         = cudastruct->totalchunks;
757:       blocky          = a->chunksize / 32;
758:       chunksperblock  = cudastruct->chunksperblock ? cudastruct->chunksperblock : 1 + (cudastruct->totalentries / cudastruct->totalslices - 1) / a->chunksize;
759:       nblocks         = 1 + (nchunks - 1) / chunksperblock;
760:       chunk_slice_map = cudastruct->chunk_slice_map;
761:       if (blocky == 2) {
762:         matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
763:       } else if (blocky == 4) {
764:         matmultadd_seqsell_tiled_kernel8<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
765:       } else if (blocky == 8) {
766:         matmultadd_seqsell_tiled_kernel8<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
767:       } else if (blocky == 16) {
768:         matmultadd_seqsell_tiled_kernel8<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
769:       } else if (blocky == 32) {
770:         matmultadd_seqsell_tiled_kernel8<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
771:       } else {
772:         matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
773:       }
774:       break;
775:     case 7:
776:       nblocks = 1 + (nrows - 1) / (2 * sliceheight);
777:       if (blocky == 2) {
778:         matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
779:       } else if (blocky == 4) {
780:         matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
781:       } else if (blocky == 8) {
782:         matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
783:       } else if (blocky == 16) {
784:         matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
785:       } else if (blocky == 32) {
786:         matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
787:       } else {
788:         matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
789:       }
790:       break;
791: #endif
792:     case 6:
793:       nblocks = 1 + (nrows - 1) / (blocksize / 32);
794:       matmultadd_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, acolidx, aval, sliidx, x, y, z);
795:       break;
796:     case 5:
797:       nblocks = 1 + (nrows - 1) / (blocksize / 16);
798:       matmultadd_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, acolidx, aval, sliidx, x, y, z);
799:       break;
800:     case 4:
801:       nblocks = 1 + (nrows - 1) / (blocksize / 8);
802:       matmultadd_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, acolidx, aval, sliidx, x, y, z);
803:       break;
804:     case 3:
805:       nblocks = 1 + (nrows - 1) / (blocksize / 4);
806:       matmultadd_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, acolidx, aval, sliidx, x, y, z);
807:       break;
808:     case 2:
809:       nblocks = 1 + (nrows - 1) / (blocksize / 2);
810:       matmultadd_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, acolidx, aval, sliidx, x, y, z);
811:       break;
812:     case 1:
813:       nblocks = 1 + (nrows - 1) / blocksize;
814:       matmultadd_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
815:       break;
816: #if !defined(PETSC_USE_COMPLEX)
817:     case 0:
818:       maxoveravg = a->maxslicewidth / a->avgslicewidth;
819:       if (maxoveravg > 12.0 && maxoveravg / nrows > 0.001) { /* empirical threshold: slice widths are very uneven, switch to the load-balanced kernel */
820:         /* each block handles approximately one slice */
821:         nchunks         = cudastruct->totalchunks;
822:         blocky          = a->chunksize / 32;
823:         chunksperblock  = cudastruct->chunksperblock ? cudastruct->chunksperblock : 1 + (cudastruct->totalentries / cudastruct->totalslices - 1) / a->chunksize;
824:         nblocks         = 1 + (nchunks - 1) / chunksperblock;
825:         chunk_slice_map = cudastruct->chunk_slice_map;
826:         if (blocky == 2) {
827:           matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
828:         } else if (blocky == 4) {
829:           matmultadd_seqsell_tiled_kernel8<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
830:         } else if (blocky == 8) {
831:           matmultadd_seqsell_tiled_kernel8<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
832:         } else if (blocky == 16) {
833:           matmultadd_seqsell_tiled_kernel8<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
834:         } else if (blocky == 32) {
835:           matmultadd_seqsell_tiled_kernel8<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
836:         } else {
837:           matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
838:         }
839:       } else {
840:         PetscInt avgslicesize = sliceheight * a->avgslicewidth;
841:         if (avgslicesize <= 432) {
842:           if (sliceheight * a->maxslicewidth < 2048 && nrows > 100000) {
843:             nblocks = 1 + (nrows - 1) / (2 * sliceheight); /* two slices per block */
844:             matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
845:           } else {
846:             nblocks = 1 + (nrows - 1) / sliceheight;
847:             matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
848:           }
849:         } else if (avgslicesize <= 2400) {
850:           nblocks = 1 + (nrows - 1) / sliceheight;
851:           matmultadd_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
852:         } else {
853:           nblocks = 1 + (nrows - 1) / sliceheight;
854:           matmultadd_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
855:         }
856:       }
857:       break;
858: #endif
859:     default:
860:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported kernel choice %" PetscInt_FMT " for MatMultAdd_SeqSELLCUDA.", cudastruct->kernelchoice);
861:     }
862:     PetscCall(PetscLogGpuTimeEnd());
863:     PetscCall(VecCUDARestoreArrayRead(xx, &x));
864:     PetscCall(VecCUDARestoreArrayRead(yy, &y));
865:     PetscCall(VecCUDARestoreArrayWrite(zz, &z));
866:     PetscCall(PetscLogGpuFlops(2.0 * a->nz));
867:   } else {
868:     PetscCall(VecCopy(yy, zz));
869:   }
870:   PetscFunctionReturn(PETSC_SUCCESS);
871: }

873: static PetscErrorCode MatSetFromOptions_SeqSELLCUDA(Mat A, PetscOptionItems PetscOptionsObject)
874: {
875:   Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
876:   PetscInt         kernel, blocky;
877:   PetscBool        flg;

879:   PetscFunctionBegin;
880:   PetscOptionsHeadBegin(PetscOptionsObject, "SeqSELLCUDA options");
881:   PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_blocky", &blocky, &flg));
882:   if (flg) {
883:     PetscCheck(blocky == 2 || blocky == 4 || blocky == 8 || blocky == 16 || blocky == 32, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Unsupported blocky: %" PetscInt_FMT " it should be in {2,4,8,16,32}", blocky);
884:     cudastruct->blocky = blocky;
885:   }
886:   PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_kernel", &kernel, &flg));
887:   if (flg) {
888:     PetscCheck(kernel >= 0 && kernel <= 9, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Wrong kernel choice: %" PetscInt_FMT " it should be in [0,9]", kernel);
889:     cudastruct->kernelchoice = kernel;
890:     if (kernel == 8) PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_chunksperblock", &cudastruct->chunksperblock, &flg));
891:   }
892:   PetscOptionsHeadEnd();
893:   PetscFunctionReturn(PETSC_SUCCESS);
894: }
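
/*
   Example (illustration only; `./app` stands for any PETSc-based executable): the options
   read above are given on the command line, e.g.

     ./app -mat_type seqsellcuda -mat_sell_spmv_cuda_kernel 9 -mat_sell_spmv_cuda_blocky 8

   runs the one-block-per-slice kernel with 32x8 thread blocks.  blocky must be one of
   {2,4,8,16,32}, and -mat_sell_spmv_cuda_chunksperblock is consulted only when kernel 8
   (the load-balanced variant) is selected.  Kernel 0, the default, lets MatMult/MatMultAdd
   pick a kernel from the matrix's slice statistics (real scalars only).
*/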

896: PETSC_INTERN PetscErrorCode MatAssemblyEnd_SpMV_Preprocessing_Private(Mat A)
897: {
898:   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;

900:   PetscFunctionBegin;
901:   PetscCall(MatSeqSELLGetAvgSliceWidth(A, &a->avgslicewidth));
902:   PetscCall(MatSeqSELLGetMaxSliceWidth(A, &a->maxslicewidth));
903:   PetscCall(MatSeqSELLGetFillRatio(A, &a->fillratio));
904:   PetscFunctionReturn(PETSC_SUCCESS);
905: }

907: static PetscErrorCode MatAssemblyEnd_SeqSELLCUDA(Mat A, MatAssemblyType mode)
908: {
909:   PetscFunctionBegin;
910:   PetscCall(MatAssemblyEnd_SeqSELL(A, mode));
911:   PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(A));
912:   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
913:   if (A->factortype == MAT_FACTOR_NONE) PetscCall(MatSeqSELLCUDACopyToGPU(A));
914:   A->ops->mult    = MatMult_SeqSELLCUDA;
915:   A->ops->multadd = MatMultAdd_SeqSELLCUDA;
916:   PetscFunctionReturn(PETSC_SUCCESS);
917: }

919: static PetscErrorCode MatZeroEntries_SeqSELLCUDA(Mat A)
920: {
921:   PetscBool    both = PETSC_FALSE;
922:   Mat_SeqSELL *a    = (Mat_SeqSELL *)A->data;

924:   PetscFunctionBegin;
925:   if (A->factortype == MAT_FACTOR_NONE) {
926:     Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
927:     if (cudastruct->val) {
928:       both = PETSC_TRUE;
929:       PetscCallCUDA(cudaMemset(cudastruct->val, 0, a->sliidx[a->totalslices] * sizeof(*cudastruct->val)));
930:     }
931:   }
932:   PetscCall(PetscArrayzero(a->val, a->sliidx[a->totalslices]));
933:   PetscCall(MatSeqSELLInvalidateDiagonal(A));
934:   if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
935:   else A->offloadmask = PETSC_OFFLOAD_CPU;
936:   PetscFunctionReturn(PETSC_SUCCESS);
937: }

939: static PetscErrorCode MatDestroy_SeqSELLCUDA(Mat A)
940: {
941:   PetscFunctionBegin;
942:   if (A->factortype == MAT_FACTOR_NONE && A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) PetscCall(MatSeqSELLCUDA_Destroy((Mat_SeqSELLCUDA **)&A->spptr));
943:   PetscCall(MatDestroy_SeqSELL(A));
944:   PetscFunctionReturn(PETSC_SUCCESS);
945: }

947: PETSC_INTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat);
948: static PetscErrorCode       MatDuplicate_SeqSELLCUDA(Mat A, MatDuplicateOption cpvalues, Mat *B)
949: {
950:   PetscFunctionBegin;
951:   PetscCall(MatDuplicate_SeqSELL(A, cpvalues, B));
952:   PetscCall(MatConvert_SeqSELL_SeqSELLCUDA(*B));
953:   PetscFunctionReturn(PETSC_SUCCESS);
954: }

956: PETSC_INTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat B)
957: {
958:   Mat_SeqSELLCUDA *cudastruct;

960:   PetscFunctionBegin;
961:   PetscCall(PetscFree(B->defaultvectype));
962:   PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));

964:   if (!B->spptr) {
965:     if (B->factortype == MAT_FACTOR_NONE) {
966:       PetscCall(PetscNew(&cudastruct));
967:       B->spptr = cudastruct;
968:     }
969:   }

971:   B->ops->assemblyend    = MatAssemblyEnd_SeqSELLCUDA;
972:   B->ops->destroy        = MatDestroy_SeqSELLCUDA;
973:   B->ops->setfromoptions = MatSetFromOptions_SeqSELLCUDA;
974:   B->ops->mult           = MatMult_SeqSELLCUDA;
975:   B->ops->multadd        = MatMultAdd_SeqSELLCUDA;
976:   B->ops->duplicate      = MatDuplicate_SeqSELLCUDA;
977:   B->ops->zeroentries    = MatZeroEntries_SeqSELLCUDA;

979:   /* No need to assemble SeqSELL, but need to do the preprocessing for SpMV */
980:   PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(B));

982:   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELLCUDA));
983:   B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
984:   PetscFunctionReturn(PETSC_SUCCESS);
985: }

987: /*MC
988:   MATSEQSELLCUDA - MATSELLCUDA = "(seq)sellcuda" - A matrix type to be used for sparse matrices on NVIDIA GPUs.

990:   Options Database Keys:
991: +  -mat_type seqsellcuda - sets the matrix type to "seqsellcuda" during a call to `MatSetFromOptions()`
992: .  -mat_sell_spmv_cuda_kernel <0-9> - selects the SpMV kernel used by MATSELLCUDA
993: -  -mat_sell_spmv_cuda_blocky <2,4,8,16,32> - sets the y dimension of the 2D thread block used by the SpMV kernels; the x dimension is always 32

995:   Level: beginner

997: .seealso: [](ch_matrices), `Mat`, `MATSELLCUDA`
998: M*/

1000: PETSC_EXTERN PetscErrorCode MatCreate_SeqSELLCUDA(Mat B)
1001: {
1002:   PetscFunctionBegin;
1003:   PetscCall(MatCreate_SeqSELL(B));
1004:   PetscCall(MatConvert_SeqSELL_SeqSELLCUDA(B));
1005:   PetscCall(MatSetFromOptions(B));
1006:   PetscFunctionReturn(PETSC_SUCCESS);
1007: }
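
/*
   Usage sketch (illustration only; `n` is a placeholder dimension and PetscCall() error
   checking is omitted): a matrix of this type is assembled through the usual PETSc
   sequence, and MatMult() then runs one of the CUDA kernels above.

     Mat A;
     Vec x, y;
     MatCreate(PETSC_COMM_SELF, &A);
     MatSetSizes(A, n, n, n, n);
     MatSetType(A, MATSEQSELLCUDA);
     MatSetUp(A);
     // ... MatSetValues(A, ...) ...
     MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);
     MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);
     MatCreateVecs(A, &x, &y);
     MatMult(A, x, y);
*/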