Actual source code: sellcuda.cu

  1: #include <cuda_runtime.h>

  3: #include <petscdevice_cuda.h>
  4: #include <petsc/private/cupmatomics.hpp>
  5: #include <../src/mat/impls/sell/seq/sell.h>

  7: #define SLICE_HEIGHT 16
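
/*
  The Mat_SeqSELLCUDA struct below mirrors the host SELL (sliced ELLPACK) arrays on the device.
  Rows are grouped into slices of `sliceheight` rows (chosen by the host Mat_SeqSELL; the
  SLICE_HEIGHT macro above is only used by the legacy kernels 2-6). Within a slice every row is
  padded to the width of the longest row and the entries are stored interleaved by row, so
  consecutive memory locations belong to consecutive rows of the slice. sliidx[s] is the offset
  of slice s in val/colidx, and sliidx[totalslices] is the total number of stored entries,
  padding included. For example, with sliceheight = 2 and a slice whose two rows have 3 and 2
  nonzeros, the slice occupies 2*3 entries laid out as

      a(r0,j0)  a(r1,k0)  a(r0,j1)  a(r1,k1)  a(r0,j2)  pad

  where the padded value is zero and its column index stays in range, so the kernels can read
  aval[i] * x[acolidx[i]] unconditionally.
*/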

  9: typedef struct {
 10:   PetscInt   maxallocmat;
 11:   PetscInt   totalentries;
 12:   PetscInt  *colidx; /* column index array, device pointer */
 13:   MatScalar *val;    /* value array, device pointer */
 14:   PetscInt   totalslices;
 15:   PetscInt  *sliidx; /* slice index array, device pointer */
 16:   PetscInt   nonzerostate;
 17:   PetscInt   kernelchoice;
 18:   PetscInt   blocky;
 19:   PetscInt   chunksperblock;
 20:   PetscInt   totalchunks;
 21:   PetscInt  *chunk_slice_map; /* starting slice for each chunk, device pointer */
 22: } Mat_SeqSELLCUDA;

 24: static PetscErrorCode MatSeqSELLCUDA_Destroy(Mat_SeqSELLCUDA **cudastruct)
 25: {
 26:   PetscFunctionBegin;
 27:   if (*cudastruct) {
 28:     if ((*cudastruct)->colidx) { PetscCallCUDA(cudaFree((*cudastruct)->colidx)); }
 29:     if ((*cudastruct)->val) { PetscCallCUDA(cudaFree((*cudastruct)->val)); }
 30:     if ((*cudastruct)->sliidx) { PetscCallCUDA(cudaFree((*cudastruct)->sliidx)); }
 31:     if ((*cudastruct)->chunk_slice_map) { PetscCallCUDA(cudaFree((*cudastruct)->chunk_slice_map)); }
 32:     PetscCall(PetscFree(*cudastruct));
 33:   }
 34:   PetscFunctionReturn(PETSC_SUCCESS);
 35: }
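
/*
  MatSeqSELLCUDACopyToGPU() lazily mirrors the host arrays on the device. If the matrix is
  assembled and its nonzero pattern (nonzerostate) has not changed, only the value array is
  copied; otherwise the device arrays are reallocated and colidx, val, sliidx and
  chunk_slice_map are all transferred. On completion the offload mask becomes
  PETSC_OFFLOAD_BOTH, so subsequent calls are no-ops until the host data changes again.
*/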

 37: static PetscErrorCode MatSeqSELLCUDACopyToGPU(Mat A)
 38: {
 39:   Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
 40:   Mat_SeqSELL     *a          = (Mat_SeqSELL *)A->data;

 42:   PetscFunctionBegin;
 43:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
 44:     PetscCall(PetscLogEventBegin(MAT_CUDACopyToGPU, A, 0, 0, 0));
 45:     if (A->assembled && A->nonzerostate == cudastruct->nonzerostate) {
 46:       /* copy values only */
 47:       PetscCallCUDA(cudaMemcpy(cudastruct->val, a->val, a->sliidx[a->totalslices] * sizeof(MatScalar), cudaMemcpyHostToDevice));
 48:       PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar))));
 49:     } else {
 50:       if (cudastruct->colidx) { PetscCallCUDA(cudaFree(cudastruct->colidx)); }
 51:       if (cudastruct->val) { PetscCallCUDA(cudaFree(cudastruct->val)); }
 52:       if (cudastruct->sliidx) { PetscCallCUDA(cudaFree(cudastruct->sliidx)); }
 53:       if (cudastruct->chunk_slice_map) { PetscCallCUDA(cudaFree(cudastruct->chunk_slice_map)); }
 54:       cudastruct->maxallocmat  = a->maxallocmat;
 55:       cudastruct->totalentries = a->sliidx[a->totalslices];
 56:       cudastruct->totalslices  = a->totalslices;
 57:       cudastruct->totalchunks  = a->totalchunks;
 58:       PetscCallCUDA(cudaMalloc((void **)&cudastruct->colidx, a->maxallocmat * sizeof(*cudastruct->colidx)));
 59:       PetscCallCUDA(cudaMalloc((void **)&cudastruct->val, a->maxallocmat * sizeof(*cudastruct->val)));
 60:       /* copy only the entries in use, a->sliidx[a->totalslices], rather than the full maxallocmat allocation */
 61:       PetscCallCUDA(cudaMemcpy(cudastruct->colidx, a->colidx, a->sliidx[a->totalslices] * sizeof(*a->colidx), cudaMemcpyHostToDevice));
 62:       PetscCallCUDA(cudaMemcpy(cudastruct->val, a->val, a->sliidx[a->totalslices] * sizeof(*a->val), cudaMemcpyHostToDevice));

 64:       PetscCallCUDA(cudaMalloc((void **)&cudastruct->sliidx, (a->totalslices + 1) * sizeof(*cudastruct->sliidx)));
 65:       PetscCallCUDA(cudaMemcpy(cudastruct->sliidx, a->sliidx, (a->totalslices + 1) * sizeof(*a->sliidx), cudaMemcpyHostToDevice));
 66:       PetscCallCUDA(cudaMalloc((void **)&cudastruct->chunk_slice_map, a->totalchunks * sizeof(*cudastruct->chunk_slice_map)));
 67:       PetscCallCUDA(cudaMemcpy(cudastruct->chunk_slice_map, a->chunk_slice_map, a->totalchunks * sizeof(*a->chunk_slice_map), cudaMemcpyHostToDevice));
 68:       PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar) + sizeof(PetscInt)) + (a->totalslices + 1 + a->totalchunks) * sizeof(PetscInt)));
 69:     }
 70:     PetscCallCUDA(WaitForCUDA());
 71:     PetscCall(PetscLogEventEnd(MAT_CUDACopyToGPU, A, 0, 0, 0));
 72:     A->offloadmask = PETSC_OFFLOAD_BOTH;
 73:   }
 74:   PetscFunctionReturn(PETSC_SUCCESS);
 75: }
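
/*
  Basic SpMV kernels: one thread per row. The thread for row r works on slice r/sliceheight as
  lane r%sliceheight and walks the slice with stride sliceheight, i.e.
  i = sliidx[slice] + lane, + sliceheight, + 2*sliceheight, ..., so neighboring threads load
  neighboring entries of aval/acolidx (coalesced accesses).
*/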

 77: static __global__ void matmult_seqsell_basic_kernel(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
 78: {
 79:   PetscInt  i, row, slice_id, row_in_slice;
 80:   MatScalar sum;
 81:   /* one thread per row. */
 82:   row = blockIdx.x * blockDim.x + threadIdx.x;
 83:   if (row < nrows) {
 84:     slice_id     = row / sliceheight;
 85:     row_in_slice = row % sliceheight;
 86:     sum          = 0.0;
 87:     for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += sliceheight) sum += aval[i] * x[acolidx[i]];
 88:     y[row] = sum;
 89:   }
 90: }

 92: static __global__ void matmultadd_seqsell_basic_kernel(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
 93: {
 94:   PetscInt  i, row, slice_id, row_in_slice;
 95:   MatScalar sum;
 96:   /* one thread per row. */
 97:   row = blockIdx.x * blockDim.x + threadIdx.x;
 98:   if (row < nrows) {
 99:     slice_id     = row / sliceheight;
100:     row_in_slice = row % sliceheight;
101:     sum          = 0.0;
102:     for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += sliceheight) sum += aval[i] * x[acolidx[i]];
103:     z[row] = y[row] + sum;
104:   }
105: }
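
/*
  The kernels below reduce partial sums held in registers with warp shuffles and are compiled
  for real scalars only. matmult(add)_seqsell_tiled_kernel9 assigns one block of 32 x BLOCKY
  threads to each slice: every thread accumulates a strided partial sum over the slice, a
  shuffle reduction folds the 32 lanes of each warp down to sliceheight values, the values are
  transposed through shared memory so that the BLOCKY contributions to a row land in
  consecutive lanes, and a second shuffle reduction of width BLOCKY yields the final row sums.
*/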

107: #if !defined(PETSC_USE_COMPLEX)
108: /* use 1 block per slice, suitable for large slice width */
109: template <int BLOCKY>
110: __global__ void matmult_seqsell_tiled_kernel9(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
111: {
112:   __shared__ MatScalar shared[32][BLOCKY];
113:   PetscInt             i, row, slice_id = blockIdx.x;
114:   int                  tid = threadIdx.x + threadIdx.y * 32;
115:   /* transposed index */
116:   int         tidx = tid % BLOCKY;
117:   int         tidy = tid / BLOCKY;
118:   PetscScalar t    = 0.0;

120:   row = slice_id * sliceheight + threadIdx.x % sliceheight;
121:   if (row < nrows) {
122:     for (i = sliidx[slice_id] + threadIdx.x + 32 * threadIdx.y; i < sliidx[slice_id + 1]; i += 32 * BLOCKY) t += aval[i] * x[acolidx[i]];
123:   }
124:   #pragma unroll
125:   for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); }
126:   /* transpose layout to reduce each row using warp shfl */
127:   if (threadIdx.x < sliceheight) shared[threadIdx.x][threadIdx.y] = t;
128:   __syncthreads();
129:   if (tidy < sliceheight) t = shared[tidy][tidx];
130:   #pragma unroll
131:   for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY); }
132:   if (tidx == 0 && tidy < sliceheight) { shared[0][tidy] = t; }
133:   __syncthreads();
134:   if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) { y[row] = shared[0][threadIdx.x]; }
135: }

137: /* use 1 block per slice, suitable for large slice width */
138: template <int BLOCKY>
139: __global__ void matmultadd_seqsell_tiled_kernel9(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
140: {
141:   __shared__ MatScalar shared[32][BLOCKY];
142:   PetscInt             i, row, slice_id = blockIdx.x;
143:   int                  tid = threadIdx.x + threadIdx.y * 32;
144:   /* transposed index */
145:   int         tidx = tid % BLOCKY;
146:   int         tidy = tid / BLOCKY;
147:   PetscScalar t    = 0.0;

149:   row = slice_id * sliceheight + threadIdx.x % sliceheight;
150:   if (row < nrows) {
151:     for (i = sliidx[slice_id] + threadIdx.x + 32 * threadIdx.y; i < sliidx[slice_id + 1]; i += 32 * BLOCKY) t += aval[i] * x[acolidx[i]];
152:   }
153:   #pragma unroll
154:   for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); }
155:   /* transpose layout to reduce each row using warp shfl */
156:   if (threadIdx.x < sliceheight) shared[threadIdx.x][threadIdx.y] = t;
157:   __syncthreads();
158:   if (tidy < sliceheight) t = shared[tidy][tidx];
159:   #pragma unroll
160:   for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY); }
161:   if (tidx == 0 && tidy < sliceheight) { shared[0][tidy] = t; }
162:   __syncthreads();
163:   if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) { z[row] = y[row] + shared[0][threadIdx.x]; }
164: }
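
/*
  segment_scan() computes a segmented sum over the 2*BLOCKY half-warps of a block: flag[h]
  holds the slice id owned by half-warp h, and each doubling step adds in the partial sum of
  the half-warp i positions above whenever it belongs to the same slice. On return, a thread
  in the first half-warp of a segment holds the segment total for its lane and reports
  head = true, so only those threads perform the atomic update in kernel 8 below.
*/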

166: template <int BLOCKY>
167: __device__ __forceinline__ static bool segment_scan(PetscInt flag[], MatScalar shared[], PetscScalar *val)
168: {
169:   bool head = true;
170:   #pragma unroll
171:   for (int i = 1; i < BLOCKY * 2; i <<= 1) {
172:     int halfwarpid                         = threadIdx.y * 2 + threadIdx.x / 16;
173:     shared[threadIdx.x + threadIdx.y * 32] = 0;
174:     if (halfwarpid >= i && flag[halfwarpid - i] == flag[halfwarpid]) {
175:       shared[threadIdx.x + threadIdx.y * 32] = *val;
176:       if (i == 1) head = false;
177:     }
178:     __syncthreads();
179:     if (halfwarpid < BLOCKY * 2 - i) *val += shared[threadIdx.x + threadIdx.y * 32 + i * 16];
180:     __syncthreads();
181:   }
182:   return head;
183: }
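
/*
  Kernel 8 splits the nonzeros into chunks of 32*BLOCKY entries (one entry per thread), and
  chunk_slice_map[c] records the slice containing the first entry of chunk c. Each block walks
  chunksperblock consecutive chunks: a chunk that stays inside one slice is reduced in
  registers with the same shuffle/transpose scheme as kernel 9, while a chunk that crosses
  slice boundaries is handled with segment_scan(). Results are accumulated into the output
  with atomic adds, which is why the output vector is initialized at the top of the kernel.
*/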

185: /* load-balancing version. Chunksize is equal to the number of threads per block */
186: template <int BLOCKY>
187: __global__ void matmult_seqsell_tiled_kernel8(PetscInt nrows, PetscInt sliceheight, PetscInt chunksperblock, PetscInt totalchunks, const PetscInt *chunk_slice_map, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
188: {
189:   __shared__ MatScalar shared[BLOCKY * 32];
190:   PetscInt             gid, row, start_slice, cid;
191:   PetscScalar          t = 0.0;
192:   AtomicAdd<MatScalar> atomAdd;
193:   /* zero out y */
194:   for (int iter = 0; iter < 1 + (nrows - 1) / (gridDim.x * 32 * BLOCKY); iter++) {
195:     gid = gridDim.x * 32 * BLOCKY * iter + blockIdx.x * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x;
196:     if (gid < nrows) y[gid] = 0.0;
197:   }
198:   for (int iter = 0; iter < chunksperblock; iter++) {
199:     cid = blockIdx.x * chunksperblock + iter; /* chunk id */
200:     if (cid < totalchunks) {
201:       start_slice = chunk_slice_map[cid]; /* starting slice at each iteration */
202:       gid         = cid * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x;
203:       if ((cid + 1) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* this iteration covers more than one slice */
204:         __shared__ PetscInt flag[BLOCKY * 2];
205:         bool                write;
206:         PetscInt            slice_id = start_slice, totalslices = PetscCeilInt(nrows, sliceheight), totalentries = sliidx[totalslices];
207:         /* find out the slice that this element belongs to */
208:         while (gid < totalentries && gid >= sliidx[slice_id + 1]) slice_id++;
209:         if (threadIdx.x % 16 == 0) flag[threadIdx.y * 2 + threadIdx.x / 16] = slice_id;
210:         row = slice_id * sliceheight + threadIdx.x % sliceheight;
211:         if (row < nrows && gid < totalentries) t = aval[gid] * x[acolidx[gid]];
212:         __syncthreads();
213:         write = segment_scan<BLOCKY>(flag, shared, &t);
214:         if (row < nrows && gid < totalentries && write) atomAdd(y[row], t);
215:         t = 0.0;
216:       } else { /* this iteration covers only one slice */
217:         row = start_slice * sliceheight + threadIdx.x % sliceheight;
218:         if (row < nrows) t += aval[gid] * x[acolidx[gid]];
219:         if (iter == chunksperblock - 1 || (cid + 2) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* last iteration or next iteration covers more than one slice */
220:           int tid = threadIdx.x + threadIdx.y * 32, tidx = tid % BLOCKY, tidy = tid / BLOCKY;
221:   /* reduction and write to output vector */
222:   #pragma unroll
223:           for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); }
224:           /* transpose layout to reduce each row using warp shfl */
225:           if (threadIdx.x < sliceheight) shared[threadIdx.x * BLOCKY + threadIdx.y] = t; /* shared[threadIdx.x][threadIdx.y] = t */
226:           __syncthreads();
227:           if (tidy < sliceheight) t = shared[tidy * BLOCKY + tidx]; /* shared[tidy][tidx] */
228:   #pragma unroll
229:           for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY); }
230:           if (tidx == 0 && tidy < sliceheight) { shared[tidy] = t; /* shared[0][tidy] = t */ }
231:           __syncthreads();
232:           if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) atomAdd(y[row], shared[threadIdx.x]); /* shared[0][threadIdx.x] */
233:           t = 0.0;
234:         }
235:       }
236:     }
237:   }
238: }

240: /* load-balancing version. Chunksize is equal to the number of threads per block */
241: template <int BLOCKY>
242: __global__ void matmultadd_seqsell_tiled_kernel8(PetscInt nrows, PetscInt sliceheight, PetscInt chunksperblock, PetscInt totalchunks, const PetscInt *chunk_slice_map, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
243: {
244:   __shared__ MatScalar shared[BLOCKY * 32];
245:   PetscInt             gid, row, start_slice, cid;
246:   PetscScalar          t = 0.0;
247:   AtomicAdd<MatScalar> atomAdd;
248:   /* copy y to z */
249:   for (int iter = 0; iter < 1 + (nrows - 1) / (gridDim.x * 32 * BLOCKY); iter++) {
250:     gid = gridDim.x * 32 * BLOCKY * iter + blockIdx.x * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x;
251:     if (gid < nrows) z[gid] = y[gid];
252:   }
253:   for (int iter = 0; iter < chunksperblock; iter++) {
254:     cid = blockIdx.x * chunksperblock + iter; /* chunk id */
255:     if (cid < totalchunks) {
256:       start_slice = chunk_slice_map[cid]; /* starting slice at each iteration */
257:       gid         = cid * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x;
258:       if ((cid + 1) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* this iteration covers more than one slice */
259:         __shared__ PetscInt flag[BLOCKY * 2];
260:         bool                write;
261:         PetscInt            slice_id = start_slice, totalslices = PetscCeilInt(nrows, sliceheight), totalentries = sliidx[totalslices];
262:         /* find out the slice that this element belongs to */
263:         while (gid < totalentries && gid >= sliidx[slice_id + 1]) slice_id++;
264:         if (threadIdx.x % 16 == 0) flag[threadIdx.y * 2 + threadIdx.x / 16] = slice_id;
265:         row = slice_id * sliceheight + threadIdx.x % sliceheight;
266:         if (row < nrows && gid < totalentries) t = aval[gid] * x[acolidx[gid]];
267:         __syncthreads();
268:         write = segment_scan<BLOCKY>(flag, shared, &t);
269:         if (row < nrows && gid < totalentries && write) atomAdd(z[row], t);
270:         t = 0.0;
271:       } else { /* this iteration covers only one slice */
272:         row = start_slice * sliceheight + threadIdx.x % sliceheight;
273:         if (row < nrows) t += aval[gid] * x[acolidx[gid]];
274:         if (iter == chunksperblock - 1 || (cid + 2) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* last iteration or next iteration covers more than one slice */
275:           int tid = threadIdx.x + threadIdx.y * 32, tidx = tid % BLOCKY, tidy = tid / BLOCKY;
276:   /* reduction and write to output vector */
277:   #pragma unroll
278:           for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); }
279:           /* transpose layout to reduce each row using warp shfl */
280:           if (threadIdx.x < sliceheight) shared[threadIdx.x * BLOCKY + threadIdx.y] = t; /* shared[threadIdx.x][threadIdx.y] = t */
281:           __syncthreads();
282:           if (tidy < sliceheight) t = shared[tidy * BLOCKY + tidx]; /* shared[tidy][tidx] */
283:   #pragma unroll
284:           for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY); }
285:           if (tidx == 0 && tidy < sliceheight) { shared[tidy] = t; /* shared[0][tidy] = t */ }
286:           __syncthreads();
287:           if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) atomAdd(z[row], shared[threadIdx.x]); /* shared[0][threadIdx.x] */
288:           t = 0.0;
289:         }
290:       }
291:     }
292:   }
293: }
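
/*
  Kernel 7 assigns one warp per slice (blockDim.y slices per block): each lane accumulates a
  partial sum with stride 32 and a single shuffle reduction folds the warp down to sliceheight
  lanes, which write the result. This is the lightest-weight variant and targets narrow slices.
*/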

295: /* use 1 warp per slice, suitable for small slice width */
296: static __global__ void matmult_seqsell_tiled_kernel7(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
297: {
298:   PetscInt i, row, slice_id;
299:   slice_id = blockIdx.x * blockDim.y + threadIdx.y;
300:   row      = slice_id * sliceheight + threadIdx.x % sliceheight;
301:   PetscScalar t = 0.0;
302:   if (row < nrows) {
303:     for (i = sliidx[slice_id] + threadIdx.x; i < sliidx[slice_id + 1]; i += 32) t += aval[i] * x[acolidx[i]];
304:   }
305:   #pragma unroll
306:   for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); }
307:   if (row < nrows && threadIdx.x < sliceheight) { y[row] = t; }
308: }

310: /* use 1 warp per slice, suitable for small slice width */
311: static __global__ void matmultadd_seqsell_tiled_kernel7(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
312: {
313:   PetscInt i, row, slice_id;
314:   slice_id = blockIdx.x * blockDim.y + threadIdx.y;
315:   row      = slice_id * sliceheight + threadIdx.x % sliceheight;
316:   PetscScalar t = 0.0;
317:   if (row < nrows) {
318:     for (i = sliidx[slice_id] + threadIdx.x; i < sliidx[slice_id + 1]; i += 32) t += aval[i] * x[acolidx[i]];
319:   }
320:   #pragma unroll
321:   for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); }
322:   if (row < nrows && threadIdx.x < sliceheight) { z[row] = y[row] + t; }
323: }
324: #endif

326: /***********  Kernels 2-6 are tied to slice height 16. They are kept only for performance comparison  **********/
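
/*
  Kernels 2-6 use a 2D block of 512 threads: blockDim.x threads index rows (one thread per row)
  and blockDim.y partial sums per row are combined with a shared-memory tree reduction. Kernel k
  uses blockDim.y = 2^(k-1), e.g. kernel 2 launches (256,2) blocks and kernel 6 launches (16,32)
  blocks.
*/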

328: static __global__ void matmult_seqsell_tiled_kernel6(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
329: {
330:   __shared__ MatScalar shared[512];
331:   PetscInt             i, row, slice_id, row_in_slice;
332:   /* multiple threads per row. */
333:   row = blockIdx.x * blockDim.x + threadIdx.x;
334:   if (row < nrows) {
335:     slice_id     = row / SLICE_HEIGHT;
336:     row_in_slice = row % SLICE_HEIGHT;

338:     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
339:     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
340:     __syncthreads();
341:     if (threadIdx.y < 16) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x]; }
342:     __syncthreads();
343:     if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
344:     __syncthreads();
345:     if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
346:     __syncthreads();
347:     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
348:     __syncthreads();
349:     if (threadIdx.y < 1) {
350:       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
351:       y[row] = shared[threadIdx.x];
352:     }
353:   }
354: }

356: static __global__ void matmult_seqsell_tiled_kernel5(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
357: {
358:   __shared__ MatScalar shared[512];
359:   PetscInt             i, row, slice_id, row_in_slice;
360:   /* multiple threads per row. */
361:   row = blockIdx.x * blockDim.x + threadIdx.x;
362:   if (row < nrows) {
363:     slice_id     = row / SLICE_HEIGHT;
364:     row_in_slice = row % SLICE_HEIGHT;

366:     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
367:     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
368:     __syncthreads();
369:     if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
370:     __syncthreads();
371:     if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
372:     __syncthreads();
373:     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
374:     __syncthreads();
375:     if (threadIdx.y < 1) {
376:       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
377:       y[row] = shared[threadIdx.x];
378:     }
379:   }
380: }

382: static __global__ void matmult_seqsell_tiled_kernel4(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
383: {
384:   __shared__ MatScalar shared[512];
385:   PetscInt             i, row, slice_id, row_in_slice;
386:   /* multiple threads per row. */
387:   row = blockIdx.x * blockDim.x + threadIdx.x;
388:   if (row < nrows) {
389:     slice_id     = row / SLICE_HEIGHT;
390:     row_in_slice = row % SLICE_HEIGHT;

392:     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
393:     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
394:     __syncthreads();
395:     if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
396:     __syncthreads();
397:     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
398:     __syncthreads();
399:     if (threadIdx.y < 1) {
400:       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
401:       y[row] = shared[threadIdx.x];
402:     }
403:   }
404: }

406: static __global__ void matmult_seqsell_tiled_kernel3(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
407: {
408:   __shared__ MatScalar shared[512];
409:   PetscInt             i, row, slice_id, row_in_slice;
410:   /* multiple threads per row. */
411:   row = blockIdx.x * blockDim.x + threadIdx.x;
412:   if (row < nrows) {
413:     slice_id     = row / SLICE_HEIGHT;
414:     row_in_slice = row % SLICE_HEIGHT;

416:     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
417:     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
418:     __syncthreads();
419:     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
420:     __syncthreads();
421:     if (threadIdx.y < 1) {
422:       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
423:       y[row] = shared[threadIdx.x];
424:     }
425:   }
426: }

428: static __global__ void matmult_seqsell_tiled_kernel2(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
429: {
430:   __shared__ MatScalar shared[512];
431:   PetscInt             i, row, slice_id, row_in_slice;
432:   /* multiple threads per row. */
433:   row = blockIdx.x * blockDim.x + threadIdx.x;
434:   if (row < nrows) {
435:     slice_id     = row / SLICE_HEIGHT;
436:     row_in_slice = row % SLICE_HEIGHT;

438:     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
439:     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
440:     __syncthreads();
441:     if (threadIdx.y < 1) {
442:       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
443:       y[row] = shared[threadIdx.x];
444:     }
445:   }
446: }

448: static __global__ void matmultadd_seqsell_tiled_kernel6(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
449: {
450:   __shared__ MatScalar shared[512];
451:   PetscInt             i, row, slice_id, row_in_slice;
452:   /* multiple threads per row. */
453:   row = blockIdx.x * blockDim.x + threadIdx.x;
454:   if (row < nrows) {
455:     slice_id     = row / SLICE_HEIGHT;
456:     row_in_slice = row % SLICE_HEIGHT;

458:     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
459:     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
460:     __syncthreads();
461:     if (threadIdx.y < 16) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x]; }
462:     __syncthreads();
463:     if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
464:     __syncthreads();
465:     if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
466:     __syncthreads();
467:     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
468:     __syncthreads();
469:     if (threadIdx.y < 1) {
470:       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
471:       z[row] = y[row] + shared[threadIdx.x];
472:     }
473:   }
474: }

476: static __global__ void matmultadd_seqsell_tiled_kernel5(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
477: {
478:   __shared__ MatScalar shared[512];
479:   PetscInt             i, row, slice_id, row_in_slice;
480:   /* multiple threads per row. */
481:   row = blockIdx.x * blockDim.x + threadIdx.x;
482:   if (row < nrows) {
483:     slice_id     = row / SLICE_HEIGHT;
484:     row_in_slice = row % SLICE_HEIGHT;

486:     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
487:     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
488:     __syncthreads();
489:     if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
490:     __syncthreads();
491:     if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
492:     __syncthreads();
493:     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
494:     __syncthreads();
495:     if (threadIdx.y < 1) {
496:       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
497:       z[row] = y[row] + shared[threadIdx.x];
498:     }
499:   }
500: }

502: static __global__ void matmultadd_seqsell_tiled_kernel4(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
503: {
504:   __shared__ MatScalar shared[512];
505:   PetscInt             i, row, slice_id, row_in_slice;
506:   /* multiple threads per row. */
507:   row = blockIdx.x * blockDim.x + threadIdx.x;
508:   if (row < nrows) {
509:     slice_id     = row / SLICE_HEIGHT;
510:     row_in_slice = row % SLICE_HEIGHT;

512:     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
513:     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
514:     __syncthreads();
515:     if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
516:     __syncthreads();
517:     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
518:     __syncthreads();
519:     if (threadIdx.y < 1) {
520:       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
521:       z[row] = y[row] + shared[threadIdx.x];
522:     }
523:   }
524: }

526: static __global__ void matmultadd_seqsell_tiled_kernel3(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
527: {
528:   __shared__ MatScalar shared[512];
529:   PetscInt             i, row, slice_id, row_in_slice;
530:   /* multiple threads per row. */
531:   row = blockIdx.x * blockDim.x + threadIdx.x;
532:   if (row < nrows) {
533:     slice_id     = row / SLICE_HEIGHT;
534:     row_in_slice = row % SLICE_HEIGHT;

536:     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
537:     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
538:     __syncthreads();
539:     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
540:     __syncthreads();
541:     if (threadIdx.y < 1) {
542:       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
543:       z[row] = y[row] + shared[threadIdx.x];
544:     }
545:   }
546: }

548: static __global__ void matmultadd_seqsell_tiled_kernel2(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
549: {
550:   __shared__ MatScalar shared[512];
551:   PetscInt             i, row, slice_id, row_in_slice;
552:   /* multiple threads per row. */
553:   row = blockIdx.x * blockDim.x + threadIdx.x;
554:   if (row < nrows) {
555:     slice_id     = row / SLICE_HEIGHT;
556:     row_in_slice = row % SLICE_HEIGHT;

558:     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
559:     for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
560:     __syncthreads();
561:     if (threadIdx.y < 1) {
562:       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
563:       z[row] = y[row] + shared[threadIdx.x];
564:     }
565:   }
566: }
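
/*
  MatMult/MatMultAdd dispatch on cudastruct->kernelchoice:
    0   - default: use the load-balanced kernel 8 when the max/avg slice width ratio indicates
          very uneven rows, otherwise kernel 7 or kernel 9 with a BLOCKY picked from the
          average slice size
    1   - basic one-thread-per-row kernel
    2-6 - legacy shared-memory kernels, slice height 16 only
    7   - one warp per slice
    8   - chunk-based load-balanced kernel (explicit choice only in MatMultAdd; MatMult reaches
          it through choice 0)
    9   - one block per slice
  Choices 0 and 7-9 are unavailable for complex scalars.
*/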

568: static PetscErrorCode MatMult_SeqSELLCUDA(Mat A, Vec xx, Vec yy)
569: {
570:   Mat_SeqSELL       *a          = (Mat_SeqSELL *)A->data;
571:   Mat_SeqSELLCUDA   *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
572:   PetscScalar       *y;
573:   const PetscScalar *x;
574:   PetscInt           nrows = A->rmap->n, sliceheight = a->sliceheight;
575:   MatScalar         *aval;
576:   PetscInt          *acolidx;
577:   PetscInt          *sliidx;
578:   PetscInt           nblocks, blocksize = 512; /* blocksize must be multiple of SLICE_HEIGHT*32 */
579:   dim3               block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32);
580: #if !defined(PETSC_USE_COMPLEX)
581:   PetscInt  chunksperblock, nchunks, *chunk_slice_map;
582:   PetscReal maxoveravg;
583: #endif

585:   PetscFunctionBegin;
586:   PetscCheck(32 % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires the slice height to be a divisor of 32, but the input matrix has a slice height of %" PetscInt_FMT, sliceheight);
587:   PetscCheck(!(cudastruct->kernelchoice >= 2 && cudastruct->kernelchoice <= 6 && sliceheight != SLICE_HEIGHT), PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Kernel choices 2-6 require the slice height of the matrix to be 16, but the current slice height is %" PetscInt_FMT, sliceheight);
588:   PetscCall(MatSeqSELLCUDACopyToGPU(A));
589:   /* the device arrays in cudastruct are not valid until MatSeqSELLCUDACopyToGPU() has been called */
590:   aval    = cudastruct->val;
591:   acolidx = cudastruct->colidx;
592:   sliidx  = cudastruct->sliidx;

594:   PetscCall(VecCUDAGetArrayRead(xx, &x));
595:   PetscCall(VecCUDAGetArrayWrite(yy, &y));
596:   PetscCall(PetscLogGpuTimeBegin());

598:   switch (cudastruct->kernelchoice) {
599: #if !defined(PETSC_USE_COMPLEX)
600:   case 9:
601:     nblocks = 1 + (nrows - 1) / sliceheight;
602:     if (cudastruct->blocky == 2) {
603:       matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
604:     } else if (cudastruct->blocky == 4) {
605:       matmult_seqsell_tiled_kernel9<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
606:     } else if (cudastruct->blocky == 8) {
607:       matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
608:     } else if (cudastruct->blocky == 16) {
609:       matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
610:     } else if (cudastruct->blocky == 32) {
611:       matmult_seqsell_tiled_kernel9<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
612:     } else {
613:       matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
614:     }
615:     break;
616:   case 7:
617:     nblocks = 1 + (nrows - 1) / (2 * sliceheight);
618:     if (cudastruct->blocky == 2) {
619:       matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
620:     } else if (cudastruct->blocky == 4) {
621:       matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
622:     } else if (cudastruct->blocky == 8) {
623:       matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
624:     } else if (cudastruct->blocky == 16) {
625:       matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
626:     } else if (cudastruct->blocky == 32) {
627:       matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
628:     } else {
629:       matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
630:     }
631:     break;
632: #endif
633:   case 6:
634:     nblocks = 1 + (nrows - 1) / (blocksize / 32); /* 1 slice per block if blocksize=512 */
635:     matmult_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, acolidx, aval, sliidx, x, y);
636:     break;
637:   case 5:
638:     nblocks = 1 + (nrows - 1) / (blocksize / 16); /* 2 slices per block if blocksize=512*/
639:     matmult_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, acolidx, aval, sliidx, x, y);
640:     break;
641:   case 4:
642:     nblocks = 1 + (nrows - 1) / (blocksize / 8); /* 4 slices per block if blocksize=512 */
643:     matmult_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, acolidx, aval, sliidx, x, y);
644:     break;
645:   case 3:
646:     nblocks = 1 + (nrows - 1) / (blocksize / 4); /* 8 slices per block if blocksize=512 */
647:     matmult_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, acolidx, aval, sliidx, x, y);
648:     break;
649:   case 2: /* 16 slices per block if blocksize=512 */
650:     nblocks = 1 + (nrows - 1) / (blocksize / 2);
651:     matmult_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, acolidx, aval, sliidx, x, y);
652:     break;
653:   case 1: /* 32 slices per block if blocksize=512 */
654:     nblocks = 1 + (nrows - 1) / blocksize;
655:     matmult_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
656:     break;
657: #if !defined(PETSC_USE_COMPLEX)
658:   case 0:
659:     maxoveravg = a->maxslicewidth / a->avgslicewidth;
660:     if (maxoveravg > 12.0 && maxoveravg / nrows > 0.001) { /* heuristic threshold: the row lengths are very uneven, switch to the load-balanced kernel */
661:       /* each block handles approximately one slice */
662:       PetscInt blocky = a->chunksize / 32;
663:       nchunks         = cudastruct->totalchunks;
664:       chunksperblock  = cudastruct->chunksperblock ? cudastruct->chunksperblock : 1 + (cudastruct->totalentries / cudastruct->totalslices - 1) / a->chunksize;
665:       nblocks         = 1 + (nchunks - 1) / chunksperblock;
666:       chunk_slice_map = cudastruct->chunk_slice_map;
667:       if (blocky == 2) {
668:         matmult_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
669:       } else if (blocky == 4) {
670:         matmult_seqsell_tiled_kernel8<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
671:       } else if (blocky == 8) {
672:         matmult_seqsell_tiled_kernel8<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
673:       } else if (blocky == 16) {
674:         matmult_seqsell_tiled_kernel8<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
675:       } else if (blocky == 32) {
676:         matmult_seqsell_tiled_kernel8<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
677:       } else {
678:         matmult_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
679:       }
680:     } else {
681:       PetscInt avgslicesize = sliceheight * a->avgslicewidth;
682:       if (avgslicesize <= 432) {
683:         if (sliceheight * a->maxslicewidth < 2048 && nrows > 100000) {
684:           nblocks = 1 + (nrows - 1) / (2 * sliceheight); /* two slices per block */
685:           matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
686:         } else {
687:           nblocks = 1 + (nrows - 1) / sliceheight;
688:           matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
689:         }
690:       } else if (avgslicesize <= 2400) {
691:         nblocks = 1 + (nrows - 1) / sliceheight;
692:         matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
693:       } else {
694:         nblocks = 1 + (nrows - 1) / sliceheight;
695:         matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
696:       }
697:     }
698:     break;
699: #endif
700:   default:
701:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported kernel choice %" PetscInt_FMT " for MatMult_SeqSELLCUDA.", cudastruct->kernelchoice);
702:   }
703:   PetscCall(PetscLogGpuTimeEnd());
704:   PetscCall(VecCUDARestoreArrayRead(xx, &x));
705:   PetscCall(VecCUDARestoreArrayWrite(yy, &y));
706:   PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
707:   PetscFunctionReturn(PETSC_SUCCESS);
708: }

710: static PetscErrorCode MatMultAdd_SeqSELLCUDA(Mat A, Vec xx, Vec yy, Vec zz)
711: {
712:   Mat_SeqSELL       *a          = (Mat_SeqSELL *)A->data;
713:   Mat_SeqSELLCUDA   *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
714:   PetscScalar       *z;
715:   const PetscScalar *y, *x;
716:   PetscInt           nrows = A->rmap->n, sliceheight = a->sliceheight;
717:   MatScalar         *aval;
718:   PetscInt          *acolidx;
719:   PetscInt          *sliidx;
720: #if !defined(PETSC_USE_COMPLEX)
721:   PetscReal maxoveravg;
722:   PetscInt  chunksperblock, nchunks, *chunk_slice_map;
723:   PetscInt  blocky = cudastruct->blocky;
724: #endif

726:   PetscFunctionBegin;
727:   PetscCheck(32 % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires the slice height to be a divisor of 32, but the input matrix has a slice height of %" PetscInt_FMT, sliceheight);
728:   PetscCheck(!(cudastruct->kernelchoice >= 2 && cudastruct->kernelchoice <= 6 && sliceheight != SLICE_HEIGHT), PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Kernel choices 2-6 require the slice height of the matrix to be 16, but the current slice height is %" PetscInt_FMT, sliceheight);
729:   PetscCall(MatSeqSELLCUDACopyToGPU(A));
  /* the device arrays are not valid until MatSeqSELLCUDACopyToGPU() has been called */
  aval    = cudastruct->val;
  acolidx = cudastruct->colidx;
  sliidx  = cudastruct->sliidx;
730:   if (a->nz) {
731:     PetscInt nblocks, blocksize = 512;
732:     dim3     block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32);
733:     PetscCall(VecCUDAGetArrayRead(xx, &x));
734:     PetscCall(VecCUDAGetArrayRead(yy, &y));
735:     PetscCall(VecCUDAGetArrayWrite(zz, &z));
736:     PetscCall(PetscLogGpuTimeBegin());

738:     switch (cudastruct->kernelchoice) {
739: #if !defined(PETSC_USE_COMPLEX)
740:     case 9:
741:       nblocks = 1 + (nrows - 1) / sliceheight;
742:       if (blocky == 2) {
743:         matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
744:       } else if (blocky == 4) {
745:         matmultadd_seqsell_tiled_kernel9<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
746:       } else if (blocky == 8) {
747:         matmultadd_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
748:       } else if (blocky == 16) {
749:         matmultadd_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
750:       } else if (blocky == 32) {
751:         matmultadd_seqsell_tiled_kernel9<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
752:       } else {
753:         matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
754:       }
755:       break;
756:     case 8:
757:       /* each block handles approximately one slice */
758:       nchunks         = cudastruct->totalchunks;
759:       blocky          = a->chunksize / 32;
760:       chunksperblock  = cudastruct->chunksperblock ? cudastruct->chunksperblock : 1 + (cudastruct->totalentries / cudastruct->totalslices - 1) / a->chunksize;
761:       nblocks         = 1 + (nchunks - 1) / chunksperblock;
762:       chunk_slice_map = cudastruct->chunk_slice_map;
763:       if (blocky == 2) {
764:         matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
765:       } else if (blocky == 4) {
766:         matmultadd_seqsell_tiled_kernel8<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
767:       } else if (blocky == 8) {
768:         matmultadd_seqsell_tiled_kernel8<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
769:       } else if (blocky == 16) {
770:         matmultadd_seqsell_tiled_kernel8<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
771:       } else if (blocky == 32) {
772:         matmultadd_seqsell_tiled_kernel8<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
773:       } else {
774:         matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
775:       }
776:       break;
777:     case 7:
778:       nblocks = 1 + (nrows - 1) / (2 * sliceheight);
779:       if (blocky == 2) {
780:         matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
781:       } else if (blocky == 4) {
782:         matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
783:       } else if (blocky == 8) {
784:         matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
785:       } else if (blocky == 16) {
786:         matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
787:       } else if (blocky == 32) {
788:         matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
789:       } else {
790:         matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
791:       }
792:       break;
793: #endif
794:     case 6:
795:       nblocks = 1 + (nrows - 1) / (blocksize / 32);
796:       matmultadd_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, acolidx, aval, sliidx, x, y, z);
797:       break;
798:     case 5:
799:       nblocks = 1 + (nrows - 1) / (blocksize / 16);
800:       matmultadd_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, acolidx, aval, sliidx, x, y, z);
801:       break;
802:     case 4:
803:       nblocks = 1 + (nrows - 1) / (blocksize / 8);
804:       matmultadd_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, acolidx, aval, sliidx, x, y, z);
805:       break;
806:     case 3:
807:       nblocks = 1 + (nrows - 1) / (blocksize / 4);
808:       matmultadd_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, acolidx, aval, sliidx, x, y, z);
809:       break;
810:     case 2:
811:       nblocks = 1 + (nrows - 1) / (blocksize / 2);
812:       matmultadd_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, acolidx, aval, sliidx, x, y, z);
813:       break;
814:     case 1:
815:       nblocks = 1 + (nrows - 1) / blocksize;
816:       matmultadd_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
817:       break;
818: #if !defined(PETSC_USE_COMPLEX)
819:     case 0:
820:       maxoveravg = a->maxslicewidth / a->avgslicewidth;
821:       if (maxoveravg > 12.0 && maxoveravg / nrows > 0.001) { /* heuristic threshold: the row lengths are very uneven, switch to the load-balanced kernel */
822:         /* each block handles approximately one slice */
823:         nchunks         = cudastruct->totalchunks;
824:         blocky          = a->chunksize / 32;
825:         chunksperblock  = cudastruct->chunksperblock ? cudastruct->chunksperblock : 1 + (cudastruct->totalentries / cudastruct->totalslices - 1) / a->chunksize;
826:         nblocks         = 1 + (nchunks - 1) / chunksperblock;
827:         chunk_slice_map = cudastruct->chunk_slice_map;
828:         if (blocky == 2) {
829:           matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
830:         } else if (blocky == 4) {
831:           matmultadd_seqsell_tiled_kernel8<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
832:         } else if (blocky == 8) {
833:           matmultadd_seqsell_tiled_kernel8<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
834:         } else if (blocky == 16) {
835:           matmultadd_seqsell_tiled_kernel8<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
836:         } else if (blocky == 32) {
837:           matmultadd_seqsell_tiled_kernel8<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
838:         } else {
839:           matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
840:         }
841:       } else {
842:         PetscInt avgslicesize = sliceheight * a->avgslicewidth;
843:         if (avgslicesize <= 432) {
844:           if (sliceheight * a->maxslicewidth < 2048 && nrows > 100000) {
845:             nblocks = 1 + (nrows - 1) / (2 * sliceheight); /* two slices per block */
846:             matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
847:           } else {
848:             nblocks = 1 + (nrows - 1) / sliceheight;
849:             matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
850:           }
851:         } else if (avgslicesize <= 2400) {
852:           nblocks = 1 + (nrows - 1) / sliceheight;
853:           matmultadd_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
854:         } else {
855:           nblocks = 1 + (nrows - 1) / sliceheight;
856:           matmultadd_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
857:         }
858:       }
859:       break;
860: #endif
861:     default:
862:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported kernel choice %" PetscInt_FMT " for MatMultAdd_SeqSELLCUDA.", cudastruct->kernelchoice);
863:     }
864:     PetscCall(PetscLogGpuTimeEnd());
865:     PetscCall(VecCUDARestoreArrayRead(xx, &x));
866:     PetscCall(VecCUDARestoreArrayRead(yy, &y));
867:     PetscCall(VecCUDARestoreArrayWrite(zz, &z));
868:     PetscCall(PetscLogGpuFlops(2.0 * a->nz));
869:   } else {
870:     PetscCall(VecCopy(yy, zz));
871:   }
872:   PetscFunctionReturn(PETSC_SUCCESS);
873: }
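
/*
  The kernel and block shape can be selected at runtime, for example

      -mat_type seqsellcuda -mat_sell_spmv_cuda_kernel 9 -mat_sell_spmv_cuda_blocky 8

  and, when kernel 8 is selected, -mat_sell_spmv_cuda_chunksperblock controls how many chunks
  each block processes.
*/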

875: static PetscErrorCode MatSetFromOptions_SeqSELLCUDA(Mat A, PetscOptionItems *PetscOptionsObject)
876: {
877:   Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
878:   PetscInt         kernel, blocky;
879:   PetscBool        flg;

881:   PetscFunctionBegin;
882:   PetscOptionsHeadBegin(PetscOptionsObject, "SeqSELLCUDA options");
883:   PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_blocky", &blocky, &flg));
884:   if (flg) {
885:     PetscCheck(blocky == 2 || blocky == 4 || blocky == 8 || blocky == 16 || blocky == 32, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Unsupported blocky: %" PetscInt_FMT ", it should be in {2,4,8,16,32}", blocky);
886:     cudastruct->blocky = blocky;
887:   }
888:   PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_kernel", &kernel, &flg));
889:   if (flg) {
890:     PetscCheck(kernel >= 0 && kernel <= 9, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Wrong kernel choice: %" PetscInt_FMT ", it should be in [0,9]", kernel);
891:     cudastruct->kernelchoice = kernel;
892:     if (kernel == 8) { PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_chunksperblock", &cudastruct->chunksperblock, &flg)); }
893:   }
894:   PetscOptionsHeadEnd();
895:   PetscFunctionReturn(PETSC_SUCCESS);
896: }

898: PETSC_INTERN PetscErrorCode MatAssemblyEnd_SpMV_Preprocessing_Private(Mat A)
899: {
900:   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;

902:   PetscFunctionBegin;
903:   PetscCall(MatSeqSELLGetAvgSliceWidth(A, &a->avgslicewidth));
904:   PetscCall(MatSeqSELLGetMaxSliceWidth(A, &a->maxslicewidth));
905:   PetscCall(MatSeqSELLGetFillRatio(A, &a->fillratio));
906:   PetscFunctionReturn(PETSC_SUCCESS);
907: }

909: static PetscErrorCode MatAssemblyEnd_SeqSELLCUDA(Mat A, MatAssemblyType mode)
910: {
911:   PetscFunctionBegin;
912:   PetscCall(MatAssemblyEnd_SeqSELL(A, mode));
913:   PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(A));
914:   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
915:   if (A->factortype == MAT_FACTOR_NONE) { PetscCall(MatSeqSELLCUDACopyToGPU(A)); }
916:   A->ops->mult    = MatMult_SeqSELLCUDA;
917:   A->ops->multadd = MatMultAdd_SeqSELLCUDA;
918:   PetscFunctionReturn(PETSC_SUCCESS);
919: }

921: static PetscErrorCode MatZeroEntries_SeqSELLCUDA(Mat A)
922: {
923:   PetscBool    both = PETSC_FALSE;
924:   Mat_SeqSELL *a    = (Mat_SeqSELL *)A->data;

926:   PetscFunctionBegin;
927:   if (A->factortype == MAT_FACTOR_NONE) {
928:     Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
929:     if (cudastruct->val) {
930:       both = PETSC_TRUE;
931:       PetscCallCUDA(cudaMemset(cudastruct->val, 0, a->sliidx[a->totalslices] * sizeof(*cudastruct->val)));
932:     }
933:   }
934:   PetscCall(PetscArrayzero(a->val, a->sliidx[a->totalslices]));
935:   PetscCall(MatSeqSELLInvalidateDiagonal(A));
936:   if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
937:   else A->offloadmask = PETSC_OFFLOAD_CPU;
938:   PetscFunctionReturn(PETSC_SUCCESS);
939: }

941: static PetscErrorCode MatDestroy_SeqSELLCUDA(Mat A)
942: {
943:   PetscFunctionBegin;
944:   if (A->factortype == MAT_FACTOR_NONE && A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) PetscCall(MatSeqSELLCUDA_Destroy((Mat_SeqSELLCUDA **)&A->spptr));
945:   PetscCall(MatDestroy_SeqSELL(A));
946:   PetscFunctionReturn(PETSC_SUCCESS);
947: }

949: PETSC_INTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat);
950: static PetscErrorCode       MatDuplicate_SeqSELLCUDA(Mat A, MatDuplicateOption cpvalues, Mat *B)
951: {
952:   PetscFunctionBegin;
953:   PetscCall(MatDuplicate_SeqSELL(A, cpvalues, B));
954:   PetscCall(MatConvert_SeqSELL_SeqSELLCUDA(*B));
955:   PetscFunctionReturn(PETSC_SUCCESS);
956: }

958: PETSC_INTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat B)
959: {
960:   Mat_SeqSELLCUDA *cudastruct;

962:   PetscFunctionBegin;
963:   PetscCall(PetscFree(B->defaultvectype));
964:   PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));

966:   if (!B->spptr) {
967:     if (B->factortype == MAT_FACTOR_NONE) {
968:       PetscCall(PetscNew(&cudastruct));
969:       B->spptr = cudastruct;
970:     }
971:   }

973:   B->ops->assemblyend    = MatAssemblyEnd_SeqSELLCUDA;
974:   B->ops->destroy        = MatDestroy_SeqSELLCUDA;
975:   B->ops->setfromoptions = MatSetFromOptions_SeqSELLCUDA;
976:   B->ops->mult           = MatMult_SeqSELLCUDA;
977:   B->ops->multadd        = MatMultAdd_SeqSELLCUDA;
978:   B->ops->duplicate      = MatDuplicate_SeqSELLCUDA;
979:   B->ops->zeroentries    = MatZeroEntries_SeqSELLCUDA;

981:   /* No need to assemble the SeqSELL matrix again, but the SpMV preprocessing must still be done */
982:   PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(B));

984:   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELLCUDA));
985:   B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
986:   PetscFunctionReturn(PETSC_SUCCESS);
987: }

989: /*MC
990:   MATSEQSELLCUDA - MATSELLCUDA = "(seq)sellcuda" - A matrix type to be used for sparse matrices on NVIDIA GPUs.

992:   Options Database Keys:
993: +  -mat_type seqsellcuda - sets the matrix type to "seqsellcuda" during a call to `MatSetFromOptions()`
994: .  -mat_sell_spmv_cuda_kernel - selects an SpMV kernel (0-9) for MATSELLCUDA
995: -  -mat_sell_spmv_cuda_blocky - sets the y dimension of the thread block used by the SpMV kernels; the block is 2D with the x dimension fixed at 32, so the value must be one of {2,4,8,16,32}

997:   Level: beginner

999: .seealso: [](ch_matrices), `Mat`, `MATSELLCUDA`
1000: M*/

1002: PETSC_EXTERN PetscErrorCode MatCreate_SeqSELLCUDA(Mat B)
1003: {
1004:   PetscFunctionBegin;
1005:   PetscCall(MatCreate_SeqSELL(B));
1006:   PetscCall(MatConvert_SeqSELL_SeqSELLCUDA(B));
1007:   PetscCall(MatSetFromOptions(B));
1008:   PetscFunctionReturn(PETSC_SUCCESS);
1009: }