Actual source code: sellhip.hip.cpp

  1: #include "hip/hip_runtime.h"
  2: #include <hip/hip_runtime.h>

  4: #include <petscdevice_hip.h>
  5: #include <petsc/private/cupmatomics.hpp>
  6: #include <../src/mat/impls/sell/seq/sell.h>

  8: #define WARP_SIZE 64

 10: typedef struct {
 11:   PetscInt   maxallocmat;
 12:   PetscInt   totalentries;
 13:   PetscInt  *colidx; /* column index array, device pointer */
 14:   MatScalar *val;    /* value array, device pointer */
 15:   PetscInt   totalslices;
 16:   PetscInt  *sliidx; /* slice index array, device pointer */
 17:   PetscInt   nonzerostate;
 18:   PetscInt   kernelchoice;
 19:   PetscInt   blocky;
 20:   PetscInt   chunksperblock;
 21:   PetscInt   totalchunks;
 22:   PetscInt  *chunk_slice_map; /* starting slice for each chunk, device pointer */
 23: } Mat_SeqSELLHIP;
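
/* The device arrays above mirror the host SELL (sliced ELLPACK) layout: entries are stored slice by
   slice and, within a slice, in column-major order (the row index varies fastest), so consecutive
   entries of one row are sliceheight apart.  sliidx[s] is the offset of the first entry of slice s,
   hence sliidx[totalslices] equals the total number of stored entries. */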

 25: static PetscErrorCode MatSeqSELLHIP_Destroy(Mat_SeqSELLHIP **hipstruct)
 26: {
 27:   PetscFunctionBegin;
 28:   if (*hipstruct) {
 29:     if ((*hipstruct)->colidx) { PetscCallHIP(hipFree((*hipstruct)->colidx)); }
 30:     if ((*hipstruct)->val) { PetscCallHIP(hipFree((*hipstruct)->val)); }
 31:     if ((*hipstruct)->sliidx) { PetscCallHIP(hipFree((*hipstruct)->sliidx)); }
 32:     if ((*hipstruct)->chunk_slice_map) { PetscCallHIP(hipFree((*hipstruct)->chunk_slice_map)); }
 33:     PetscCall(PetscFree(*hipstruct));
 34:   }
 35:   PetscFunctionReturn(PETSC_SUCCESS);
 36: }

 38: static PetscErrorCode MatSeqSELLHIPCopyToGPU(Mat A)
 39: {
 40:   Mat_SeqSELLHIP *hipstruct = (Mat_SeqSELLHIP *)A->spptr;
 41:   Mat_SeqSELL    *a         = (Mat_SeqSELL *)A->data;

 43:   PetscFunctionBegin;
 44:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
 45:     PetscCall(PetscLogEventBegin(MAT_HIPCopyToGPU, A, 0, 0, 0));
 46:     if (A->assembled && A->nonzerostate == hipstruct->nonzerostate) {
 47:       /* copy values only */
 48:       PetscCallHIP(hipMemcpy(hipstruct->val, a->val, a->sliidx[a->totalslices] * sizeof(MatScalar), hipMemcpyHostToDevice));
 49:       PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar))));
 50:     } else {
 51:       if (hipstruct->colidx) PetscCallHIP(hipFree(hipstruct->colidx));
 52:       if (hipstruct->val) PetscCallHIP(hipFree(hipstruct->val));
 53:       if (hipstruct->sliidx) PetscCallHIP(hipFree(hipstruct->sliidx));
 54:       if (hipstruct->chunk_slice_map) PetscCallHIP(hipFree(hipstruct->chunk_slice_map));
 55:       hipstruct->maxallocmat  = a->maxallocmat;
 56:       hipstruct->totalentries = a->sliidx[a->totalslices];
 57:       hipstruct->totalslices  = a->totalslices;
 58:       hipstruct->totalchunks  = a->totalchunks;
 59:       PetscCallHIP(hipMalloc((void **)&hipstruct->colidx, a->maxallocmat * sizeof(*hipstruct->colidx)));
 60:       PetscCallHIP(hipMalloc((void **)&hipstruct->val, a->maxallocmat * sizeof(*hipstruct->val)));
 61:       /* copy values, nz or maxallocmat? */
 62:       PetscCallHIP(hipMemcpy(hipstruct->colidx, a->colidx, a->sliidx[a->totalslices] * sizeof(*a->colidx), hipMemcpyHostToDevice));
 63:       PetscCallHIP(hipMemcpy(hipstruct->val, a->val, a->sliidx[a->totalslices] * sizeof(*a->val), hipMemcpyHostToDevice));

 65:       PetscCallHIP(hipMalloc((void **)&hipstruct->sliidx, (a->totalslices + 1) * sizeof(*hipstruct->sliidx)));
 66:       PetscCallHIP(hipMemcpy(hipstruct->sliidx, a->sliidx, (a->totalslices + 1) * sizeof(*a->sliidx), hipMemcpyHostToDevice));
 67:       PetscCallHIP(hipMalloc((void **)&hipstruct->chunk_slice_map, a->totalchunks * sizeof(*hipstruct->chunk_slice_map)));
 68:       PetscCallHIP(hipMemcpy(hipstruct->chunk_slice_map, a->chunk_slice_map, a->totalchunks * sizeof(*a->chunk_slice_map), hipMemcpyHostToDevice));
 69:       PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar) + sizeof(PetscInt)) + (a->totalslices + 1 + a->totalchunks) * sizeof(PetscInt)));
 70:     }
 71:     PetscCallHIP(WaitForHIP());
 72:     PetscCall(PetscLogEventEnd(MAT_HIPCopyToGPU, A, 0, 0, 0));
 73:     A->offloadmask = PETSC_OFFLOAD_BOTH;
 74:   }
 75:   PetscFunctionReturn(PETSC_SUCCESS);
 76: }
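
/* SpMV kernels below: the "basic" kernels use one thread per row; tiled kernels 2-6 reduce partial
   sums in shared memory with a 2D thread block; kernel 7 uses one warp per slice (small slice width)
   and kernel 9 one block per slice (large slice width), both reducing with __shfl_down; kernel 8 is a
   load-balancing variant that assigns a fixed number of nonzero chunks to each block and accumulates
   results with atomics. */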

 78: static __global__ void matmult_seqsell_basic_kernel(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
 79: {
 80:   PetscInt  i, row, slice_id, row_in_slice;
 81:   MatScalar sum;
 82:   /* one thread per row. */
 83:   row = blockIdx.x * blockDim.x + threadIdx.x;
 84:   if (row < nrows) {
 85:     slice_id     = row / sliceheight;
 86:     row_in_slice = row % sliceheight;
 87:     sum          = 0.0;
 88:     for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += sliceheight) sum += aval[i] * x[acolidx[i]];
 89:     y[row] = sum;
 90:   }
 91: }

 93: static __global__ void matmultadd_seqsell_basic_kernel(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
 94: {
 95:   PetscInt  i, row, slice_id, row_in_slice;
 96:   MatScalar sum;
 97:   /* one thread per row. */
 98:   row = blockIdx.x * blockDim.x + threadIdx.x;
 99:   if (row < nrows) {
100:     slice_id     = row / sliceheight;
101:     row_in_slice = row % sliceheight;
102:     sum          = 0.0;
103:     for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += sliceheight) sum += aval[i] * x[acolidx[i]];
104:     z[row] = y[row] + sum;
105:   }
106: }

108: #if !defined(PETSC_USE_COMPLEX)
109: PETSC_PRAGMA_DIAGNOSTIC_IGNORED_BEGIN("-Wpass-failed")
110: /* use 1 block per slice, suitable for large slice width */
111: template <int BLOCKY>
112: __global__ void matmult_seqsell_tiled_kernel9(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
113: {
114:   __shared__ MatScalar shared[WARP_SIZE][BLOCKY];
115:   PetscInt             i, row, slice_id = blockIdx.x;
116:   int                  tid = threadIdx.x + threadIdx.y * WARP_SIZE;
117:   /* transposed index */
118:   int         tidx = tid % BLOCKY;
119:   int         tidy = tid / BLOCKY;
120:   PetscScalar t    = 0.0;

122:   row = slice_id * sliceheight + threadIdx.x % sliceheight;
123:   if (row < nrows) {
124:     for (i = sliidx[slice_id] + threadIdx.x + WARP_SIZE * threadIdx.y; i < sliidx[slice_id + 1]; i += WARP_SIZE * BLOCKY) t += aval[i] * x[acolidx[i]];
125:   }
126:   #pragma unroll
127:   for (int offset = WARP_SIZE / 2; offset >= sliceheight; offset /= 2) { t += __shfl_down(t, offset); }
128:   /* transpose layout to reduce each row using warp shfl */
129:   if (threadIdx.x < sliceheight) shared[threadIdx.x][threadIdx.y] = t;
130:   __syncthreads();
131:   if (tidy < sliceheight) t = shared[tidy][tidx];
132:   #pragma unroll
133:   for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down(t, offset, BLOCKY); }
134:   if (tidx == 0 && tidy < sliceheight) { shared[0][tidy] = t; }
135:   __syncthreads();
136:   if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) { y[row] = shared[0][threadIdx.x]; }
137: }

139: /* use 1 block per slice, suitable for large slice width */
140: template <int BLOCKY>
141: __global__ void matmultadd_seqsell_tiled_kernel9(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
142: {
143:   __shared__ MatScalar shared[WARP_SIZE][BLOCKY];
144:   PetscInt             i, row, slice_id = blockIdx.x;
145:   int                  tid = threadIdx.x + threadIdx.y * WARP_SIZE;
146:   /* transposed index */
147:   int         tidx = tid % BLOCKY;
148:   int         tidy = tid / BLOCKY;
149:   PetscScalar t    = 0.0;

151:   row = slice_id * sliceheight + threadIdx.x % sliceheight;
152:   if (row < nrows) {
153:     for (i = sliidx[slice_id] + threadIdx.x + WARP_SIZE * threadIdx.y; i < sliidx[slice_id + 1]; i += WARP_SIZE * BLOCKY) t += aval[i] * x[acolidx[i]];
154:   }
155:   #pragma unroll
156:   for (int offset = WARP_SIZE / 2; offset >= sliceheight; offset /= 2) { t += __shfl_down(t, offset); }
157:   /* transpose layout to reduce each row using warp shfl */
158:   if (threadIdx.x < sliceheight) shared[threadIdx.x][threadIdx.y] = t;
159:   __syncthreads();
160:   if (tidy < sliceheight) t = shared[tidy][tidx];
161:   #pragma unroll
162:   for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down(t, offset, BLOCKY); }
163:   if (tidx == 0 && tidy < sliceheight) { shared[0][tidy] = t; }
164:   __syncthreads();
165:   if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) { z[row] = y[row] + shared[0][threadIdx.x]; }
166: }
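
/* segment_scan: segmented sum over the 2*BLOCKY half-warps of a block.  flag[] tags each half-warp
   with the slice it belongs to; lane-wise partial sums carrying the same tag are folded into the
   first (head) half-warp of that slice.  The return value is true exactly for threads in a head
   half-warp, i.e. the ones that must write the accumulated result. */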

168: template <int BLOCKY>
169: __device__ __forceinline__ static bool segment_scan(PetscInt flag[], MatScalar shared[], PetscScalar *val)
170: {
171:   bool head = true;
172:   #pragma unroll
173:   for (int i = 1; i < BLOCKY * 2; i <<= 1) {
174:     int halfwarpid                                = threadIdx.y * 2 + threadIdx.x / (WARP_SIZE / 2);
175:     shared[threadIdx.x + threadIdx.y * WARP_SIZE] = 0;
176:     if (halfwarpid >= i && flag[halfwarpid - i] == flag[halfwarpid]) {
177:       shared[threadIdx.x + threadIdx.y * WARP_SIZE] = *val;
178:       if (i == 1) head = false;
179:     }
180:     __syncthreads();
181:     if (halfwarpid < BLOCKY * 2 - i) *val += shared[threadIdx.x + threadIdx.y * WARP_SIZE + i * WARP_SIZE];
182:     __syncthreads();
183:   }
184:   return head;
185: }

187: /* load-balancing version. Chunksize is equal to the number of threads per block */
188: template <int BLOCKY>
189: __global__ void matmult_seqsell_tiled_kernel8(PetscInt nrows, PetscInt sliceheight, PetscInt chunksperblock, PetscInt totalchunks, const PetscInt *chunk_slice_map, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
190: {
191:   __shared__ MatScalar shared[BLOCKY * WARP_SIZE];
192:   PetscInt             gid, row, start_slice, cid;
193:   PetscScalar          t = 0.0;
194:   AtomicAdd<MatScalar> atomAdd;
195:   /* zero out y */
196:   for (int iter = 0; iter < 1 + (nrows - 1) / (gridDim.x * WARP_SIZE * BLOCKY); iter++) {
197:     gid = gridDim.x * WARP_SIZE * BLOCKY * iter + blockIdx.x * BLOCKY * WARP_SIZE + threadIdx.y * WARP_SIZE + threadIdx.x;
198:     if (gid < nrows) y[gid] = 0.0;
199:   }
200:   for (int iter = 0; iter < chunksperblock; iter++) {
201:     cid = blockIdx.x * chunksperblock + iter; /* chunk id */
202:     if (cid < totalchunks) {
203:       start_slice = chunk_slice_map[cid]; /* starting slice at each iteration */
204:       gid         = cid * BLOCKY * WARP_SIZE + threadIdx.y * WARP_SIZE + threadIdx.x;
205:       if ((cid + 1) * BLOCKY * WARP_SIZE > sliidx[start_slice + 1]) { /* this iteration covers more than one slice */
206:         __shared__ PetscInt flag[BLOCKY * 2];
207:         bool                write;
208:         PetscInt            slice_id = start_slice, totalslices = PetscCeilIntMacro(nrows, sliceheight), totalentries = sliidx[totalslices];
209:         /* find out the slice that this element belongs to */
210:         while (gid < totalentries && gid >= sliidx[slice_id + 1]) slice_id++;
211:         if (threadIdx.x % (WARP_SIZE / 2) == 0) flag[threadIdx.y * 2 + threadIdx.x / (WARP_SIZE / 2)] = slice_id;
212:         row = slice_id * sliceheight + threadIdx.x % sliceheight;
213:         if (row < nrows && gid < totalentries) t = aval[gid] * x[acolidx[gid]];
214:         __syncthreads();
215:         write = segment_scan<BLOCKY>(flag, shared, &t);
216:         if (row < nrows && gid < totalentries && write) atomAdd(y[row], t);
217:         t = 0.0;
218:       } else { /* this iteration covers only one slice */
219:         row = start_slice * sliceheight + threadIdx.x % sliceheight;
220:         if (row < nrows) t += aval[gid] * x[acolidx[gid]];
221:         if (iter == chunksperblock - 1 || (cid + 2) * BLOCKY * WARP_SIZE > sliidx[start_slice + 1]) { /* last iteration or next iteration covers more than one slice */
222:           int tid = threadIdx.x + threadIdx.y * WARP_SIZE, tidx = tid % BLOCKY, tidy = tid / BLOCKY;
223:   /* reduction and write to output vector */
224:   #pragma unroll
225:           for (int offset = WARP_SIZE / 2; offset >= sliceheight; offset /= 2) { t += __shfl_down(t, offset); }
226:           /* transpose layout to reduce each row using warp shfl */
227:           if (threadIdx.x < sliceheight) shared[threadIdx.x * BLOCKY + threadIdx.y] = t; /* shared[threadIdx.x][threadIdx.y] = t */
228:           __syncthreads();
229:           if (tidy < sliceheight) t = shared[tidy * BLOCKY + tidx]; /* shared[tidy][tidx] */
230:   #pragma unroll
231:           for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down(t, offset, BLOCKY); }
232:           if (tidx == 0 && tidy < sliceheight) { shared[tidy] = t; /* shared[0][tidy] = t */ }
233:           __syncthreads();
234:           if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) atomAdd(y[row], shared[threadIdx.x]); /* shared[0][threadIdx.x] */
235:           t = 0.0;
236:         }
237:       }
238:     }
239:   }
240: }

242: /* load-balancing version. Chunksize is equal to the number of threads per block */
243: template <int BLOCKY>
244: __global__ void matmultadd_seqsell_tiled_kernel8(PetscInt nrows, PetscInt sliceheight, PetscInt chunksperblock, PetscInt totalchunks, const PetscInt *chunk_slice_map, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
245: {
246:   __shared__ MatScalar shared[BLOCKY * WARP_SIZE];
247:   PetscInt             gid, row, start_slice, cid;
248:   PetscScalar          t = 0.0;
249:   AtomicAdd<MatScalar> atomAdd;
250:   /* copy y to z */
251:   for (int iter = 0; iter < 1 + (nrows - 1) / (gridDim.x * WARP_SIZE * BLOCKY); iter++) {
252:     gid = gridDim.x * WARP_SIZE * BLOCKY * iter + blockIdx.x * BLOCKY * WARP_SIZE + threadIdx.y * WARP_SIZE + threadIdx.x;
253:     if (gid < nrows) z[gid] = y[gid];
254:   }
255:   for (int iter = 0; iter < chunksperblock; iter++) {
256:     cid = blockIdx.x * chunksperblock + iter; /* chunk id */
257:     if (cid < totalchunks) {
258:       start_slice = chunk_slice_map[cid]; /* starting slice at each iteration */
259:       gid         = cid * BLOCKY * WARP_SIZE + threadIdx.y * WARP_SIZE + threadIdx.x;
260:       if ((cid + 1) * BLOCKY * WARP_SIZE > sliidx[start_slice + 1]) { /* this iteration covers more than one slice */
261:         __shared__ PetscInt flag[BLOCKY * 2];
262:         bool                write;
263:         PetscInt            slice_id = start_slice, totalslices = PetscCeilIntMacro(nrows, sliceheight), totalentries = sliidx[totalslices];
264:         /* find out the slice that this element belongs to */
265:         while (gid < totalentries && gid >= sliidx[slice_id + 1]) slice_id++;
266:         if (threadIdx.x % (WARP_SIZE / 2) == 0) flag[threadIdx.y * 2 + threadIdx.x / (WARP_SIZE / 2)] = slice_id;
267:         row = slice_id * sliceheight + threadIdx.x % sliceheight;
268:         if (row < nrows && gid < totalentries) t = aval[gid] * x[acolidx[gid]];
269:         __syncthreads();
270:         write = segment_scan<BLOCKY>(flag, shared, &t);
271:         if (row < nrows && gid < totalentries && write) atomAdd(z[row], t);
272:         t = 0.0;
273:       } else { /* this iteration covers only one slice */
274:         row = start_slice * sliceheight + threadIdx.x % sliceheight;
275:         if (row < nrows) t += aval[gid] * x[acolidx[gid]];
276:         if (iter == chunksperblock - 1 || (cid + 2) * BLOCKY * WARP_SIZE > sliidx[start_slice + 1]) { /* last iteration or next iteration covers more than one slice */
277:           int tid = threadIdx.x + threadIdx.y * WARP_SIZE, tidx = tid % BLOCKY, tidy = tid / BLOCKY;
278:   /* reduction and write to output vector */
279:   #pragma unroll
280:           for (int offset = WARP_SIZE / 2; offset >= sliceheight; offset /= 2) { t += __shfl_down(t, offset); }
281:           /* transpose layout to reduce each row using warp shfl */
282:           if (threadIdx.x < sliceheight) shared[threadIdx.x * BLOCKY + threadIdx.y] = t; /* shared[threadIdx.x][threadIdx.y] = t */
283:           __syncthreads();
284:           if (tidy < sliceheight) t = shared[tidy * BLOCKY + tidx]; /* shared[tidy][tidx] */
285:   #pragma unroll
286:           for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down(t, offset, BLOCKY); }
287:           if (tidx == 0 && tidy < sliceheight) { shared[tidy] = t; /* shared[0][tidy] = t */ }
288:           __syncthreads();
289:           if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) atomAdd(z[row], shared[threadIdx.x]); /* shared[0][threadIdx.x] */
290:           t = 0.0;
291:         }
292:       }
293:     }
294:   }
295: }

297: /* use 1 warp per slice, suitable for small slice width */
298: static __global__ void matmult_seqsell_tiled_kernel7(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
299: {
300:   PetscInt i, row, slice_id;
301:   slice_id = blockIdx.x * blockDim.y + threadIdx.y;
302:   row      = slice_id * sliceheight + threadIdx.x % sliceheight;
303:   double t = 0.0;
304:   if (row < nrows) {
305:     for (i = sliidx[slice_id] + threadIdx.x; i < sliidx[slice_id + 1]; i += WARP_SIZE) t += aval[i] * x[acolidx[i]];
306:   }
307:   #pragma unroll
308:   for (int offset = WARP_SIZE / 2; offset >= sliceheight; offset /= 2) { t += __shfl_down(t, offset); }
309:   if (row < nrows && threadIdx.x < sliceheight) { y[row] = t; }
310: }

312: /* use 1 warp per slice, suitable for small slice width */
313: static __global__ void matmultadd_seqsell_tiled_kernel7(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
314: {
315:   PetscInt i, row, slice_id;
316:   slice_id = blockIdx.x * blockDim.y + threadIdx.y;
317:   row      = slice_id * sliceheight + threadIdx.x % sliceheight;
318:   double t = 0.0;
319:   if (row < nrows) {
320:     for (i = sliidx[slice_id] + threadIdx.x; i < sliidx[slice_id + 1]; i += WARP_SIZE) t += aval[i] * x[acolidx[i]];
321:   }
322:   #pragma unroll
323:   for (int offset = WARP_SIZE / 2; offset >= sliceheight; offset /= 2) { t += __shfl_down(t, offset); }
324:   if (row < nrows && threadIdx.x < sliceheight) { z[row] = y[row] + t; }
325: }
326: PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END()
327: #endif

329: /***********  Kernels 2-6 require a slice height smaller than 512, 256, 128, 64, 32, respectively. They are kept only for performance comparison  **********/

331: static __global__ void matmult_seqsell_tiled_kernel6(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
332: {
333:   __shared__ MatScalar shared[32 * 16];
334:   PetscInt             i, row, slice_id, row_in_slice;
335:   /* multiple threads per row. */
336:   row = blockIdx.x * blockDim.x + threadIdx.x;
337:   if (row < nrows) {
338:     slice_id     = row / sliceheight;
339:     row_in_slice = row % sliceheight;

341:     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
342:     for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
343:     __syncthreads();
344:     if (threadIdx.y < 16) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x]; }
345:     __syncthreads();
346:     if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
347:     __syncthreads();
348:     if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
349:     __syncthreads();
350:     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
351:     __syncthreads();
352:     if (threadIdx.y < 1) {
353:       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
354:       y[row] = shared[threadIdx.x];
355:     }
356:   }
357: }

359: static __global__ void matmult_seqsell_tiled_kernel5(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
360: {
361:   __shared__ MatScalar shared[32 * 16];
362:   PetscInt             i, row, slice_id, row_in_slice;
363:   /* multiple threads per row. */
364:   row = blockIdx.x * blockDim.x + threadIdx.x;
365:   if (row < nrows) {
366:     slice_id     = row / sliceheight;
367:     row_in_slice = row % sliceheight;

369:     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
370:     for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
371:     __syncthreads();
372:     if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
373:     __syncthreads();
374:     if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
375:     __syncthreads();
376:     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
377:     __syncthreads();
378:     if (threadIdx.y < 1) {
379:       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
380:       y[row] = shared[threadIdx.x];
381:     }
382:   }
383: }

385: static __global__ void matmult_seqsell_tiled_kernel4(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
386: {
387:   __shared__ MatScalar shared[32 * 16];
388:   PetscInt             i, row, slice_id, row_in_slice;
389:   /* multiple threads per row. */
390:   row = blockIdx.x * blockDim.x + threadIdx.x;
391:   if (row < nrows) {
392:     slice_id     = row / sliceheight;
393:     row_in_slice = row % sliceheight;

395:     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
396:     for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
397:     __syncthreads();
398:     if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
399:     __syncthreads();
400:     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
401:     __syncthreads();
402:     if (threadIdx.y < 1) {
403:       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
404:       y[row] = shared[threadIdx.x];
405:     }
406:   }
407: }

409: static __global__ void matmult_seqsell_tiled_kernel3(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
410: {
411:   __shared__ MatScalar shared[32 * 16];
412:   PetscInt             i, row, slice_id, row_in_slice;
413:   /* multiple threads per row. */
414:   row = blockIdx.x * blockDim.x + threadIdx.x;
415:   if (row < nrows) {
416:     slice_id     = row / sliceheight;
417:     row_in_slice = row % sliceheight;

419:     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
420:     for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
421:     __syncthreads();
422:     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
423:     __syncthreads();
424:     if (threadIdx.y < 1) {
425:       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
426:       y[row] = shared[threadIdx.x];
427:     }
428:   }
429: }

431: static __global__ void matmult_seqsell_tiled_kernel2(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
432: {
433:   __shared__ MatScalar shared[32 * 16];
434:   PetscInt             i, row, slice_id, row_in_slice;
435:   /* multiple threads per row. */
436:   row = blockIdx.x * blockDim.x + threadIdx.x;
437:   if (row < nrows) {
438:     slice_id     = row / sliceheight;
439:     row_in_slice = row % sliceheight;

441:     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
442:     for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
443:     __syncthreads();
444:     if (threadIdx.y < 1) {
445:       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
446:       y[row] = shared[threadIdx.x];
447:     }
448:   }
449: }

451: static __global__ void matmultadd_seqsell_tiled_kernel6(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
452: {
453:   __shared__ MatScalar shared[32 * 16];
454:   PetscInt             i, row, slice_id, row_in_slice;
455:   /* multiple threads per row. */
456:   row = blockIdx.x * blockDim.x + threadIdx.x;
457:   if (row < nrows) {
458:     slice_id     = row / sliceheight;
459:     row_in_slice = row % sliceheight;

461:     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
462:     for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
463:     __syncthreads();
464:     if (threadIdx.y < 16) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x]; }
465:     __syncthreads();
466:     if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
467:     __syncthreads();
468:     if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
469:     __syncthreads();
470:     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
471:     __syncthreads();
472:     if (threadIdx.y < 1) {
473:       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
474:       z[row] = y[row] + shared[threadIdx.x];
475:     }
476:   }
477: }

479: static __global__ void matmultadd_seqsell_tiled_kernel5(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
480: {
481:   __shared__ MatScalar shared[32 * 16];
482:   PetscInt             i, row, slice_id, row_in_slice;
483:   /* multiple threads per row. */
484:   row = blockIdx.x * blockDim.x + threadIdx.x;
485:   if (row < nrows) {
486:     slice_id     = row / sliceheight;
487:     row_in_slice = row % sliceheight;

489:     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
490:     for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
491:     __syncthreads();
492:     if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
493:     __syncthreads();
494:     if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
495:     __syncthreads();
496:     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
497:     __syncthreads();
498:     if (threadIdx.y < 1) {
499:       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
500:       z[row] = y[row] + shared[threadIdx.x];
501:     }
502:   }
503: }

505: static __global__ void matmultadd_seqsell_tiled_kernel4(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
506: {
507:   __shared__ MatScalar shared[32 * 16];
508:   PetscInt             i, row, slice_id, row_in_slice;
509:   /* multiple threads per row. */
510:   row = blockIdx.x * blockDim.x + threadIdx.x;
511:   if (row < nrows) {
512:     slice_id     = row / sliceheight;
513:     row_in_slice = row % sliceheight;

515:     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
516:     for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
517:     __syncthreads();
518:     if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
519:     __syncthreads();
520:     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
521:     __syncthreads();
522:     if (threadIdx.y < 1) {
523:       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
524:       z[row] = y[row] + shared[threadIdx.x];
525:     }
526:   }
527: }

529: static __global__ void matmultadd_seqsell_tiled_kernel3(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
530: {
531:   __shared__ MatScalar shared[32 * 16];
532:   PetscInt             i, row, slice_id, row_in_slice;
533:   /* multiple threads per row. */
534:   row = blockIdx.x * blockDim.x + threadIdx.x;
535:   if (row < nrows) {
536:     slice_id     = row / sliceheight;
537:     row_in_slice = row % sliceheight;

539:     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
540:     for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
541:     __syncthreads();
542:     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
543:     __syncthreads();
544:     if (threadIdx.y < 1) {
545:       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
546:       z[row] = y[row] + shared[threadIdx.x];
547:     }
548:   }
549: }

551: static __global__ void matmultadd_seqsell_tiled_kernel2(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
552: {
553:   __shared__ MatScalar shared[32 * 16];
554:   PetscInt             i, row, slice_id, row_in_slice;
555:   /* multiple threads per row. */
556:   row = blockIdx.x * blockDim.x + threadIdx.x;
557:   if (row < nrows) {
558:     slice_id     = row / sliceheight;
559:     row_in_slice = row % sliceheight;

561:     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
562:     for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
563:     __syncthreads();
564:     if (threadIdx.y < 1) {
565:       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
566:       z[row] = y[row] + shared[threadIdx.x];
567:     }
568:   }
569: }

571: static PetscErrorCode MatMult_SeqSELLHIP(Mat A, Vec xx, Vec yy)
572: {
573:   Mat_SeqSELL       *a         = (Mat_SeqSELL *)A->data;
574:   Mat_SeqSELLHIP    *hipstruct = (Mat_SeqSELLHIP *)A->spptr;
575:   PetscScalar       *y;
576:   const PetscScalar *x;
577:   PetscInt           nrows = A->rmap->n, sliceheight = a->sliceheight;
578:   MatScalar         *aval;
579:   PetscInt          *acolidx;
580:   PetscInt          *sliidx;
581:   PetscInt           nblocks, blocksize = 512; /* blocksize is fixed to be 512 */
582:   dim3               block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32);
583: #if !defined(PETSC_USE_COMPLEX)
584:   PetscInt  chunksperblock, nchunks, *chunk_slice_map;
585:   PetscReal maxoveravg;
586: #endif

588:   PetscFunctionBegin;
589:   PetscCheck(WARP_SIZE % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires the slice height to be a divisor of WARP_SIZE, but the input matrix has a slice height of %" PetscInt_FMT, sliceheight);
590:   PetscCheck(!(hipstruct->kernelchoice >= 2 && hipstruct->kernelchoice <= 6 && sliceheight > 32), PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Kernel choices {2-6} require the slice height of the matrix to be at most 32, but the current slice height is %" PetscInt_FMT, sliceheight);
591:   PetscCall(MatSeqSELLHIPCopyToGPU(A));
592:   /* the device arrays in hipstruct may not be allocated until MatSeqSELLHIPCopyToGPU() is called */
593:   aval    = hipstruct->val;
594:   acolidx = hipstruct->colidx;
595:   sliidx  = hipstruct->sliidx;

597:   PetscCall(VecHIPGetArrayRead(xx, &x));
598:   PetscCall(VecHIPGetArrayWrite(yy, &y));
599:   PetscCall(PetscLogGpuTimeBegin());

601:   switch (hipstruct->kernelchoice) {
602: #if !defined(PETSC_USE_COMPLEX)
603:   case 9: /* 1 slice per block */
604:     nblocks = 1 + (nrows - 1) / sliceheight;
605:     if (hipstruct->blocky == 2) {
606:       matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
607:     } else if (hipstruct->blocky == 4) {
608:       matmult_seqsell_tiled_kernel9<4><<<nblocks, dim3(WARP_SIZE, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
609:     } else if (hipstruct->blocky == 8) {
610:       matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
611:     } else if (hipstruct->blocky == 16) {
612:       matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
613:     } else {
614:       matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
615:     }
616:     break;
617:   case 7: /* each block handles blocky slices */
618:     nblocks = 1 + (nrows - 1) / (hipstruct->blocky * sliceheight);
619:     if (hipstruct->blocky == 2) {
620:       matmult_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
621:     } else if (hipstruct->blocky == 4) {
622:       matmult_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
623:     } else if (hipstruct->blocky == 8) {
624:       matmult_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
625:     } else if (hipstruct->blocky == 16) {
626:       matmult_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
627:     } else {
628:       nblocks = 1 + (nrows - 1) / (2 * sliceheight);
629:       matmult_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
630:     }
631:     break;
632: #endif
633:   case 6:
634:     nblocks = 1 + (nrows - 1) / (blocksize / 32); /* 1 slice per block if sliceheight=32 */
635:     matmult_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
636:     break;
637:   case 5:
638:     nblocks = 1 + (nrows - 1) / (blocksize / 16); /* 2 slices per block if sliceheight=32*/
639:     matmult_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
640:     break;
641:   case 4:
642:     nblocks = 1 + (nrows - 1) / (blocksize / 8); /* 4 slices per block if sliceheight=32 */
643:     matmult_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
644:     break;
645:   case 3:
646:     nblocks = 1 + (nrows - 1) / (blocksize / 4); /* 8 slices per block if sliceheight=32 */
647:     matmult_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
648:     break;
649:   case 2: /* 16 slices per block if sliceheight=32 */
650:     nblocks = 1 + (nrows - 1) / (blocksize / 2);
651:     matmult_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
652:     break;
653:   case 1: /* 32 slices per block if sliceheight=32 */
654:     nblocks = 1 + (nrows - 1) / blocksize;
655:     matmult_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
656:     break;
657: #if !defined(PETSC_USE_COMPLEX)
658:   case 0:
659:     maxoveravg = a->maxslicewidth / a->avgslicewidth;
660:     if (maxoveravg > 12.0 && maxoveravg / nrows > 0.001) { /* important threshold */
661:       /* each block handles approximately one slice */
662:       PetscInt blocky = a->chunksize / 32;
663:       nchunks         = hipstruct->totalchunks;
664:       chunksperblock  = hipstruct->chunksperblock ? hipstruct->chunksperblock : 1 + (hipstruct->totalentries / hipstruct->totalslices - 1) / a->chunksize;
665:       nblocks         = 1 + (nchunks - 1) / chunksperblock;
666:       chunk_slice_map = hipstruct->chunk_slice_map;
667:       if (blocky == 2) {
668:         matmult_seqsell_tiled_kernel8<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
669:       } else if (blocky == 4) {
670:         matmult_seqsell_tiled_kernel8<4><<<nblocks, dim3(WARP_SIZE, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
671:       } else if (blocky == 8) {
672:         matmult_seqsell_tiled_kernel8<8><<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
673:       } else if (blocky == 16) {
674:         matmult_seqsell_tiled_kernel8<16><<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
675:       } else {
676:         matmult_seqsell_tiled_kernel8<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
677:       }
678:     } else {
679:       PetscInt avgslicesize = sliceheight * a->avgslicewidth;
680:       if (avgslicesize <= 432) {
681:         if (sliceheight * a->maxslicewidth < 2048 && nrows > 100000) {
682:           nblocks = 1 + (nrows - 1) / (2 * sliceheight); /* two slices per block */
683:           matmult_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
684:         } else {
685:           nblocks = 1 + (nrows - 1) / sliceheight;
686:           matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
687:         }
688:       } else if (avgslicesize <= 2400) {
689:         nblocks = 1 + (nrows - 1) / sliceheight;
690:         matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
691:       } else {
692:         nblocks = 1 + (nrows - 1) / sliceheight;
693:         matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
694:       }
695:     }
696:     break;
697: #endif
698:   default:
699:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported kernel choice %" PetscInt_FMT " for MatMult_SeqSELLHIP.", hipstruct->kernelchoice);
700:   }
701:   PetscCall(PetscLogGpuTimeEnd());
702:   PetscCall(VecHIPRestoreArrayRead(xx, &x));
703:   PetscCall(VecHIPRestoreArrayWrite(yy, &y));
704:   PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
705:   PetscFunctionReturn(PETSC_SUCCESS);
706: }

708: static PetscErrorCode MatMultAdd_SeqSELLHIP(Mat A, Vec xx, Vec yy, Vec zz)
709: {
710:   Mat_SeqSELL       *a         = (Mat_SeqSELL *)A->data;
711:   Mat_SeqSELLHIP    *hipstruct = (Mat_SeqSELLHIP *)A->spptr;
712:   PetscScalar       *z;
713:   const PetscScalar *y, *x;
714:   PetscInt           nrows = A->rmap->n, sliceheight = a->sliceheight;
715:   MatScalar         *aval    = hipstruct->val;
716:   PetscInt          *acolidx = hipstruct->colidx;
717:   PetscInt          *sliidx  = hipstruct->sliidx;
718: #if !defined(PETSC_USE_COMPLEX)
719:   PetscReal maxoveravg;
720:   PetscInt  chunksperblock, nchunks, *chunk_slice_map;
721:   PetscInt  blocky = hipstruct->blocky;
722: #endif

724:   PetscFunctionBegin;
725:   PetscCheck(WARP_SIZE % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires the slice height to be a divisor of WARP_SIZE, but the input matrix has a slice height of %" PetscInt_FMT, sliceheight);
726:   PetscCheck(!(hipstruct->kernelchoice >= 2 && hipstruct->kernelchoice <= 6 && sliceheight > 32), PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Kernel choices {2-6} require the slice height of the matrix to be at most 32, but the current slice height is %" PetscInt_FMT, sliceheight);
727:   PetscCall(MatSeqSELLHIPCopyToGPU(A));
728:   if (a->nz) {
729:     PetscInt nblocks, blocksize = 512;
730:     dim3     block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32);
731:     PetscCall(VecHIPGetArrayRead(xx, &x));
732:     PetscCall(VecHIPGetArrayRead(yy, &y));
733:     PetscCall(VecHIPGetArrayWrite(zz, &z));
734:     PetscCall(PetscLogGpuTimeBegin());

736:     switch (hipstruct->kernelchoice) {
737: #if !defined(PETSC_USE_COMPLEX)
738:     case 9:
739:       nblocks = 1 + (nrows - 1) / sliceheight;
740:       if (blocky == 2) {
741:         matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
742:       } else if (blocky == 4) {
743:         matmultadd_seqsell_tiled_kernel9<4><<<nblocks, dim3(WARP_SIZE, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
744:       } else if (blocky == 8) {
745:         matmultadd_seqsell_tiled_kernel9<8><<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
746:       } else if (blocky == 16) {
747:         matmultadd_seqsell_tiled_kernel9<16><<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
748:       } else {
749:         matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
750:       }
751:       break;
752:     case 8:
753:       /* each block handles approximately one slice */
754:       nchunks         = hipstruct->totalchunks;
755:       blocky          = a->chunksize / 32;
756:       chunksperblock  = hipstruct->chunksperblock ? hipstruct->chunksperblock : 1 + (hipstruct->totalentries / hipstruct->totalslices - 1) / a->chunksize;
757:       nblocks         = 1 + (nchunks - 1) / chunksperblock;
758:       chunk_slice_map = hipstruct->chunk_slice_map;
759:       if (blocky == 2) {
760:         matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
761:       } else if (blocky == 4) {
762:         matmultadd_seqsell_tiled_kernel8<4><<<nblocks, dim3(WARP_SIZE, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
763:       } else if (blocky == 8) {
764:         matmultadd_seqsell_tiled_kernel8<8><<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
765:       } else if (blocky == 16) {
766:         matmultadd_seqsell_tiled_kernel8<16><<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
767:       } else {
768:         matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
769:       }
770:       break;
771:     case 7:
772:       nblocks = 1 + (nrows - 1) / (blocky * sliceheight);
773:       if (blocky == 2) {
774:         matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
775:       } else if (blocky == 4) {
776:         matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
777:       } else if (blocky == 8) {
778:         matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
779:       } else if (blocky == 16) {
780:         matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
781:       } else {
782:         nblocks = 1 + (nrows - 1) / (2 * sliceheight);
783:         matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
784:       }
785:       break;
786: #endif
787:     case 6:
788:       nblocks = 1 + (nrows - 1) / (blocksize / 32);
789:       matmultadd_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
790:       break;
791:     case 5:
792:       nblocks = 1 + (nrows - 1) / (blocksize / 16);
793:       matmultadd_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
794:       break;
795:     case 4:
796:       nblocks = 1 + (nrows - 1) / (blocksize / 8);
797:       matmultadd_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
798:       break;
799:     case 3:
800:       nblocks = 1 + (nrows - 1) / (blocksize / 4);
801:       matmultadd_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
802:       break;
803:     case 2:
804:       nblocks = 1 + (nrows - 1) / (blocksize / 2);
805:       matmultadd_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
806:       break;
807:     case 1:
808:       nblocks = 1 + (nrows - 1) / blocksize;
809:       matmultadd_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
810:       break;
811: #if !defined(PETSC_USE_COMPLEX)
812:     case 0:
813:       maxoveravg = a->maxslicewidth / a->avgslicewidth;
814:       if (maxoveravg > 12.0 && maxoveravg / nrows > 0.001) { /* important threshold */
815:         /* each block handles approximately one slice */
816:         nchunks         = hipstruct->totalchunks;
817:         blocky          = a->chunksize / 32;
818:         chunksperblock  = hipstruct->chunksperblock ? hipstruct->chunksperblock : 1 + (hipstruct->totalentries / hipstruct->totalslices - 1) / a->chunksize;
819:         nblocks         = 1 + (nchunks - 1) / chunksperblock;
820:         chunk_slice_map = hipstruct->chunk_slice_map;
821:         if (blocky == 2) {
822:           matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
823:         } else if (blocky == 4) {
824:           matmultadd_seqsell_tiled_kernel8<4><<<nblocks, dim3(WARP_SIZE, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
825:         } else if (blocky == 8) {
826:           matmultadd_seqsell_tiled_kernel8<8><<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
827:         } else if (blocky == 16) {
828:           matmultadd_seqsell_tiled_kernel8<16><<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
829:         } else {
830:           matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
831:         }
832:       } else {
833:         PetscInt avgslicesize = sliceheight * a->avgslicewidth;
834:         if (avgslicesize <= 432) {
835:           if (sliceheight * a->maxslicewidth < 2048 && nrows > 100000) {
836:             nblocks = 1 + (nrows - 1) / (2 * sliceheight); /* two slices per block */
837:             matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
838:           } else {
839:             nblocks = 1 + (nrows - 1) / sliceheight;
840:             matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
841:           }
842:         } else if (avgslicesize <= 2400) {
843:           nblocks = 1 + (nrows - 1) / sliceheight;
844:           matmultadd_seqsell_tiled_kernel9<8><<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
845:         } else {
846:           nblocks = 1 + (nrows - 1) / sliceheight;
847:           matmultadd_seqsell_tiled_kernel9<16><<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
848:         }
849:       }
850:       break;
851: #endif
852:     default:
853:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported kernel choice %" PetscInt_FMT " for MatMultAdd_SeqSELLHIP.", hipstruct->kernelchoice);
854:     }
855:     PetscCall(PetscLogGpuTimeEnd());
856:     PetscCall(VecHIPRestoreArrayRead(xx, &x));
857:     PetscCall(VecHIPRestoreArrayRead(yy, &y));
858:     PetscCall(VecHIPRestoreArrayWrite(zz, &z));
859:     PetscCall(PetscLogGpuFlops(2.0 * a->nz));
860:   } else {
861:     PetscCall(VecCopy(yy, zz));
862:   }
863:   PetscFunctionReturn(PETSC_SUCCESS);
864: }

866: static PetscErrorCode MatSetFromOptions_SeqSELLHIP(Mat A, PetscOptionItems *PetscOptionsObject)
867: {
868:   Mat_SeqSELLHIP *hipstruct = (Mat_SeqSELLHIP *)A->spptr;
869:   PetscInt        kernel, blocky;
870:   PetscBool       flg;

872:   PetscFunctionBegin;
873:   PetscOptionsHeadBegin(PetscOptionsObject, "SeqSELLHIP options");
874:   PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_hip_blocky", &blocky, &flg));
875:   if (flg) {
876:     PetscCheck(blocky == 2 || blocky == 4 || blocky == 8 || blocky == 16 || blocky == 32, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Unsupported blocky: %" PetscInt_FMT " it should be in {2,4,8,16,32}", blocky);
877:     hipstruct->blocky = blocky;
878:   }
879:   PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_hip_kernel", &kernel, &flg));
880:   if (flg) {
881:     PetscCheck(kernel >= 0 && kernel <= 9, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Wrong kernel choice: %" PetscInt_FMT " it should be in [0,9]", kernel);
882:     hipstruct->kernelchoice = kernel;
883:     if (kernel == 8) { PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_hip_chunksperblock", &hipstruct->chunksperblock, &flg)); }
884:   }
885:   PetscOptionsHeadEnd();
886:   PetscFunctionReturn(PETSC_SUCCESS);
887: }

889: PETSC_INTERN PetscErrorCode MatAssemblyEnd_SpMV_Preprocessing_Private(Mat A)
890: {
891:   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;

893:   PetscFunctionBegin;
894:   PetscCall(MatSeqSELLGetAvgSliceWidth(A, &a->avgslicewidth));
895:   PetscCall(MatSeqSELLGetMaxSliceWidth(A, &a->maxslicewidth));
896:   PetscCall(MatSeqSELLGetFillRatio(A, &a->fillratio));
897:   PetscFunctionReturn(PETSC_SUCCESS);
898: }

900: static PetscErrorCode MatAssemblyEnd_SeqSELLHIP(Mat A, MatAssemblyType mode)
901: {
902:   PetscFunctionBegin;
903:   PetscCall(MatAssemblyEnd_SeqSELL(A, mode));
904:   PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(A));
905:   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
906:   if (A->factortype == MAT_FACTOR_NONE) { PetscCall(MatSeqSELLHIPCopyToGPU(A)); }
907:   A->ops->mult    = MatMult_SeqSELLHIP;
908:   A->ops->multadd = MatMultAdd_SeqSELLHIP;
909:   PetscFunctionReturn(PETSC_SUCCESS);
910: }

912: static PetscErrorCode MatZeroEntries_SeqSELLHIP(Mat A)
913: {
914:   PetscBool    both = PETSC_FALSE;
915:   Mat_SeqSELL *a    = (Mat_SeqSELL *)A->data;

917:   PetscFunctionBegin;
918:   if (A->factortype == MAT_FACTOR_NONE) {
919:     Mat_SeqSELLHIP *hipstruct = (Mat_SeqSELLHIP *)A->spptr;
920:     if (hipstruct->val) {
921:       both = PETSC_TRUE;
922:       PetscCallHIP(hipMemset(hipstruct->val, 0, a->sliidx[a->totalslices] * sizeof(*hipstruct->val)));
923:     }
924:   }
925:   PetscCall(PetscArrayzero(a->val, a->sliidx[a->totalslices]));
926:   PetscCall(MatSeqSELLInvalidateDiagonal(A));
927:   if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
928:   else A->offloadmask = PETSC_OFFLOAD_CPU;
929:   PetscFunctionReturn(PETSC_SUCCESS);
930: }

932: static PetscErrorCode MatDestroy_SeqSELLHIP(Mat A)
933: {
934:   PetscFunctionBegin;
935:   if (A->factortype == MAT_FACTOR_NONE && A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) PetscCall(MatSeqSELLHIP_Destroy((Mat_SeqSELLHIP **)&A->spptr));
936:   PetscCall(MatDestroy_SeqSELL(A));
937:   PetscFunctionReturn(PETSC_SUCCESS);
938: }

940: PETSC_INTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLHIP(Mat);
941: static PetscErrorCode       MatDuplicate_SeqSELLHIP(Mat A, MatDuplicateOption cpvalues, Mat *B)
942: {
943:   PetscFunctionBegin;
944:   PetscCall(MatDuplicate_SeqSELL(A, cpvalues, B));
945:   PetscCall(MatConvert_SeqSELL_SeqSELLHIP(*B));
946:   PetscFunctionReturn(PETSC_SUCCESS);
947: }

949: PETSC_INTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLHIP(Mat B)
950: {
951:   Mat_SeqSELLHIP *hipstruct;

953:   PetscFunctionBegin;
954:   PetscCall(PetscFree(B->defaultvectype));
955:   PetscCall(PetscStrallocpy(VECHIP, &B->defaultvectype));

957:   if (!B->spptr) {
958:     if (B->factortype == MAT_FACTOR_NONE) {
959:       PetscCall(PetscNew(&hipstruct));
960:       B->spptr = hipstruct;
961:     }
962:   }

964:   B->ops->assemblyend    = MatAssemblyEnd_SeqSELLHIP;
965:   B->ops->destroy        = MatDestroy_SeqSELLHIP;
966:   B->ops->setfromoptions = MatSetFromOptions_SeqSELLHIP;
967:   B->ops->mult           = MatMult_SeqSELLHIP;
968:   B->ops->multadd        = MatMultAdd_SeqSELLHIP;
969:   B->ops->duplicate      = MatDuplicate_SeqSELLHIP;
970:   B->ops->zeroentries    = MatZeroEntries_SeqSELLHIP;

972:   /* No need to assemble SeqSELL, but need to do the preprocessing for SpMV */
973:   PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(B));

975:   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELLHIP));
976:   B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
977:   PetscFunctionReturn(PETSC_SUCCESS);
978: }

980: /*MC
981:   MATSEQSELLHIP - MATSELLHIP = "(seq)sellhip" - A matrix type to be used for sparse matrices on AMD GPUs.

983:   Options Database Keys:
984: +  -mat_type seqsellhip - sets the matrix type to "seqsellhip" during a call to `MatSetFromOptions()`
985: .  -mat_sell_spmv_hip_kernel - selects an SpMV kernel for MATSELLHIP
986: -  -mat_sell_spmv_hip_blocky - sets the y dimension of the block size of the SpMV kernels. These kernels use a 2D block with the x dimension equal to the warp size (normally 64 for AMD GPUs)

988:   Level: beginner

990: .seealso: [](ch_matrices), `Mat`, `MATSELLHIP`
991: M*/
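
/* Example (illustrative): the kernel and its block shape can be chosen at run time with the options
   registered above, e.g. "-mat_type seqsellhip -mat_sell_spmv_hip_kernel 7 -mat_sell_spmv_hip_blocky 4",
   or "-mat_sell_spmv_hip_kernel 8 -mat_sell_spmv_hip_chunksperblock 2" for the load-balancing kernel.
   With no option given, kernelchoice stays 0 and a kernel is selected heuristically from the slice-width
   statistics gathered in MatAssemblyEnd_SpMV_Preprocessing_Private(). */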

993: PETSC_EXTERN PetscErrorCode MatCreate_SeqSELLHIP(Mat B)
994: {
995:   PetscFunctionBegin;
996:   PetscCall(MatCreate_SeqSELL(B));
997:   PetscCall(MatConvert_SeqSELL_SeqSELLHIP(B));
998:   PetscCall(MatSetFromOptions(B));
999:   PetscFunctionReturn(PETSC_SUCCESS);
1000: }