Actual source code: sellhip.hip.cpp

  1: #include "hip/hip_runtime.h"
  2: #include <hip/hip_runtime.h>

  4: #include <petscdevice_hip.h>
  5: #include <petsc/private/cupmatomics.hpp>
  6: #include <../src/mat/impls/sell/seq/sell.h>

  8: #define WARP_SIZE 64
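      /* WARP_SIZE is the 64-thread AMD wavefront width; the SpMV kernels below require the slice height to divide this value. The Mat_SeqSELLHIP struct that follows mirrors the host SELL storage on the device: column indices, values, slice offsets, and the chunk-to-slice map used by the load-balanced kernels are all device pointers. */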

 10: typedef struct {
 11:   PetscInt   maxallocmat;
 12:   PetscInt   totalentries;
 13:   PetscInt  *colidx; /* column index array, device pointer */
 14:   MatScalar *val;    /* value array, device pointer */
 15:   PetscInt   totalslices;
 16:   PetscInt  *sliidx; /* slice index array, device pointer */
 17:   PetscInt   nonzerostate;
 18:   PetscInt   kernelchoice;
 19:   PetscInt   blocky;
 20:   PetscInt   chunksperblock;
 21:   PetscInt   totalchunks;
 22:   PetscInt  *chunk_slice_map; /* starting slice for each chunk, device pointer */
 23: } Mat_SeqSELLHIP;

 25: static PetscErrorCode MatSeqSELLHIP_Destroy(Mat_SeqSELLHIP **hipstruct)
 26: {
 27:   PetscFunctionBegin;
 28:   if (*hipstruct) {
 29:     if ((*hipstruct)->colidx) { PetscCallHIP(hipFree((*hipstruct)->colidx)); }
 30:     if ((*hipstruct)->val) { PetscCallHIP(hipFree((*hipstruct)->val)); }
 31:     if ((*hipstruct)->sliidx) { PetscCallHIP(hipFree((*hipstruct)->sliidx)); }
 32:     if ((*hipstruct)->chunk_slice_map) { PetscCallHIP(hipFree((*hipstruct)->chunk_slice_map)); }
 33:     PetscCall(PetscFree(*hipstruct));
 34:   }
 35:   PetscFunctionReturn(PETSC_SUCCESS);
 36: }

 38: static PetscErrorCode MatSeqSELLHIPCopyToGPU(Mat A)
 39: {
 40:   Mat_SeqSELLHIP *hipstruct = (Mat_SeqSELLHIP *)A->spptr;
 41:   Mat_SeqSELL    *a         = (Mat_SeqSELL *)A->data;

 43:   PetscFunctionBegin;
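       /* Transfer only when the host holds newer data (device arrays unallocated or the CPU copy ahead). If the matrix is assembled and the nonzero pattern is unchanged, only the value array is copied; otherwise the device arrays are (re)allocated and the column indices, values, slice offsets, and chunk-to-slice map are all copied over. */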
 44:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
 45:     PetscCall(PetscLogEventBegin(MAT_HIPCopyToGPU, A, 0, 0, 0));
 46:     if (A->assembled && A->nonzerostate == hipstruct->nonzerostate) {
 47:       /* copy values only */
 48:       PetscCallHIP(hipMemcpy(hipstruct->val, a->val, a->sliidx[a->totalslices] * sizeof(MatScalar), hipMemcpyHostToDevice));
 49:       PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar))));
 50:     } else {
 51:       if (hipstruct->colidx) PetscCallHIP(hipFree(hipstruct->colidx));
 52:       if (hipstruct->val) PetscCallHIP(hipFree(hipstruct->val));
 53:       if (hipstruct->sliidx) PetscCallHIP(hipFree(hipstruct->sliidx));
 54:       if (hipstruct->chunk_slice_map) PetscCallHIP(hipFree(hipstruct->chunk_slice_map));
 55:       hipstruct->maxallocmat  = a->maxallocmat;
 56:       hipstruct->totalentries = a->sliidx[a->totalslices];
 57:       hipstruct->totalslices  = a->totalslices;
 58:       hipstruct->totalchunks  = a->totalchunks;
 59:       PetscCallHIP(hipMalloc((void **)&hipstruct->colidx, a->maxallocmat * sizeof(*hipstruct->colidx)));
 60:       PetscCallHIP(hipMalloc((void **)&hipstruct->val, a->maxallocmat * sizeof(*hipstruct->val)));
 61:       /* copy values, nz or maxallocmat? */
 62:       PetscCallHIP(hipMemcpy(hipstruct->colidx, a->colidx, a->sliidx[a->totalslices] * sizeof(*a->colidx), hipMemcpyHostToDevice));
 63:       PetscCallHIP(hipMemcpy(hipstruct->val, a->val, a->sliidx[a->totalslices] * sizeof(*a->val), hipMemcpyHostToDevice));

 65:       PetscCallHIP(hipMalloc((void **)&hipstruct->sliidx, (a->totalslices + 1) * sizeof(*hipstruct->sliidx)));
 66:       PetscCallHIP(hipMemcpy(hipstruct->sliidx, a->sliidx, (a->totalslices + 1) * sizeof(*a->sliidx), hipMemcpyHostToDevice));
 67:       PetscCallHIP(hipMalloc((void **)&hipstruct->chunk_slice_map, a->totalchunks * sizeof(*hipstruct->chunk_slice_map)));
 68:       PetscCallHIP(hipMemcpy(hipstruct->chunk_slice_map, a->chunk_slice_map, a->totalchunks * sizeof(*a->chunk_slice_map), hipMemcpyHostToDevice));
 69:       PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar) + sizeof(PetscInt)) + (a->totalslices + 1 + a->totalchunks) * sizeof(PetscInt)));
 70:     }
 71:     PetscCallHIP(WaitForHIP());
 72:     PetscCall(PetscLogEventEnd(MAT_HIPCopyToGPU, A, 0, 0, 0));
 73:     A->offloadmask = PETSC_OFFLOAD_BOTH;
 74:   }
 75:   PetscFunctionReturn(PETSC_SUCCESS);
 76: }

 78: static __global__ void matmult_seqsell_basic_kernel(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
 79: {
 80:   PetscInt  i, row, slice_id, row_in_slice;
 81:   MatScalar sum;
 82:   /* one thread per row. */
 83:   row = blockIdx.x * blockDim.x + threadIdx.x;
 84:   if (row < nrows) {
 85:     slice_id     = row / sliceheight;
 86:     row_in_slice = row % sliceheight;
 87:     sum          = 0.0;
 88:     for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += sliceheight) sum += aval[i] * x[acolidx[i]];
 89:     y[row] = sum;
 90:   }
 91: }

 93: static __global__ void matmultadd_seqsell_basic_kernel(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
 94: {
 95:   PetscInt  i, row, slice_id, row_in_slice;
 96:   MatScalar sum;
 97:   /* one thread per row. */
 98:   row = blockIdx.x * blockDim.x + threadIdx.x;
 99:   if (row < nrows) {
100:     slice_id     = row / sliceheight;
101:     row_in_slice = row % sliceheight;
102:     sum          = 0.0;
103:     for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += sliceheight) sum += aval[i] * x[acolidx[i]];
104:     z[row] = y[row] + sum;
105:   }
106: }

108: #if !defined(PETSC_USE_COMPLEX)
109: /* use 1 block per slice, suitable for large slice width */
110: template <int BLOCKY>
111: __global__ void matmult_seqsell_tiled_kernel9(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
112: {
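       /* A block of WARP_SIZE x BLOCKY threads handles one slice: partial sums are first reduced within each warp by shuffling down to the first sliceheight lanes, then the per-warp results are transposed through shared memory so that each row's BLOCKY contributions can be combined with a second, width-BLOCKY shuffle reduction. */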
113:   __shared__ MatScalar shared[WARP_SIZE][BLOCKY];
114:   PetscInt             i, row, slice_id = blockIdx.x;
115:   int                  tid = threadIdx.x + threadIdx.y * WARP_SIZE;
116:   /* transposed index */
117:   int         tidx = tid % BLOCKY;
118:   int         tidy = tid / BLOCKY;
119:   PetscScalar t    = 0.0;

121:   row = slice_id * sliceheight + threadIdx.x % sliceheight;
122:   if (row < nrows) {
123:     for (i = sliidx[slice_id] + threadIdx.x + WARP_SIZE * threadIdx.y; i < sliidx[slice_id + 1]; i += WARP_SIZE * BLOCKY) t += aval[i] * x[acolidx[i]];
124:   }
125:   #pragma unroll
126:   for (int offset = WARP_SIZE / 2; offset >= sliceheight; offset /= 2) { t += __shfl_down(t, offset); }
127:   /* transpose layout to reduce each row using warp shfl */
128:   if (threadIdx.x < sliceheight) shared[threadIdx.x][threadIdx.y] = t;
129:   __syncthreads();
130:   if (tidy < sliceheight) t = shared[tidy][tidx];
131:   #pragma unroll
132:   for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down(t, offset, BLOCKY); }
133:   if (tidx == 0 && tidy < sliceheight) { shared[0][tidy] = t; }
134:   __syncthreads();
135:   if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) { y[row] = shared[0][threadIdx.x]; }
136: }

138: /* use 1 block per slice, suitable for large slice width */
139: template <int BLOCKY>
140: __global__ void matmultadd_seqsell_tiled_kernel9(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
141: {
142:   __shared__ MatScalar shared[WARP_SIZE][BLOCKY];
143:   PetscInt             i, row, slice_id = blockIdx.x;
144:   int                  tid = threadIdx.x + threadIdx.y * WARP_SIZE;
145:   /* transposed index */
146:   int         tidx = tid % BLOCKY;
147:   int         tidy = tid / BLOCKY;
148:   PetscScalar t    = 0.0;

150:   row = slice_id * sliceheight + threadIdx.x % sliceheight;
151:   if (row < nrows) {
152:     for (i = sliidx[slice_id] + threadIdx.x + WARP_SIZE * threadIdx.y; i < sliidx[slice_id + 1]; i += WARP_SIZE * BLOCKY) t += aval[i] * x[acolidx[i]];
153:   }
154:   #pragma unroll
155:   for (int offset = WARP_SIZE / 2; offset >= sliceheight; offset /= 2) { t += __shfl_down(t, offset); }
156:   /* transpose layout to reduce each row using warp shfl */
157:   if (threadIdx.x < sliceheight) shared[threadIdx.x][threadIdx.y] = t;
158:   __syncthreads();
159:   if (tidy < sliceheight) t = shared[tidy][tidx];
160:   #pragma unroll
161:   for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down(t, offset, BLOCKY); }
162:   if (tidx == 0 && tidy < sliceheight) { shared[0][tidy] = t; }
163:   __syncthreads();
164:   if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) { z[row] = y[row] + shared[0][threadIdx.x]; }
165: }

167: template <int BLOCKY>
168: __device__ __forceinline__ static bool segment_scan(PetscInt flag[], MatScalar shared[], PetscScalar *val)
169: {
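       /* Lane-wise segmented sum across the 2*BLOCKY half-warps of the block: partial sums from half-warps carrying the same flag (slice id) are combined through shared memory. Returns true only on the head half-warp of a segment, i.e. the one that should write the accumulated result. */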
170:   bool head = true;
171:   #pragma unroll
172:   for (int i = 1; i < BLOCKY * 2; i <<= 1) {
173:     int halfwarpid                                = threadIdx.y * 2 + threadIdx.x / (WARP_SIZE / 2);
174:     shared[threadIdx.x + threadIdx.y * WARP_SIZE] = 0;
175:     if (halfwarpid >= i && flag[halfwarpid - i] == flag[halfwarpid]) {
176:       shared[threadIdx.x + threadIdx.y * WARP_SIZE] = *val;
177:       if (i == 1) head = false;
178:     }
179:     __syncthreads();
180:     if (halfwarpid < BLOCKY * 2 - i) *val += shared[threadIdx.x + threadIdx.y * WARP_SIZE + i * WARP_SIZE];
181:     __syncthreads();
182:   }
183:   return head;
184: }

186: /* load-balancing version. Chunksize is equal to the number of threads per block */
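      /* Nonzeros are processed in fixed-size chunks of BLOCKY*WARP_SIZE entries; chunk_slice_map records the first slice touched by each chunk. A chunk contained in a single slice uses the same two-stage shuffle reduction as kernel 9, while a chunk spanning several slices falls back to a segmented scan; both paths accumulate into y with atomic adds because several chunks may contribute to the same rows. */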
187: template <int BLOCKY>
188: __global__ void matmult_seqsell_tiled_kernel8(PetscInt nrows, PetscInt sliceheight, PetscInt chunksperblock, PetscInt totalchunks, const PetscInt *chunk_slice_map, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
189: {
190:   __shared__ MatScalar shared[BLOCKY * WARP_SIZE];
191:   PetscInt             gid, row, start_slice, cid;
192:   PetscScalar          t = 0.0;
193:   AtomicAdd<MatScalar> atomAdd;
194:   /* zero out y */
195:   for (int iter = 0; iter < 1 + (nrows - 1) / (gridDim.x * WARP_SIZE * BLOCKY); iter++) {
196:     gid = gridDim.x * WARP_SIZE * BLOCKY * iter + blockIdx.x * BLOCKY * WARP_SIZE + threadIdx.y * WARP_SIZE + threadIdx.x;
197:     if (gid < nrows) y[gid] = 0.0;
198:   }
199:   for (int iter = 0; iter < chunksperblock; iter++) {
200:     cid = blockIdx.x * chunksperblock + iter; /* chunk id */
201:     if (cid < totalchunks) {
202:       start_slice = chunk_slice_map[cid]; /* starting slice at each iteration */
203:       gid         = cid * BLOCKY * WARP_SIZE + threadIdx.y * WARP_SIZE + threadIdx.x;
204:       if ((cid + 1) * BLOCKY * WARP_SIZE > sliidx[start_slice + 1]) { /* this iteration covers more than one slice */
205:         __shared__ PetscInt flag[BLOCKY * 2];
206:         bool                write;
207:         PetscInt            slice_id = start_slice, totalslices = PetscCeilInt(nrows, sliceheight), totalentries = sliidx[totalslices];
208:         /* find out the slice that this element belongs to */
209:         while (gid < totalentries && gid >= sliidx[slice_id + 1]) slice_id++;
210:         if (threadIdx.x % (WARP_SIZE / 2) == 0) flag[threadIdx.y * 2 + threadIdx.x / (WARP_SIZE / 2)] = slice_id;
211:         row = slice_id * sliceheight + threadIdx.x % sliceheight;
212:         if (row < nrows && gid < totalentries) t = aval[gid] * x[acolidx[gid]];
213:         __syncthreads();
214:         write = segment_scan<BLOCKY>(flag, shared, &t);
215:         if (row < nrows && gid < totalentries && write) atomAdd(y[row], t);
216:         t = 0.0;
217:       } else { /* this iteration covers only one slice */
218:         row = start_slice * sliceheight + threadIdx.x % sliceheight;
219:         if (row < nrows) t += aval[gid] * x[acolidx[gid]];
220:         if (iter == chunksperblock - 1 || (cid + 2) * BLOCKY * WARP_SIZE > sliidx[start_slice + 1]) { /* last iteration or next iteration covers more than one slice */
221:           int tid = threadIdx.x + threadIdx.y * WARP_SIZE, tidx = tid % BLOCKY, tidy = tid / BLOCKY;
222:   /* reduction and write to output vector */
223:   #pragma unroll
224:           for (int offset = WARP_SIZE / 2; offset >= sliceheight; offset /= 2) { t += __shfl_down(t, offset); }
225:           /* transpose layout to reduce each row using warp shfl */
226:           if (threadIdx.x < sliceheight) shared[threadIdx.x * BLOCKY + threadIdx.y] = t; /* shared[threadIdx.x][threadIdx.y] = t */
227:           __syncthreads();
228:           if (tidy < sliceheight) t = shared[tidy * BLOCKY + tidx]; /* shared[tidy][tidx] */
229:   #pragma unroll
230:           for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down(t, offset, BLOCKY); }
231:           if (tidx == 0 && tidy < sliceheight) { shared[tidy] = t; /* shared[0][tidy] = t */ }
232:           __syncthreads();
233:           if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) atomAdd(y[row], shared[threadIdx.x]); /* shared[0][threadIdx.x] */
234:           t = 0.0;
235:         }
236:       }
237:     }
238:   }
239: }

241: /* load-balancing version. Chunksize is equal to the number of threads per block */
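      /* Same chunk-based scheme as matmult_seqsell_tiled_kernel8 above, except that z is first initialized to y and the products are accumulated into z. */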
242: template <int BLOCKY>
243: __global__ void matmultadd_seqsell_tiled_kernel8(PetscInt nrows, PetscInt sliceheight, PetscInt chunksperblock, PetscInt totalchunks, const PetscInt *chunk_slice_map, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
244: {
245:   __shared__ MatScalar shared[BLOCKY * WARP_SIZE];
246:   PetscInt             gid, row, start_slice, cid;
247:   PetscScalar          t = 0.0;
248:   AtomicAdd<MatScalar> atomAdd;
249:   /* copy y to z */
250:   for (int iter = 0; iter < 1 + (nrows - 1) / (gridDim.x * WARP_SIZE * BLOCKY); iter++) {
251:     gid = gridDim.x * WARP_SIZE * BLOCKY * iter + blockIdx.x * BLOCKY * WARP_SIZE + threadIdx.y * WARP_SIZE + threadIdx.x;
252:     if (gid < nrows) z[gid] = y[gid];
253:   }
254:   for (int iter = 0; iter < chunksperblock; iter++) {
255:     cid = blockIdx.x * chunksperblock + iter; /* chunk id */
256:     if (cid < totalchunks) {
257:       start_slice = chunk_slice_map[cid]; /* starting slice at each iteration */
258:       gid         = cid * BLOCKY * WARP_SIZE + threadIdx.y * WARP_SIZE + threadIdx.x;
259:       if ((cid + 1) * BLOCKY * WARP_SIZE > sliidx[start_slice + 1]) { /* this iteration covers more than one slice */
260:         __shared__ PetscInt flag[BLOCKY * 2];
261:         bool                write;
262:         PetscInt            slice_id = start_slice, totalslices = PetscCeilInt(nrows, sliceheight), totalentries = sliidx[totalslices];
263:         /* find out the slice that this element belongs to */
264:         while (gid < totalentries && gid >= sliidx[slice_id + 1]) slice_id++;
265:         if (threadIdx.x % (WARP_SIZE / 2) == 0) flag[threadIdx.y * 2 + threadIdx.x / (WARP_SIZE / 2)] = slice_id;
266:         row = slice_id * sliceheight + threadIdx.x % sliceheight;
267:         if (row < nrows && gid < totalentries) t = aval[gid] * x[acolidx[gid]];
268:         __syncthreads();
269:         write = segment_scan<BLOCKY>(flag, shared, &t);
270:         if (row < nrows && gid < totalentries && write) atomAdd(z[row], t);
271:         t = 0.0;
272:       } else { /* this iteration covers only one slice */
273:         row = start_slice * sliceheight + threadIdx.x % sliceheight;
274:         if (row < nrows) t += aval[gid] * x[acolidx[gid]];
275:         if (iter == chunksperblock - 1 || (cid + 2) * BLOCKY * WARP_SIZE > sliidx[start_slice + 1]) { /* last iteration or next iteration covers more than one slice */
276:           int tid = threadIdx.x + threadIdx.y * WARP_SIZE, tidx = tid % BLOCKY, tidy = tid / BLOCKY;
277:   /* reduction and write to output vector */
278:   #pragma unroll
279:           for (int offset = WARP_SIZE / 2; offset >= sliceheight; offset /= 2) { t += __shfl_down(t, offset); }
280:           /* transpose layout to reduce each row using warp shfl */
281:           if (threadIdx.x < sliceheight) shared[threadIdx.x * BLOCKY + threadIdx.y] = t; /* shared[threadIdx.x][threadIdx.y] = t */
282:           __syncthreads();
283:           if (tidy < sliceheight) t = shared[tidy * BLOCKY + tidx]; /* shared[tidy][tidx] */
284:   #pragma unroll
285:           for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down(t, offset, BLOCKY); }
286:           if (tidx == 0 && tidy < sliceheight) { shared[tidy] = t; /* shared[0][tidy] = t */ }
287:           __syncthreads();
288:           if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) atomAdd(z[row], shared[threadIdx.x]); /* shared[0][threadIdx.x] */
289:           t = 0.0;
290:         }
291:       }
292:     }
293:   }
294: }

296: /* use 1 warp per slice, suitable for small slice width */
297: static __global__ void matmult_seqsell_tiled_kernel7(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
298: {
299:   PetscInt i, row, slice_id;
300:   slice_id = blockIdx.x * blockDim.y + threadIdx.y;
301:   row      = slice_id * sliceheight + threadIdx.x % sliceheight;
302:   PetscScalar t = 0.0;
303:   if (row < nrows) {
304:     for (i = sliidx[slice_id] + threadIdx.x; i < sliidx[slice_id + 1]; i += WARP_SIZE) t += aval[i] * x[acolidx[i]];
305:   }
306:   #pragma unroll
307:   for (int offset = WARP_SIZE / 2; offset >= sliceheight; offset /= 2) { t += __shfl_down(t, offset); }
308:   if (row < nrows && threadIdx.x < sliceheight) { y[row] = t; }
309: }

311: /* use 1 warp per slice, suitable for small slice width */
312: static __global__ void matmultadd_seqsell_tiled_kernel7(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
313: {
314:   PetscInt i, row, slice_id;
315:   slice_id = blockIdx.x * blockDim.y + threadIdx.y;
316:   row      = slice_id * sliceheight + threadIdx.x % sliceheight;
317:   PetscScalar t = 0.0;
318:   if (row < nrows) {
319:     for (i = sliidx[slice_id] + threadIdx.x; i < sliidx[slice_id + 1]; i += WARP_SIZE) t += aval[i] * x[acolidx[i]];
320:   }
321:   #pragma unroll
322:   for (int offset = WARP_SIZE / 2; offset >= sliceheight; offset /= 2) { t += __shfl_down(t, offset); }
323:   if (row < nrows && threadIdx.x < sliceheight) { z[row] = y[row] + t; }
324: }
325: #endif

327: /***********  Kernels 2-6 require a slice height smaller than 512, 256, 128, 64, 32, respectively. They are kept only for performance comparison  **********/
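      /* Each of these kernels assigns one row to each threadIdx.x and blockDim.y threads per row (kernel N is launched with blockDim.y = 2^(N-1)); the per-thread partial sums are combined with a shared-memory tree reduction and the thread with threadIdx.y == 0 writes the row result. */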

329: static __global__ void matmult_seqsell_tiled_kernel6(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
330: {
331:   __shared__ MatScalar shared[32 * 16];
332:   PetscInt             i, row, slice_id, row_in_slice;
333:   /* multiple threads per row. */
334:   row = blockIdx.x * blockDim.x + threadIdx.x;
335:   if (row < nrows) {
336:     slice_id     = row / sliceheight;
337:     row_in_slice = row % sliceheight;

339:     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
340:     for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
341:     __syncthreads();
342:     if (threadIdx.y < 16) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x]; }
343:     __syncthreads();
344:     if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
345:     __syncthreads();
346:     if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
347:     __syncthreads();
348:     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
349:     __syncthreads();
350:     if (threadIdx.y < 1) {
351:       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
352:       y[row] = shared[threadIdx.x];
353:     }
354:   }
355: }

357: static __global__ void matmult_seqsell_tiled_kernel5(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
358: {
359:   __shared__ MatScalar shared[32 * 16];
360:   PetscInt             i, row, slice_id, row_in_slice;
361:   /* multiple threads per row. */
362:   row = blockIdx.x * blockDim.x + threadIdx.x;
363:   if (row < nrows) {
364:     slice_id     = row / sliceheight;
365:     row_in_slice = row % sliceheight;

367:     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
368:     for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
369:     __syncthreads();
370:     if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
371:     __syncthreads();
372:     if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
373:     __syncthreads();
374:     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
375:     __syncthreads();
376:     if (threadIdx.y < 1) {
377:       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
378:       y[row] = shared[threadIdx.x];
379:     }
380:   }
381: }

383: static __global__ void matmult_seqsell_tiled_kernel4(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
384: {
385:   __shared__ MatScalar shared[32 * 16];
386:   PetscInt             i, row, slice_id, row_in_slice;
387:   /* multiple threads per row. */
388:   row = blockIdx.x * blockDim.x + threadIdx.x;
389:   if (row < nrows) {
390:     slice_id     = row / sliceheight;
391:     row_in_slice = row % sliceheight;

393:     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
394:     for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
395:     __syncthreads();
396:     if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
397:     __syncthreads();
398:     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
399:     __syncthreads();
400:     if (threadIdx.y < 1) {
401:       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
402:       y[row] = shared[threadIdx.x];
403:     }
404:   }
405: }

407: static __global__ void matmult_seqsell_tiled_kernel3(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
408: {
409:   __shared__ MatScalar shared[32 * 16];
410:   PetscInt             i, row, slice_id, row_in_slice;
411:   /* multiple threads per row. */
412:   row = blockIdx.x * blockDim.x + threadIdx.x;
413:   if (row < nrows) {
414:     slice_id     = row / sliceheight;
415:     row_in_slice = row % sliceheight;

417:     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
418:     for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
419:     __syncthreads();
420:     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
421:     __syncthreads();
422:     if (threadIdx.y < 1) {
423:       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
424:       y[row] = shared[threadIdx.x];
425:     }
426:   }
427: }

429: static __global__ void matmult_seqsell_tiled_kernel2(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
430: {
431:   __shared__ MatScalar shared[32 * 16];
432:   PetscInt             i, row, slice_id, row_in_slice;
433:   /* multiple threads per row. */
434:   row = blockIdx.x * blockDim.x + threadIdx.x;
435:   if (row < nrows) {
436:     slice_id     = row / sliceheight;
437:     row_in_slice = row % sliceheight;

439:     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
440:     for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
441:     __syncthreads();
442:     if (threadIdx.y < 1) {
443:       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
444:       y[row] = shared[threadIdx.x];
445:     }
446:   }
447: }

449: static __global__ void matmultadd_seqsell_tiled_kernel6(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
450: {
451:   __shared__ MatScalar shared[32 * 16];
452:   PetscInt             i, row, slice_id, row_in_slice;
453:   /* multiple threads per row. */
454:   row = blockIdx.x * blockDim.x + threadIdx.x;
455:   if (row < nrows) {
456:     slice_id     = row / sliceheight;
457:     row_in_slice = row % sliceheight;

459:     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
460:     for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
461:     __syncthreads();
462:     if (threadIdx.y < 16) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x]; }
463:     __syncthreads();
464:     if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
465:     __syncthreads();
466:     if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
467:     __syncthreads();
468:     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
469:     __syncthreads();
470:     if (threadIdx.y < 1) {
471:       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
472:       z[row] = y[row] + shared[threadIdx.x];
473:     }
474:   }
475: }

477: static __global__ void matmultadd_seqsell_tiled_kernel5(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
478: {
479:   __shared__ MatScalar shared[32 * 16];
480:   PetscInt             i, row, slice_id, row_in_slice;
481:   /* multiple threads per row. */
482:   row = blockIdx.x * blockDim.x + threadIdx.x;
483:   if (row < nrows) {
484:     slice_id     = row / sliceheight;
485:     row_in_slice = row % sliceheight;

487:     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
488:     for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
489:     __syncthreads();
490:     if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
491:     __syncthreads();
492:     if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
493:     __syncthreads();
494:     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
495:     __syncthreads();
496:     if (threadIdx.y < 1) {
497:       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
498:       z[row] = y[row] + shared[threadIdx.x];
499:     }
500:   }
501: }

503: static __global__ void matmultadd_seqsell_tiled_kernel4(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
504: {
505:   __shared__ MatScalar shared[32 * 16];
506:   PetscInt             i, row, slice_id, row_in_slice;
507:   /* multiple threads per row. */
508:   row = blockIdx.x * blockDim.x + threadIdx.x;
509:   if (row < nrows) {
510:     slice_id     = row / sliceheight;
511:     row_in_slice = row % sliceheight;

513:     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
514:     for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
515:     __syncthreads();
516:     if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
517:     __syncthreads();
518:     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
519:     __syncthreads();
520:     if (threadIdx.y < 1) {
521:       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
522:       z[row] = y[row] + shared[threadIdx.x];
523:     }
524:   }
525: }

527: static __global__ void matmultadd_seqsell_tiled_kernel3(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
528: {
529:   __shared__ MatScalar shared[32 * 16];
530:   PetscInt             i, row, slice_id, row_in_slice;
531:   /* multiple threads per row. */
532:   row = blockIdx.x * blockDim.x + threadIdx.x;
533:   if (row < nrows) {
534:     slice_id     = row / sliceheight;
535:     row_in_slice = row % sliceheight;

537:     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
538:     for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
539:     __syncthreads();
540:     if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
541:     __syncthreads();
542:     if (threadIdx.y < 1) {
543:       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
544:       z[row] = y[row] + shared[threadIdx.x];
545:     }
546:   }
547: }

549: static __global__ void matmultadd_seqsell_tiled_kernel2(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
550: {
551:   __shared__ MatScalar shared[32 * 16];
552:   PetscInt             i, row, slice_id, row_in_slice;
553:   /* multiple threads per row. */
554:   row = blockIdx.x * blockDim.x + threadIdx.x;
555:   if (row < nrows) {
556:     slice_id     = row / sliceheight;
557:     row_in_slice = row % sliceheight;

559:     shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
560:     for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
561:     __syncthreads();
562:     if (threadIdx.y < 1) {
563:       shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
564:       z[row] = y[row] + shared[threadIdx.x];
565:     }
566:   }
567: }

569: static PetscErrorCode MatMult_SeqSELLHIP(Mat A, Vec xx, Vec yy)
570: {
571:   Mat_SeqSELL       *a         = (Mat_SeqSELL *)A->data;
572:   Mat_SeqSELLHIP    *hipstruct = (Mat_SeqSELLHIP *)A->spptr;
573:   PetscScalar       *y;
574:   const PetscScalar *x;
575:   PetscInt           nrows = A->rmap->n, sliceheight = a->sliceheight;
576:   MatScalar         *aval;
577:   PetscInt          *acolidx;
578:   PetscInt          *sliidx;
579:   PetscInt           nblocks, blocksize = 512; /* blocksize is fixed to be 512 */
580:   dim3               block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32);
581: #if !defined(PETSC_USE_COMPLEX)
582:   PetscInt  chunksperblock, nchunks, *chunk_slice_map;
583:   PetscReal maxoveravg;
584: #endif

586:   PetscFunctionBegin;
587:   PetscCheck(WARP_SIZE % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires the slice height to be a divisor of WARP_SIZE, but the input matrix has a slice height of %" PetscInt_FMT, sliceheight);
588:   PetscCheck(!(hipstruct->kernelchoice >= 2 && hipstruct->kernelchoice <= 6 && sliceheight > 32), PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Kernel choices {2-6} require the slice height of the matrix to be at most 32, but the current slice height is %" PetscInt_FMT, sliceheight);
589:   PetscCall(MatSeqSELLHIPCopyToGPU(A));
590:   /* the device arrays in hipstruct may not be allocated until MatSeqSELLHIPCopyToGPU() has been called */
591:   aval    = hipstruct->val;
592:   acolidx = hipstruct->colidx;
593:   sliidx  = hipstruct->sliidx;

595:   PetscCall(VecHIPGetArrayRead(xx, &x));
596:   PetscCall(VecHIPGetArrayWrite(yy, &y));
597:   PetscCall(PetscLogGpuTimeBegin());
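       /* kernelchoice 0 (the default) picks a kernel heuristically from the slice-width statistics gathered at assembly; a nonzero value forces the corresponding kernel. */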

599:   switch (hipstruct->kernelchoice) {
600: #if !defined(PETSC_USE_COMPLEX)
601:   case 9: /* 1 slice per block */
602:     nblocks = 1 + (nrows - 1) / sliceheight;
603:     if (hipstruct->blocky == 2) {
604:       matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
605:     } else if (hipstruct->blocky == 4) {
606:       matmult_seqsell_tiled_kernel9<4><<<nblocks, dim3(WARP_SIZE, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
607:     } else if (hipstruct->blocky == 8) {
608:       matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
609:     } else if (hipstruct->blocky == 16) {
610:       matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
611:     } else {
612:       matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
613:     }
614:     break;
615:   case 7: /* each block handles blocky slices */
616:     nblocks = 1 + (nrows - 1) / (hipstruct->blocky * sliceheight);
617:     if (hipstruct->blocky == 2) {
618:       matmult_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
619:     } else if (hipstruct->blocky == 4) {
620:       matmult_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
621:     } else if (hipstruct->blocky == 8) {
622:       matmult_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
623:     } else if (hipstruct->blocky == 16) {
624:       matmult_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
625:     } else {
626:       nblocks = 1 + (nrows - 1) / (2 * sliceheight);
627:       matmult_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
628:     }
629:     break;
630: #endif
631:   case 6:
632:     nblocks = 1 + (nrows - 1) / (blocksize / 32); /* 1 slice per block if sliceheight=16 */
633:     matmult_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
634:     break;
635:   case 5:
636:     nblocks = 1 + (nrows - 1) / (blocksize / 16); /* 2 slices per block if sliceheight=16 */
637:     matmult_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
638:     break;
639:   case 4:
640:     nblocks = 1 + (nrows - 1) / (blocksize / 8); /* 4 slices per block if sliceheight=16 */
641:     matmult_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
642:     break;
643:   case 3:
644:     nblocks = 1 + (nrows - 1) / (blocksize / 4); /* 8 slices per block if sliceheight=16 */
645:     matmult_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
646:     break;
647:   case 2: /* 16 slices per block if sliceheight=16 */
648:     nblocks = 1 + (nrows - 1) / (blocksize / 2);
649:     matmult_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
650:     break;
651:   case 1: /* 32 slices per block if sliceheight=16 */
652:     nblocks = 1 + (nrows - 1) / blocksize;
653:     matmult_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
654:     break;
655: #if !defined(PETSC_USE_COMPLEX)
656:   case 0:
657:     maxoveravg = a->maxslicewidth / a->avgslicewidth;
658:     if (maxoveravg > 12.0 && maxoveravg / nrows > 0.001) { /* important threshold */
659:       /* each block handles approximately one slice */
660:       PetscInt blocky = a->chunksize / 32;
661:       nchunks         = hipstruct->totalchunks;
662:       chunksperblock  = hipstruct->chunksperblock ? hipstruct->chunksperblock : 1 + (hipstruct->totalentries / hipstruct->totalslices - 1) / a->chunksize;
663:       nblocks         = 1 + (nchunks - 1) / chunksperblock;
664:       chunk_slice_map = hipstruct->chunk_slice_map;
665:       if (blocky == 2) {
666:         matmult_seqsell_tiled_kernel8<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
667:       } else if (blocky == 4) {
668:         matmult_seqsell_tiled_kernel8<4><<<nblocks, dim3(WARP_SIZE, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
669:       } else if (blocky == 8) {
670:         matmult_seqsell_tiled_kernel8<8><<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
671:       } else if (blocky == 16) {
672:         matmult_seqsell_tiled_kernel8<16><<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
673:       } else {
674:         matmult_seqsell_tiled_kernel8<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
675:       }
676:     } else {
677:       PetscInt avgslicesize = sliceheight * a->avgslicewidth;
678:       if (avgslicesize <= 432) {
679:         if (sliceheight * a->maxslicewidth < 2048 && nrows > 100000) {
680:           nblocks = 1 + (nrows - 1) / (2 * sliceheight); /* two slices per block */
681:           matmult_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
682:         } else {
683:           nblocks = 1 + (nrows - 1) / sliceheight;
684:           matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
685:         }
686:       } else if (avgslicesize <= 2400) {
687:         nblocks = 1 + (nrows - 1) / sliceheight;
688:         matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
689:       } else {
690:         nblocks = 1 + (nrows - 1) / sliceheight;
691:         matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
692:       }
693:     }
694:     break;
695: #endif
696:   default:
697:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported kernel choice %" PetscInt_FMT " for MatMult_SeqSELLHIP.", hipstruct->kernelchoice);
698:   }
699:   PetscCall(PetscLogGpuTimeEnd());
700:   PetscCall(VecHIPRestoreArrayRead(xx, &x));
701:   PetscCall(VecHIPRestoreArrayWrite(yy, &y));
702:   PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
703:   PetscFunctionReturn(PETSC_SUCCESS);
704: }

706: static PetscErrorCode MatMultAdd_SeqSELLHIP(Mat A, Vec xx, Vec yy, Vec zz)
707: {
708:   Mat_SeqSELL       *a         = (Mat_SeqSELL *)A->data;
709:   Mat_SeqSELLHIP    *hipstruct = (Mat_SeqSELLHIP *)A->spptr;
710:   PetscScalar       *z;
711:   const PetscScalar *y, *x;
712:   PetscInt           nrows = A->rmap->n, sliceheight = a->sliceheight;
713:   MatScalar         *aval    = hipstruct->val;
714:   PetscInt          *acolidx = hipstruct->colidx;
715:   PetscInt          *sliidx  = hipstruct->sliidx;
716: #if !defined(PETSC_USE_COMPLEX)
717:   PetscReal maxoveravg;
718:   PetscInt  chunksperblock, nchunks, *chunk_slice_map;
719:   PetscInt  blocky = hipstruct->blocky;
720: #endif

722:   PetscFunctionBegin;
723:   PetscCheck(WARP_SIZE % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires the slice height to be a divisor of WARP_SIZE, but the input matrix has a slice height of %" PetscInt_FMT, sliceheight);
724:   PetscCheck(!(hipstruct->kernelchoice >= 2 && hipstruct->kernelchoice <= 6 && sliceheight > 32), PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Kernel choices {2-6} require the slice height of the matrix to be at most 32, but the current slice height is %" PetscInt_FMT, sliceheight);
725:   PetscCall(MatSeqSELLHIPCopyToGPU(A));
726:   if (a->nz) {
727:     PetscInt nblocks, blocksize = 512;
728:     dim3     block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32);
729:     PetscCall(VecHIPGetArrayRead(xx, &x));
730:     PetscCall(VecHIPGetArrayRead(yy, &y));
731:     PetscCall(VecHIPGetArrayWrite(zz, &z));
732:     PetscCall(PetscLogGpuTimeBegin());

734:     switch (hipstruct->kernelchoice) {
735: #if !defined(PETSC_USE_COMPLEX)
736:     case 9:
737:       nblocks = 1 + (nrows - 1) / sliceheight;
738:       if (blocky == 2) {
739:         matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
740:       } else if (blocky == 4) {
741:         matmultadd_seqsell_tiled_kernel9<4><<<nblocks, dim3(WARP_SIZE, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
742:       } else if (blocky == 8) {
743:         matmultadd_seqsell_tiled_kernel9<8><<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
744:       } else if (blocky == 16) {
745:         matmultadd_seqsell_tiled_kernel9<16><<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
746:       } else {
747:         matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
748:       }
749:       break;
750:     case 8:
751:       /* each block handles approximately one slice */
752:       nchunks         = hipstruct->totalchunks;
753:       blocky          = a->chunksize / 32;
754:       chunksperblock  = hipstruct->chunksperblock ? hipstruct->chunksperblock : 1 + (hipstruct->totalentries / hipstruct->totalslices - 1) / a->chunksize;
755:       nblocks         = 1 + (nchunks - 1) / chunksperblock;
756:       chunk_slice_map = hipstruct->chunk_slice_map;
757:       if (blocky == 2) {
758:         matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
759:       } else if (blocky == 4) {
760:         matmultadd_seqsell_tiled_kernel8<4><<<nblocks, dim3(WARP_SIZE, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
761:       } else if (blocky == 8) {
762:         matmultadd_seqsell_tiled_kernel8<8><<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
763:       } else if (blocky == 16) {
764:         matmultadd_seqsell_tiled_kernel8<16><<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
765:       } else {
766:         matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
767:       }
768:       break;
769:     case 7:
770:       nblocks = 1 + (nrows - 1) / (blocky * sliceheight);
771:       if (blocky == 2) {
772:         matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
773:       } else if (blocky == 4) {
774:         matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
775:       } else if (blocky == 8) {
776:         matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
777:       } else if (blocky == 16) {
778:         matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
779:       } else {
780:         nblocks = 1 + (nrows - 1) / (2 * sliceheight);
781:         matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
782:       }
783:       break;
784: #endif
785:     case 6:
786:       nblocks = 1 + (nrows - 1) / (blocksize / 32);
787:       matmultadd_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
788:       break;
789:     case 5:
790:       nblocks = 1 + (nrows - 1) / (blocksize / 16);
791:       matmultadd_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
792:       break;
793:     case 4:
794:       nblocks = 1 + (nrows - 1) / (blocksize / 8);
795:       matmultadd_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
796:       break;
797:     case 3:
798:       nblocks = 1 + (nrows - 1) / (blocksize / 4);
799:       matmultadd_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
800:       break;
801:     case 2:
802:       nblocks = 1 + (nrows - 1) / (blocksize / 2);
803:       matmultadd_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
804:       break;
805:     case 1:
806:       nblocks = 1 + (nrows - 1) / blocksize;
807:       matmultadd_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
808:       break;
809: #if !defined(PETSC_USE_COMPLEX)
810:     case 0:
811:       maxoveravg = a->maxslicewidth / a->avgslicewidth;
812:       if (maxoveravg > 12.0 && maxoveravg / nrows > 0.001) { /* important threshold */
813:         /* each block handles approximately one slice */
814:         nchunks         = hipstruct->totalchunks;
815:         blocky          = a->chunksize / 32;
816:         chunksperblock  = hipstruct->chunksperblock ? hipstruct->chunksperblock : 1 + (hipstruct->totalentries / hipstruct->totalslices - 1) / a->chunksize;
817:         nblocks         = 1 + (nchunks - 1) / chunksperblock;
818:         chunk_slice_map = hipstruct->chunk_slice_map;
819:         if (blocky == 2) {
820:           matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
821:         } else if (blocky == 4) {
822:           matmultadd_seqsell_tiled_kernel8<4><<<nblocks, dim3(WARP_SIZE, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
823:         } else if (blocky == 8) {
824:           matmultadd_seqsell_tiled_kernel8<8><<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
825:         } else if (blocky == 16) {
826:           matmultadd_seqsell_tiled_kernel8<16><<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
827:         } else {
828:           matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
829:         }
830:       } else {
831:         PetscInt avgslicesize = sliceheight * a->avgslicewidth;
832:         if (avgslicesize <= 432) {
833:           if (sliceheight * a->maxslicewidth < 2048 && nrows > 100000) {
834:             nblocks = 1 + (nrows - 1) / (2 * sliceheight); /* two slices per block */
835:             matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
836:           } else {
837:             nblocks = 1 + (nrows - 1) / sliceheight;
838:             matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
839:           }
840:         } else if (avgslicesize <= 2400) {
841:           nblocks = 1 + (nrows - 1) / sliceheight;
842:           matmultadd_seqsell_tiled_kernel9<8><<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
843:         } else {
844:           nblocks = 1 + (nrows - 1) / sliceheight;
845:           matmultadd_seqsell_tiled_kernel9<16><<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
846:         }
847:       }
848:       break;
849: #endif
850:     default:
851:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported kernel choice %" PetscInt_FMT " for MatMultAdd_SeqSELLHIP.", hipstruct->kernelchoice);
852:     }
853:     PetscCall(PetscLogGpuTimeEnd());
854:     PetscCall(VecHIPRestoreArrayRead(xx, &x));
855:     PetscCall(VecHIPRestoreArrayRead(yy, &y));
856:     PetscCall(VecHIPRestoreArrayWrite(zz, &z));
857:     PetscCall(PetscLogGpuFlops(2.0 * a->nz));
858:   } else {
859:     PetscCall(VecCopy(yy, zz));
860:   }
861:   PetscFunctionReturn(PETSC_SUCCESS);
862: }

864: static PetscErrorCode MatSetFromOptions_SeqSELLHIP(Mat A, PetscOptionItems *PetscOptionsObject)
865: {
866:   Mat_SeqSELLHIP *hipstruct = (Mat_SeqSELLHIP *)A->spptr;
867:   PetscInt        kernel, blocky;
868:   PetscBool       flg;

870:   PetscFunctionBegin;
871:   PetscOptionsHeadBegin(PetscOptionsObject, "SeqSELLHIP options");
872:   PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_hip_blocky", &blocky, &flg));
873:   if (flg) {
874:     PetscCheck(blocky == 2 || blocky == 4 || blocky == 8 || blocky == 16 || blocky == 32, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Unsupported blocky: %" PetscInt_FMT " it should be in {2,4,8,16,32}", blocky);
875:     hipstruct->blocky = blocky;
876:   }
877:   PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_hip_kernel", &kernel, &flg));
878:   if (flg) {
879:     PetscCheck(kernel >= 0 && kernel <= 9, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Wrong kernel choice: %" PetscInt_FMT " it should be in [0,9]", kernel);
880:     hipstruct->kernelchoice = kernel;
881:     if (kernel == 8) { PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_hip_chunksperblock", &hipstruct->chunksperblock, &flg)); }
882:   }
883:   PetscOptionsHeadEnd();
884:   PetscFunctionReturn(PETSC_SUCCESS);
885: }

887: PETSC_INTERN PetscErrorCode MatAssemblyEnd_SpMV_Preprocessing_Private(Mat A)
888: {
889:   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;

891:   PetscFunctionBegin;
892:   PetscCall(MatSeqSELLGetAvgSliceWidth(A, &a->avgslicewidth));
893:   PetscCall(MatSeqSELLGetMaxSliceWidth(A, &a->maxslicewidth));
894:   PetscCall(MatSeqSELLGetFillRatio(A, &a->fillratio));
895:   PetscFunctionReturn(PETSC_SUCCESS);
896: }

898: static PetscErrorCode MatAssemblyEnd_SeqSELLHIP(Mat A, MatAssemblyType mode)
899: {
900:   PetscFunctionBegin;
901:   PetscCall(MatAssemblyEnd_SeqSELL(A, mode));
902:   PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(A));
903:   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
904:   if (A->factortype == MAT_FACTOR_NONE) { PetscCall(MatSeqSELLHIPCopyToGPU(A)); }
905:   A->ops->mult    = MatMult_SeqSELLHIP;
906:   A->ops->multadd = MatMultAdd_SeqSELLHIP;
907:   PetscFunctionReturn(PETSC_SUCCESS);
908: }

910: static PetscErrorCode MatZeroEntries_SeqSELLHIP(Mat A)
911: {
912:   PetscBool    both = PETSC_FALSE;
913:   Mat_SeqSELL *a    = (Mat_SeqSELL *)A->data;

915:   PetscFunctionBegin;
916:   if (A->factortype == MAT_FACTOR_NONE) {
917:     Mat_SeqSELLHIP *hipstruct = (Mat_SeqSELLHIP *)A->spptr;
918:     if (hipstruct->val) {
919:       both = PETSC_TRUE;
920:       PetscCallHIP(hipMemset(hipstruct->val, 0, a->sliidx[a->totalslices] * sizeof(*hipstruct->val)));
921:     }
922:   }
923:   PetscCall(PetscArrayzero(a->val, a->sliidx[a->totalslices]));
924:   PetscCall(MatSeqSELLInvalidateDiagonal(A));
925:   if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
926:   else A->offloadmask = PETSC_OFFLOAD_CPU;
927:   PetscFunctionReturn(PETSC_SUCCESS);
928: }

930: static PetscErrorCode MatDestroy_SeqSELLHIP(Mat A)
931: {
932:   PetscFunctionBegin;
933:   if (A->factortype == MAT_FACTOR_NONE && A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) PetscCall(MatSeqSELLHIP_Destroy((Mat_SeqSELLHIP **)&A->spptr));
934:   PetscCall(MatDestroy_SeqSELL(A));
935:   PetscFunctionReturn(PETSC_SUCCESS);
936: }

938: PETSC_INTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLHIP(Mat);
939: static PetscErrorCode       MatDuplicate_SeqSELLHIP(Mat A, MatDuplicateOption cpvalues, Mat *B)
940: {
941:   PetscFunctionBegin;
942:   PetscCall(MatDuplicate_SeqSELL(A, cpvalues, B));
943:   PetscCall(MatConvert_SeqSELL_SeqSELLHIP(*B));
944:   PetscFunctionReturn(PETSC_SUCCESS);
945: }

947: PETSC_INTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLHIP(Mat B)
948: {
949:   Mat_SeqSELLHIP *hipstruct;

951:   PetscFunctionBegin;
952:   PetscCall(PetscFree(B->defaultvectype));
953:   PetscCall(PetscStrallocpy(VECHIP, &B->defaultvectype));

955:   if (!B->spptr) {
956:     if (B->factortype == MAT_FACTOR_NONE) {
957:       PetscCall(PetscNew(&hipstruct));
958:       B->spptr = hipstruct;
959:     }
960:   }

962:   B->ops->assemblyend    = MatAssemblyEnd_SeqSELLHIP;
963:   B->ops->destroy        = MatDestroy_SeqSELLHIP;
964:   B->ops->setfromoptions = MatSetFromOptions_SeqSELLHIP;
965:   B->ops->mult           = MatMult_SeqSELLHIP;
966:   B->ops->multadd        = MatMultAdd_SeqSELLHIP;
967:   B->ops->duplicate      = MatDuplicate_SeqSELLHIP;
968:   B->ops->zeroentries    = MatZeroEntries_SeqSELLHIP;

970:   /* No need to assemble SeqSELL, but need to do the preprocessing for SpMV */
971:   PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(B));

973:   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELLHIP));
974:   B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
975:   PetscFunctionReturn(PETSC_SUCCESS);
976: }

978: /*MC
979:   MATSEQSELLHIP - MATSELLHIP = "(seq)sellhip" - A matrix type to be used for sparse matrices on AMD GPUs.

981:   Options Database Keys:
982: +  -mat_type seqsellhip - sets the matrix type to "seqsellhip" during a call to `MatSetFromOptions()`
983: .  -mat_sell_spmv_hip_kernel - selects an SpMV kernel for `MATSELLHIP`
984: -  -mat_sell_spmv_hip_blocky - sets the y dimension of the block size of the SpMV kernels. These kernels use a 2D block with the x dimension equal to the warp size (normally 64 for AMD GPUs)
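
   Example Usage:
   A minimal sketch (not part of this file; the size n, the matrix values, and the vectors x and y are placeholder assumptions) showing how the type is typically selected and used, either programmatically or with -mat_type seqsellhip:
.vb
   Mat A;
   Vec x, y;
   PetscCall(MatCreate(PETSC_COMM_SELF, &A));
   PetscCall(MatSetSizes(A, n, n, n, n));
   PetscCall(MatSetType(A, MATSEQSELLHIP)); /* or MatSetFromOptions(A) with -mat_type seqsellhip */
   PetscCall(MatSetUp(A));
   /* ... insert entries with MatSetValues(A, ...) ... */
   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
   PetscCall(MatMult(A, x, y)); /* runs the HIP SpMV kernels defined in this file */
.ve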

986:   Level: beginner

988: .seealso: [](ch_matrices), `Mat`, `MATSELLHIP`
989: M*/

991: PETSC_EXTERN PetscErrorCode MatCreate_SeqSELLHIP(Mat B)
992: {
993:   PetscFunctionBegin;
994:   PetscCall(MatCreate_SeqSELL(B));
995:   PetscCall(MatConvert_SeqSELL_SeqSELLHIP(B));
996:   PetscCall(MatSetFromOptions(B));
997:   PetscFunctionReturn(PETSC_SUCCESS);
998: }