Actual source code: sellhip.hip.cpp
1: #include "hip/hip_runtime.h"
2: #include <hip/hip_runtime.h>
4: #include <petscdevice_hip.h>
5: #include <petsc/private/cupmatomics.hpp>
6: #include <../src/mat/impls/sell/seq/sell.h>
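
/* WARP_SIZE is the wavefront width: current AMD GPUs execute 64 threads per wavefront,
   and the shuffle-based kernels below rely on all WARP_SIZE threads of a wavefront
   participating in each __shfl_down(). */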
#define WARP_SIZE 64

typedef struct {
  PetscInt   maxallocmat;
  PetscInt   totalentries;
  PetscInt  *colidx; /* column index array, device pointer */
  MatScalar *val;    /* value array, device pointer */
  PetscInt   totalslices;
  PetscInt  *sliidx; /* slice index array, device pointer */
  PetscInt   nonzerostate;
  PetscInt   kernelchoice;
  PetscInt   blocky;
  PetscInt   chunksperblock;
  PetscInt   totalchunks;
  PetscInt  *chunk_slice_map; /* starting slice for each chunk, device pointer */
} Mat_SeqSELLHIP;
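
/* Illustration of the SELL layout (an example, not from the original source): with
   sliceheight = 2 and sliidx = {0, 6, 10}, slice 0 occupies entries [0,6) of val/colidx
   and slice 1 occupies [6,10). Within a slice the entries are interleaved across rows,
   so the j-th stored entry of row r lives at
   val[sliidx[r / sliceheight] + j * sliceheight + r % sliceheight]. */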

static PetscErrorCode MatSeqSELLHIP_Destroy(Mat_SeqSELLHIP **hipstruct)
{
  PetscFunctionBegin;
  if (*hipstruct) {
    if ((*hipstruct)->colidx) { PetscCallHIP(hipFree((*hipstruct)->colidx)); }
    if ((*hipstruct)->val) { PetscCallHIP(hipFree((*hipstruct)->val)); }
    if ((*hipstruct)->sliidx) { PetscCallHIP(hipFree((*hipstruct)->sliidx)); }
    if ((*hipstruct)->chunk_slice_map) { PetscCallHIP(hipFree((*hipstruct)->chunk_slice_map)); }
    PetscCall(PetscFree(*hipstruct));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
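
/* Mirror the host SELL arrays on the device. The copy is lazy: it happens only when the
   offload mask marks the device copy as missing or stale, and the device arrays are
   reallocated only when the nonzero pattern has changed since the last copy. */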
static PetscErrorCode MatSeqSELLHIPCopyToGPU(Mat A)
{
  Mat_SeqSELLHIP *hipstruct = (Mat_SeqSELLHIP *)A->spptr;
  Mat_SeqSELL    *a         = (Mat_SeqSELL *)A->data;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    PetscCall(PetscLogEventBegin(MAT_HIPCopyToGPU, A, 0, 0, 0));
    if (A->assembled && A->nonzerostate == hipstruct->nonzerostate) {
      /* copy values only */
      PetscCallHIP(hipMemcpy(hipstruct->val, a->val, a->sliidx[a->totalslices] * sizeof(MatScalar), hipMemcpyHostToDevice));
      PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar))));
    } else {
      if (hipstruct->colidx) PetscCallHIP(hipFree(hipstruct->colidx));
      if (hipstruct->val) PetscCallHIP(hipFree(hipstruct->val));
      if (hipstruct->sliidx) PetscCallHIP(hipFree(hipstruct->sliidx));
      if (hipstruct->chunk_slice_map) PetscCallHIP(hipFree(hipstruct->chunk_slice_map));
      hipstruct->maxallocmat  = a->maxallocmat;
      hipstruct->totalentries = a->sliidx[a->totalslices];
      hipstruct->totalslices  = a->totalslices;
      hipstruct->totalchunks  = a->totalchunks;
      PetscCallHIP(hipMalloc((void **)&hipstruct->colidx, a->maxallocmat * sizeof(*hipstruct->colidx)));
      PetscCallHIP(hipMalloc((void **)&hipstruct->val, a->maxallocmat * sizeof(*hipstruct->val)));
      /* copy values, nz or maxallocmat? */
      PetscCallHIP(hipMemcpy(hipstruct->colidx, a->colidx, a->sliidx[a->totalslices] * sizeof(*a->colidx), hipMemcpyHostToDevice));
      PetscCallHIP(hipMemcpy(hipstruct->val, a->val, a->sliidx[a->totalslices] * sizeof(*a->val), hipMemcpyHostToDevice));

      PetscCallHIP(hipMalloc((void **)&hipstruct->sliidx, (a->totalslices + 1) * sizeof(*hipstruct->sliidx)));
      PetscCallHIP(hipMemcpy(hipstruct->sliidx, a->sliidx, (a->totalslices + 1) * sizeof(*a->sliidx), hipMemcpyHostToDevice));
      PetscCallHIP(hipMalloc((void **)&hipstruct->chunk_slice_map, a->totalchunks * sizeof(*hipstruct->chunk_slice_map)));
      PetscCallHIP(hipMemcpy(hipstruct->chunk_slice_map, a->chunk_slice_map, a->totalchunks * sizeof(*a->chunk_slice_map), hipMemcpyHostToDevice));
      PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar) + sizeof(PetscInt)) + (a->totalslices + 1 + a->totalchunks) * sizeof(PetscInt)));
    }
    PetscCallHIP(WaitForHIP());
    PetscCall(PetscLogEventEnd(MAT_HIPCopyToGPU, A, 0, 0, 0));
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static __global__ void matmult_seqsell_basic_kernel(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  PetscInt  i, row, slice_id, row_in_slice;
  MatScalar sum;
  /* one thread per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / sliceheight;
    row_in_slice = row % sliceheight;
    sum          = 0.0;
    for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += sliceheight) sum += aval[i] * x[acolidx[i]];
    y[row] = sum;
  }
}

static __global__ void matmultadd_seqsell_basic_kernel(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  PetscInt  i, row, slice_id, row_in_slice;
  MatScalar sum;
  /* one thread per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / sliceheight;
    row_in_slice = row % sliceheight;
    sum          = 0.0;
    for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += sliceheight) sum += aval[i] * x[acolidx[i]];
    z[row] = y[row] + sum;
  }
}

#if !defined(PETSC_USE_COMPLEX)
/* use 1 block per slice, suitable for large slice width */
template <int BLOCKY>
__global__ void matmult_seqsell_tiled_kernel9(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[WARP_SIZE][BLOCKY];
  PetscInt             i, row, slice_id = blockIdx.x;
  int                  tid = threadIdx.x + threadIdx.y * WARP_SIZE;
  /* transposed index */
  int         tidx = tid % BLOCKY;
  int         tidy = tid / BLOCKY;
  PetscScalar t = 0.0;

  row = slice_id * sliceheight + threadIdx.x % sliceheight;
  if (row < nrows) {
    for (i = sliidx[slice_id] + threadIdx.x + WARP_SIZE * threadIdx.y; i < sliidx[slice_id + 1]; i += WARP_SIZE * BLOCKY) t += aval[i] * x[acolidx[i]];
  }
  #pragma unroll
  for (int offset = WARP_SIZE / 2; offset >= sliceheight; offset /= 2) { t += __shfl_down(t, offset); }
  /* transpose layout to reduce each row using warp shfl */
  if (threadIdx.x < sliceheight) shared[threadIdx.x][threadIdx.y] = t;
  __syncthreads();
  if (tidy < sliceheight) t = shared[tidy][tidx];
  #pragma unroll
  for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down(t, offset, BLOCKY); }
  if (tidx == 0 && tidy < sliceheight) { shared[0][tidy] = t; }
  __syncthreads();
  if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) { y[row] = shared[0][threadIdx.x]; }
}

/* use 1 block per slice, suitable for large slice width */
template <int BLOCKY>
__global__ void matmultadd_seqsell_tiled_kernel9(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[WARP_SIZE][BLOCKY];
  PetscInt             i, row, slice_id = blockIdx.x;
  int                  tid = threadIdx.x + threadIdx.y * WARP_SIZE;
  /* transposed index */
  int         tidx = tid % BLOCKY;
  int         tidy = tid / BLOCKY;
  PetscScalar t = 0.0;

  row = slice_id * sliceheight + threadIdx.x % sliceheight;
  if (row < nrows) {
    for (i = sliidx[slice_id] + threadIdx.x + WARP_SIZE * threadIdx.y; i < sliidx[slice_id + 1]; i += WARP_SIZE * BLOCKY) t += aval[i] * x[acolidx[i]];
  }
  #pragma unroll
  for (int offset = WARP_SIZE / 2; offset >= sliceheight; offset /= 2) { t += __shfl_down(t, offset); }
  /* transpose layout to reduce each row using warp shfl */
  if (threadIdx.x < sliceheight) shared[threadIdx.x][threadIdx.y] = t;
  __syncthreads();
  if (tidy < sliceheight) t = shared[tidy][tidx];
  #pragma unroll
  for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down(t, offset, BLOCKY); }
  if (tidx == 0 && tidy < sliceheight) { shared[0][tidy] = t; }
  __syncthreads();
  if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) { z[row] = y[row] + shared[0][threadIdx.x]; }
}
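
/* Segmented sum across the 2*BLOCKY half-warps of a block: flag[] holds the slice id that
   each half-warp is working on, and partial sums are combined only between half-warps
   that share a slice id. Returns true for the first (head) half-warp of each segment,
   i.e. the one that must write the segment's result. For example (an illustration, not
   from the original source), with flags {5, 5, 6} the first two partial sums are
   combined and the third is left alone. */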
template <int BLOCKY>
__device__ __forceinline__ static bool segment_scan(PetscInt flag[], MatScalar shared[], PetscScalar *val)
{
  bool head = true;
  #pragma unroll
  for (int i = 1; i < BLOCKY * 2; i <<= 1) {
    int halfwarpid = threadIdx.y * 2 + threadIdx.x / (WARP_SIZE / 2);
    shared[threadIdx.x + threadIdx.y * WARP_SIZE] = 0;
    if (halfwarpid >= i && flag[halfwarpid - i] == flag[halfwarpid]) {
      shared[threadIdx.x + threadIdx.y * WARP_SIZE] = *val;
      if (i == 1) head = false;
    }
    __syncthreads();
    if (halfwarpid < BLOCKY * 2 - i) *val += shared[threadIdx.x + threadIdx.y * WARP_SIZE + i * WARP_SIZE];
    __syncthreads();
  }
  return head;
}
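
/* chunk_slice_map[c] records the first slice whose entries fall in chunk c, where a chunk
   is BLOCKY * WARP_SIZE consecutive entries of val/colidx. Illustration (not from the
   original source): with a chunk size of 512 and sliidx = {0, 400, 900}, chunk 0 (entries
   [0,512)) starts in slice 0 and chunk 1 (entries [512,1024)) starts in slice 1. */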
/* load-balancing version. Chunksize is equal to the number of threads per block */
template <int BLOCKY>
__global__ void matmult_seqsell_tiled_kernel8(PetscInt nrows, PetscInt sliceheight, PetscInt chunksperblock, PetscInt totalchunks, const PetscInt *chunk_slice_map, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[BLOCKY * WARP_SIZE];
  PetscInt             gid, row, start_slice, cid;
  PetscScalar          t = 0.0;
  AtomicAdd<MatScalar> atomAdd;
  /* zero out y */
  for (int iter = 0; iter < 1 + (nrows - 1) / (gridDim.x * WARP_SIZE * BLOCKY); iter++) {
    gid = gridDim.x * WARP_SIZE * BLOCKY * iter + blockIdx.x * BLOCKY * WARP_SIZE + threadIdx.y * WARP_SIZE + threadIdx.x;
    if (gid < nrows) y[gid] = 0.0;
  }
  for (int iter = 0; iter < chunksperblock; iter++) {
    cid = blockIdx.x * chunksperblock + iter; /* chunk id */
    if (cid < totalchunks) {
      start_slice = chunk_slice_map[cid]; /* starting slice at each iteration */
      gid         = cid * BLOCKY * WARP_SIZE + threadIdx.y * WARP_SIZE + threadIdx.x;
      if ((cid + 1) * BLOCKY * WARP_SIZE > sliidx[start_slice + 1]) { /* this iteration covers more than one slice */
        __shared__ PetscInt flag[BLOCKY * 2];
        bool                write;
        PetscInt            slice_id = start_slice, totalslices = PetscCeilInt(nrows, sliceheight), totalentries = sliidx[totalslices];
        /* find out the slice that this element belongs to */
        while (gid < totalentries && gid >= sliidx[slice_id + 1]) slice_id++;
        if (threadIdx.x % (WARP_SIZE / 2) == 0) flag[threadIdx.y * 2 + threadIdx.x / (WARP_SIZE / 2)] = slice_id;
        row = slice_id * sliceheight + threadIdx.x % sliceheight;
        if (row < nrows && gid < totalentries) t = aval[gid] * x[acolidx[gid]];
        __syncthreads();
        write = segment_scan<BLOCKY>(flag, shared, &t);
        if (row < nrows && gid < totalentries && write) atomAdd(y[row], t);
        t = 0.0;
      } else { /* this iteration covers only one slice */
        row = start_slice * sliceheight + threadIdx.x % sliceheight;
        if (row < nrows) t += aval[gid] * x[acolidx[gid]];
        if (iter == chunksperblock - 1 || (cid + 2) * BLOCKY * WARP_SIZE > sliidx[start_slice + 1]) { /* last iteration or next iteration covers more than one slice */
          int tid = threadIdx.x + threadIdx.y * WARP_SIZE, tidx = tid % BLOCKY, tidy = tid / BLOCKY;
          /* reduction and write to output vector */
          #pragma unroll
          for (int offset = WARP_SIZE / 2; offset >= sliceheight; offset /= 2) { t += __shfl_down(t, offset); }
          /* transpose layout to reduce each row using warp shfl */
          if (threadIdx.x < sliceheight) shared[threadIdx.x * BLOCKY + threadIdx.y] = t; /* shared[threadIdx.x][threadIdx.y] = t */
          __syncthreads();
          if (tidy < sliceheight) t = shared[tidy * BLOCKY + tidx]; /* shared[tidy][tidx] */
          #pragma unroll
          for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down(t, offset, BLOCKY); }
          if (tidx == 0 && tidy < sliceheight) { shared[tidy] = t; /* shared[0][tidy] = t */ }
          __syncthreads();
          if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) atomAdd(y[row], shared[threadIdx.x]); /* shared[0][threadIdx.x] */
          t = 0.0;
        }
      }
    }
  }
}

/* load-balancing version. Chunksize is equal to the number of threads per block */
template <int BLOCKY>
__global__ void matmultadd_seqsell_tiled_kernel8(PetscInt nrows, PetscInt sliceheight, PetscInt chunksperblock, PetscInt totalchunks, const PetscInt *chunk_slice_map, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[BLOCKY * WARP_SIZE];
  PetscInt             gid, row, start_slice, cid;
  PetscScalar          t = 0.0;
  AtomicAdd<MatScalar> atomAdd;
  /* copy y to z */
  for (int iter = 0; iter < 1 + (nrows - 1) / (gridDim.x * WARP_SIZE * BLOCKY); iter++) {
    gid = gridDim.x * WARP_SIZE * BLOCKY * iter + blockIdx.x * BLOCKY * WARP_SIZE + threadIdx.y * WARP_SIZE + threadIdx.x;
    if (gid < nrows) z[gid] = y[gid];
  }
  for (int iter = 0; iter < chunksperblock; iter++) {
    cid = blockIdx.x * chunksperblock + iter; /* chunk id */
    if (cid < totalchunks) {
      start_slice = chunk_slice_map[cid]; /* starting slice at each iteration */
      gid         = cid * BLOCKY * WARP_SIZE + threadIdx.y * WARP_SIZE + threadIdx.x;
      if ((cid + 1) * BLOCKY * WARP_SIZE > sliidx[start_slice + 1]) { /* this iteration covers more than one slice */
        __shared__ PetscInt flag[BLOCKY * 2];
        bool                write;
        PetscInt            slice_id = start_slice, totalslices = PetscCeilInt(nrows, sliceheight), totalentries = sliidx[totalslices];
        /* find out the slice that this element belongs to */
        while (gid < totalentries && gid >= sliidx[slice_id + 1]) slice_id++;
        if (threadIdx.x % (WARP_SIZE / 2) == 0) flag[threadIdx.y * 2 + threadIdx.x / (WARP_SIZE / 2)] = slice_id;
        row = slice_id * sliceheight + threadIdx.x % sliceheight;
        if (row < nrows && gid < totalentries) t = aval[gid] * x[acolidx[gid]];
        __syncthreads();
        write = segment_scan<BLOCKY>(flag, shared, &t);
        if (row < nrows && gid < totalentries && write) atomAdd(z[row], t);
        t = 0.0;
      } else { /* this iteration covers only one slice */
        row = start_slice * sliceheight + threadIdx.x % sliceheight;
        if (row < nrows) t += aval[gid] * x[acolidx[gid]];
        if (iter == chunksperblock - 1 || (cid + 2) * BLOCKY * WARP_SIZE > sliidx[start_slice + 1]) { /* last iteration or next iteration covers more than one slice */
          int tid = threadIdx.x + threadIdx.y * WARP_SIZE, tidx = tid % BLOCKY, tidy = tid / BLOCKY;
          /* reduction and write to output vector */
          #pragma unroll
          for (int offset = WARP_SIZE / 2; offset >= sliceheight; offset /= 2) { t += __shfl_down(t, offset); }
          /* transpose layout to reduce each row using warp shfl */
          if (threadIdx.x < sliceheight) shared[threadIdx.x * BLOCKY + threadIdx.y] = t; /* shared[threadIdx.x][threadIdx.y] = t */
          __syncthreads();
          if (tidy < sliceheight) t = shared[tidy * BLOCKY + tidx]; /* shared[tidy][tidx] */
          #pragma unroll
          for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down(t, offset, BLOCKY); }
          if (tidx == 0 && tidy < sliceheight) { shared[tidy] = t; /* shared[0][tidy] = t */ }
          __syncthreads();
          if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) atomAdd(z[row], shared[threadIdx.x]); /* shared[0][threadIdx.x] */
          t = 0.0;
        }
      }
    }
  }
}

/* use 1 warp per slice, suitable for small slice width */
static __global__ void matmult_seqsell_tiled_kernel7(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  PetscInt i, row, slice_id;
  slice_id = blockIdx.x * blockDim.y + threadIdx.y;
  row      = slice_id * sliceheight + threadIdx.x % sliceheight;
  MatScalar t = 0.0;
  if (row < nrows) {
    for (i = sliidx[slice_id] + threadIdx.x; i < sliidx[slice_id + 1]; i += WARP_SIZE) t += aval[i] * x[acolidx[i]];
  }
  #pragma unroll
  for (int offset = WARP_SIZE / 2; offset >= sliceheight; offset /= 2) { t += __shfl_down(t, offset); }
  if (row < nrows && threadIdx.x < sliceheight) { y[row] = t; }
}

/* use 1 warp per slice, suitable for small slice width */
static __global__ void matmultadd_seqsell_tiled_kernel7(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  PetscInt i, row, slice_id;
  slice_id = blockIdx.x * blockDim.y + threadIdx.y;
  row      = slice_id * sliceheight + threadIdx.x % sliceheight;
  MatScalar t = 0.0;
  if (row < nrows) {
    for (i = sliidx[slice_id] + threadIdx.x; i < sliidx[slice_id + 1]; i += WARP_SIZE) t += aval[i] * x[acolidx[i]];
  }
  #pragma unroll
  for (int offset = WARP_SIZE / 2; offset >= sliceheight; offset /= 2) { t += __shfl_down(t, offset); }
  if (row < nrows && threadIdx.x < sliceheight) { z[row] = y[row] + t; }
}
#endif
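
/* The shuffle-based kernels above are compiled out for complex scalars because
   __shfl_down() operates on plain real types, not on PetscComplex. */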

/*********** Kernels 2-6 require a slice height smaller than 512, 256, 128, 64, 32, respectively. They are kept only for performance comparison **********/

static __global__ void matmult_seqsell_tiled_kernel6(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[32 * 16];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / sliceheight;
    row_in_slice = row % sliceheight;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 16) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      y[row] = shared[threadIdx.x];
    }
  }
}

static __global__ void matmult_seqsell_tiled_kernel5(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[32 * 16];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / sliceheight;
    row_in_slice = row % sliceheight;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      y[row] = shared[threadIdx.x];
    }
  }
}

static __global__ void matmult_seqsell_tiled_kernel4(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[32 * 16];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / sliceheight;
    row_in_slice = row % sliceheight;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      y[row] = shared[threadIdx.x];
    }
  }
}

static __global__ void matmult_seqsell_tiled_kernel3(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[32 * 16];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / sliceheight;
    row_in_slice = row % sliceheight;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      y[row] = shared[threadIdx.x];
    }
  }
}

static __global__ void matmult_seqsell_tiled_kernel2(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
{
  __shared__ MatScalar shared[32 * 16];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / sliceheight;
    row_in_slice = row % sliceheight;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      y[row] = shared[threadIdx.x];
    }
  }
}

static __global__ void matmultadd_seqsell_tiled_kernel6(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[32 * 16];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / sliceheight;
    row_in_slice = row % sliceheight;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 16) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      z[row] = y[row] + shared[threadIdx.x];
    }
  }
}

static __global__ void matmultadd_seqsell_tiled_kernel5(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[32 * 16];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / sliceheight;
    row_in_slice = row % sliceheight;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      z[row] = y[row] + shared[threadIdx.x];
    }
  }
}

static __global__ void matmultadd_seqsell_tiled_kernel4(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[32 * 16];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / sliceheight;
    row_in_slice = row % sliceheight;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      z[row] = y[row] + shared[threadIdx.x];
    }
  }
}

static __global__ void matmultadd_seqsell_tiled_kernel3(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[32 * 16];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / sliceheight;
    row_in_slice = row % sliceheight;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      z[row] = y[row] + shared[threadIdx.x];
    }
  }
}

static __global__ void matmultadd_seqsell_tiled_kernel2(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
{
  __shared__ MatScalar shared[32 * 16];
  PetscInt             i, row, slice_id, row_in_slice;
  /* multiple threads per row. */
  row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < nrows) {
    slice_id     = row / sliceheight;
    row_in_slice = row % sliceheight;

    shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
    for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
    __syncthreads();
    if (threadIdx.y < 1) {
      shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
      z[row] = y[row] + shared[threadIdx.x];
    }
  }
}
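
/* SpMV dispatch: kernelchoice selects one of the kernels above. kernelchoice 0 (the
   default, since the struct is allocated with PetscNew()) picks a kernel heuristically
   from the slice-width statistics gathered at assembly time; see the case 0 branch below. */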
static PetscErrorCode MatMult_SeqSELLHIP(Mat A, Vec xx, Vec yy)
{
  Mat_SeqSELL       *a         = (Mat_SeqSELL *)A->data;
  Mat_SeqSELLHIP    *hipstruct = (Mat_SeqSELLHIP *)A->spptr;
  PetscScalar       *y;
  const PetscScalar *x;
  PetscInt           nrows = A->rmap->n, sliceheight = a->sliceheight;
  MatScalar         *aval;
  PetscInt          *acolidx;
  PetscInt          *sliidx;
  PetscInt           nblocks, blocksize = 512; /* blocksize is fixed to be 512 */
  dim3               block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32);
#if !defined(PETSC_USE_COMPLEX)
  PetscInt  chunksperblock, nchunks, *chunk_slice_map;
  PetscReal maxoveravg;
#endif

  PetscFunctionBegin;
  PetscCheck(WARP_SIZE % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires the slice height to be a divisor of WARP_SIZE, but the input matrix has a slice height of %" PetscInt_FMT, sliceheight);
  PetscCheck(!(hipstruct->kernelchoice >= 2 && hipstruct->kernelchoice <= 6 && sliceheight > 32), PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Kernel choices {2-6} require the slice height of the matrix to be no greater than 32, but the current slice height is %" PetscInt_FMT, sliceheight);
  PetscCall(MatSeqSELLHIPCopyToGPU(A));
  /* hipstruct may not be available until MatSeqSELLHIPCopyToGPU() is called */
  aval    = hipstruct->val;
  acolidx = hipstruct->colidx;
  sliidx  = hipstruct->sliidx;

  PetscCall(VecHIPGetArrayRead(xx, &x));
  PetscCall(VecHIPGetArrayWrite(yy, &y));
  PetscCall(PetscLogGpuTimeBegin());

  switch (hipstruct->kernelchoice) {
#if !defined(PETSC_USE_COMPLEX)
  case 9: /* 1 slice per block */
    nblocks = 1 + (nrows - 1) / sliceheight;
    if (hipstruct->blocky == 2) {
      matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else if (hipstruct->blocky == 4) {
      matmult_seqsell_tiled_kernel9<4><<<nblocks, dim3(WARP_SIZE, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else if (hipstruct->blocky == 8) {
      matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else if (hipstruct->blocky == 16) {
      matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else {
      matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    }
    break;
  case 7: /* each block handles blocky slices */
    nblocks = 1 + (nrows - 1) / (hipstruct->blocky * sliceheight);
    if (hipstruct->blocky == 2) {
      matmult_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else if (hipstruct->blocky == 4) {
      matmult_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else if (hipstruct->blocky == 8) {
      matmult_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else if (hipstruct->blocky == 16) {
      matmult_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    } else {
      nblocks = 1 + (nrows - 1) / (2 * sliceheight);
      matmult_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    }
    break;
#endif
  case 6:
    nblocks = 1 + (nrows - 1) / (blocksize / 32); /* 1 slice per block if sliceheight=32 */
    matmult_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    break;
  case 5:
    nblocks = 1 + (nrows - 1) / (blocksize / 16); /* 2 slices per block if sliceheight=32 */
    matmult_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    break;
  case 4:
    nblocks = 1 + (nrows - 1) / (blocksize / 8); /* 4 slices per block if sliceheight=32 */
    matmult_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    break;
  case 3:
    nblocks = 1 + (nrows - 1) / (blocksize / 4); /* 8 slices per block if sliceheight=32 */
    matmult_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    break;
  case 2: /* 16 slices per block if sliceheight=32 */
    nblocks = 1 + (nrows - 1) / (blocksize / 2);
    matmult_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    break;
  case 1: /* 32 slices per block if sliceheight=32 */
    nblocks = 1 + (nrows - 1) / blocksize;
    matmult_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
    break;
#if !defined(PETSC_USE_COMPLEX)
  case 0:
    maxoveravg = a->maxslicewidth / a->avgslicewidth;
    if (maxoveravg > 12.0 && maxoveravg / nrows > 0.001) { /* important threshold */
      /* each block handles approximately one slice */
      PetscInt blocky = a->chunksize / 32;
      nchunks         = hipstruct->totalchunks;
      chunksperblock  = hipstruct->chunksperblock ? hipstruct->chunksperblock : 1 + (hipstruct->totalentries / hipstruct->totalslices - 1) / a->chunksize;
      nblocks         = 1 + (nchunks - 1) / chunksperblock;
      chunk_slice_map = hipstruct->chunk_slice_map;
      if (blocky == 2) {
        matmult_seqsell_tiled_kernel8<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
      } else if (blocky == 4) {
        matmult_seqsell_tiled_kernel8<4><<<nblocks, dim3(WARP_SIZE, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
      } else if (blocky == 8) {
        matmult_seqsell_tiled_kernel8<8><<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
      } else if (blocky == 16) {
        matmult_seqsell_tiled_kernel8<16><<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
      } else {
        matmult_seqsell_tiled_kernel8<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
      }
    } else {
      PetscInt avgslicesize = sliceheight * a->avgslicewidth;
      if (avgslicesize <= 432) {
        if (sliceheight * a->maxslicewidth < 2048 && nrows > 100000) {
          nblocks = 1 + (nrows - 1) / (2 * sliceheight); /* two slices per block */
          matmult_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
        } else {
          nblocks = 1 + (nrows - 1) / sliceheight;
          matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
        }
      } else if (avgslicesize <= 2400) {
        nblocks = 1 + (nrows - 1) / sliceheight;
        matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
      } else {
        nblocks = 1 + (nrows - 1) / sliceheight;
        matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
      }
    }
    break;
#endif
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported kernel choice %" PetscInt_FMT " for MatMult_SeqSELLHIP.", hipstruct->kernelchoice);
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(VecHIPRestoreArrayRead(xx, &x));
  PetscCall(VecHIPRestoreArrayWrite(yy, &y));
  PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatMultAdd_SeqSELLHIP(Mat A, Vec xx, Vec yy, Vec zz)
{
  Mat_SeqSELL       *a         = (Mat_SeqSELL *)A->data;
  Mat_SeqSELLHIP    *hipstruct = (Mat_SeqSELLHIP *)A->spptr;
  PetscScalar       *z;
  const PetscScalar *y, *x;
  PetscInt           nrows = A->rmap->n, sliceheight = a->sliceheight;
  MatScalar         *aval;
  PetscInt          *acolidx;
  PetscInt          *sliidx;
#if !defined(PETSC_USE_COMPLEX)
  PetscReal maxoveravg;
  PetscInt  chunksperblock, nchunks, *chunk_slice_map;
  PetscInt  blocky = hipstruct->blocky;
#endif

  PetscFunctionBegin;
  PetscCheck(WARP_SIZE % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires the slice height to be a divisor of WARP_SIZE, but the input matrix has a slice height of %" PetscInt_FMT, sliceheight);
  PetscCheck(!(hipstruct->kernelchoice >= 2 && hipstruct->kernelchoice <= 6 && sliceheight > 32), PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Kernel choices {2-6} require the slice height of the matrix to be no greater than 32, but the current slice height is %" PetscInt_FMT, sliceheight);
  PetscCall(MatSeqSELLHIPCopyToGPU(A));
  /* hipstruct may not be available until MatSeqSELLHIPCopyToGPU() is called */
  aval    = hipstruct->val;
  acolidx = hipstruct->colidx;
  sliidx  = hipstruct->sliidx;
  if (a->nz) {
    PetscInt nblocks, blocksize = 512;
    dim3     block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32);
    PetscCall(VecHIPGetArrayRead(xx, &x));
    PetscCall(VecHIPGetArrayRead(yy, &y));
    PetscCall(VecHIPGetArrayWrite(zz, &z));
    PetscCall(PetscLogGpuTimeBegin());

    switch (hipstruct->kernelchoice) {
#if !defined(PETSC_USE_COMPLEX)
    case 9:
      nblocks = 1 + (nrows - 1) / sliceheight;
      if (blocky == 2) {
        matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 4) {
        matmultadd_seqsell_tiled_kernel9<4><<<nblocks, dim3(WARP_SIZE, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 8) {
        matmultadd_seqsell_tiled_kernel9<8><<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 16) {
        matmultadd_seqsell_tiled_kernel9<16><<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else {
        matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      }
      break;
    case 8:
      /* each block handles approximately one slice */
      nchunks = hipstruct->totalchunks;
      blocky  = a->chunksize / 32;
      chunksperblock  = hipstruct->chunksperblock ? hipstruct->chunksperblock : 1 + (hipstruct->totalentries / hipstruct->totalslices - 1) / a->chunksize;
      nblocks         = 1 + (nchunks - 1) / chunksperblock;
      chunk_slice_map = hipstruct->chunk_slice_map;
      if (blocky == 2) {
        matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 4) {
        matmultadd_seqsell_tiled_kernel8<4><<<nblocks, dim3(WARP_SIZE, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 8) {
        matmultadd_seqsell_tiled_kernel8<8><<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 16) {
        matmultadd_seqsell_tiled_kernel8<16><<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
      } else {
        matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
      }
      break;
    case 7:
      nblocks = 1 + (nrows - 1) / (blocky * sliceheight);
      if (blocky == 2) {
        matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 4) {
        matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 8) {
        matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else if (blocky == 16) {
        matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      } else {
        nblocks = 1 + (nrows - 1) / (2 * sliceheight);
        matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      }
      break;
#endif
    case 6:
      nblocks = 1 + (nrows - 1) / (blocksize / 32);
      matmultadd_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      break;
    case 5:
      nblocks = 1 + (nrows - 1) / (blocksize / 16);
      matmultadd_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      break;
    case 4:
      nblocks = 1 + (nrows - 1) / (blocksize / 8);
      matmultadd_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      break;
    case 3:
      nblocks = 1 + (nrows - 1) / (blocksize / 4);
      matmultadd_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      break;
    case 2:
      nblocks = 1 + (nrows - 1) / (blocksize / 2);
      matmultadd_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      break;
    case 1:
      nblocks = 1 + (nrows - 1) / blocksize;
      matmultadd_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
      break;
#if !defined(PETSC_USE_COMPLEX)
    case 0:
      maxoveravg = a->maxslicewidth / a->avgslicewidth;
      if (maxoveravg > 12.0 && maxoveravg / nrows > 0.001) { /* important threshold */
        /* each block handles approximately one slice */
        nchunks = hipstruct->totalchunks;
        blocky  = a->chunksize / 32;
        chunksperblock  = hipstruct->chunksperblock ? hipstruct->chunksperblock : 1 + (hipstruct->totalentries / hipstruct->totalslices - 1) / a->chunksize;
        nblocks         = 1 + (nchunks - 1) / chunksperblock;
        chunk_slice_map = hipstruct->chunk_slice_map;
        if (blocky == 2) {
          matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
        } else if (blocky == 4) {
          matmultadd_seqsell_tiled_kernel8<4><<<nblocks, dim3(WARP_SIZE, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
        } else if (blocky == 8) {
          matmultadd_seqsell_tiled_kernel8<8><<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
        } else if (blocky == 16) {
          matmultadd_seqsell_tiled_kernel8<16><<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
        } else {
          matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
        }
      } else {
        PetscInt avgslicesize = sliceheight * a->avgslicewidth;
        if (avgslicesize <= 432) {
          if (sliceheight * a->maxslicewidth < 2048 && nrows > 100000) {
            nblocks = 1 + (nrows - 1) / (2 * sliceheight); /* two slices per block */
            matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
          } else {
            nblocks = 1 + (nrows - 1) / sliceheight;
            matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
          }
        } else if (avgslicesize <= 2400) {
          nblocks = 1 + (nrows - 1) / sliceheight;
          matmultadd_seqsell_tiled_kernel9<8><<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
        } else {
          nblocks = 1 + (nrows - 1) / sliceheight;
          matmultadd_seqsell_tiled_kernel9<16><<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
        }
      }
      break;
#endif
    default:
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported kernel choice %" PetscInt_FMT " for MatMultAdd_SeqSELLHIP.", hipstruct->kernelchoice);
    }
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(VecHIPRestoreArrayRead(xx, &x));
    PetscCall(VecHIPRestoreArrayRead(yy, &y));
    PetscCall(VecHIPRestoreArrayWrite(zz, &z));
    PetscCall(PetscLogGpuFlops(2.0 * a->nz));
  } else {
    PetscCall(VecCopy(yy, zz));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSetFromOptions_SeqSELLHIP(Mat A, PetscOptionItems *PetscOptionsObject)
{
  Mat_SeqSELLHIP *hipstruct = (Mat_SeqSELLHIP *)A->spptr;
  PetscInt        kernel, blocky;
  PetscBool       flg;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "SeqSELLHIP options");
  PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_hip_blocky", &blocky, &flg));
  if (flg) {
    PetscCheck(blocky == 2 || blocky == 4 || blocky == 8 || blocky == 16 || blocky == 32, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Unsupported blocky: %" PetscInt_FMT ", it should be in {2,4,8,16,32}", blocky);
    hipstruct->blocky = blocky;
  }
  PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_hip_kernel", &kernel, &flg));
  if (flg) {
    PetscCheck(kernel >= 0 && kernel <= 9, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Wrong kernel choice: %" PetscInt_FMT ", it should be in [0,9]", kernel);
    hipstruct->kernelchoice = kernel;
    if (kernel == 8) { PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_hip_chunksperblock", &hipstruct->chunksperblock, &flg)); }
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}
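
/* Example of the runtime options handled above (an illustrative command line; the
   executable name is a placeholder):
     ./app -mat_type seqsellhip -mat_sell_spmv_hip_kernel 8 -mat_sell_spmv_hip_chunksperblock 2
   selects the load-balancing kernel (8) with two chunks per block. */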

PETSC_INTERN PetscErrorCode MatAssemblyEnd_SpMV_Preprocessing_Private(Mat A)
{
  Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;

  PetscFunctionBegin;
  PetscCall(MatSeqSELLGetAvgSliceWidth(A, &a->avgslicewidth));
  PetscCall(MatSeqSELLGetMaxSliceWidth(A, &a->maxslicewidth));
  PetscCall(MatSeqSELLGetFillRatio(A, &a->fillratio));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatAssemblyEnd_SeqSELLHIP(Mat A, MatAssemblyType mode)
{
  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_SeqSELL(A, mode));
  PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(A));
  if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->factortype == MAT_FACTOR_NONE) { PetscCall(MatSeqSELLHIPCopyToGPU(A)); }
  A->ops->mult    = MatMult_SeqSELLHIP;
  A->ops->multadd = MatMultAdd_SeqSELLHIP;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatZeroEntries_SeqSELLHIP(Mat A)
{
  PetscBool    both = PETSC_FALSE;
  Mat_SeqSELL *a    = (Mat_SeqSELL *)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqSELLHIP *hipstruct = (Mat_SeqSELLHIP *)A->spptr;
    if (hipstruct->val) {
      both = PETSC_TRUE;
      PetscCallHIP(hipMemset(hipstruct->val, 0, a->sliidx[a->totalslices] * sizeof(*hipstruct->val)));
    }
  }
  PetscCall(PetscArrayzero(a->val, a->sliidx[a->totalslices]));
  PetscCall(MatSeqSELLInvalidateDiagonal(A));
  if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  else A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatDestroy_SeqSELLHIP(Mat A)
{
  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE && A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) PetscCall(MatSeqSELLHIP_Destroy((Mat_SeqSELLHIP **)&A->spptr));
  PetscCall(MatDestroy_SeqSELL(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLHIP(Mat);
static PetscErrorCode MatDuplicate_SeqSELLHIP(Mat A, MatDuplicateOption cpvalues, Mat *B)
{
  PetscFunctionBegin;
  PetscCall(MatDuplicate_SeqSELL(A, cpvalues, B));
  PetscCall(MatConvert_SeqSELL_SeqSELLHIP(*B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLHIP(Mat B)
{
  Mat_SeqSELLHIP *hipstruct;

  PetscFunctionBegin;
  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECHIP, &B->defaultvectype));

  if (!B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      PetscCall(PetscNew(&hipstruct));
      B->spptr = hipstruct;
    }
  }

  B->ops->assemblyend    = MatAssemblyEnd_SeqSELLHIP;
  B->ops->destroy        = MatDestroy_SeqSELLHIP;
  B->ops->setfromoptions = MatSetFromOptions_SeqSELLHIP;
  B->ops->mult           = MatMult_SeqSELLHIP;
  B->ops->multadd        = MatMultAdd_SeqSELLHIP;
  B->ops->duplicate      = MatDuplicate_SeqSELLHIP;
  B->ops->zeroentries    = MatZeroEntries_SeqSELLHIP;

  /* No need to assemble SeqSELL, but need to do the preprocessing for SpMV */
  PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(B));

  PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELLHIP));
  B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*MC
  MATSEQSELLHIP - MATSELLHIP = "(seq)sellhip" - A matrix type to be used for sparse matrices on AMD GPUs.

  Options Database Keys:
+ -mat_type seqsellhip - sets the matrix type to "seqsellhip" during a call to `MatSetFromOptions()`
. -mat_sell_spmv_hip_kernel - selects an SpMV kernel for MatSELLHIP
- -mat_sell_spmv_hip_blocky - sets the y dimension of the block size of the SpMV kernels. These kernels use a 2D block with the x dimension equal to the warp size (normally 64 for AMD GPUs)
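
  Example Usage (an illustrative command line; ./app stands for any PETSc application):
.vb
  ./app -mat_type seqsellhip -mat_sell_spmv_hip_kernel 7 -mat_sell_spmv_hip_blocky 4
.ve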

  Level: beginner

.seealso: [](ch_matrices), `Mat`, `MATSELLHIP`
M*/

PETSC_EXTERN PetscErrorCode MatCreate_SeqSELLHIP(Mat B)
{
  PetscFunctionBegin;
  PetscCall(MatCreate_SeqSELL(B));
  PetscCall(MatConvert_SeqSELL_SeqSELLHIP(B));
  PetscCall(MatSetFromOptions(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}