Actual source code: sellhip.hip.cpp
1: #include "hip/hip_runtime.h"
2: #include <hip/hip_runtime.h>
4: #include <petscdevice_hip.h>
5: #include <petsc/private/cupmatomics.hpp>
6: #include <../src/mat/impls/sell/seq/sell.h>
8: #define WARP_SIZE 64
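/* Descriptive note (added): the struct below is the device-side mirror of the host Mat_SeqSELL storage; every array member is a device pointer */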
10: typedef struct {
11: PetscInt maxallocmat;
12: PetscInt totalentries;
13: PetscInt *colidx; /* column index array, device pointer */
14: MatScalar *val; /* value array, device pointer */
15: PetscInt totalslices;
16: PetscInt *sliidx; /* slice index array, device pointer */
17: PetscInt nonzerostate;
18: PetscInt kernelchoice;
19: PetscInt blocky;
20: PetscInt chunksperblock;
21: PetscInt totalchunks;
22: PetscInt *chunk_slice_map; /* starting slice for each chunk, device pointer */
23: } Mat_SeqSELLHIP;
25: static PetscErrorCode MatSeqSELLHIP_Destroy(Mat_SeqSELLHIP **hipstruct)
26: {
27: PetscFunctionBegin;
28: if (*hipstruct) {
29: if ((*hipstruct)->colidx) { PetscCallHIP(hipFree((*hipstruct)->colidx)); }
30: if ((*hipstruct)->val) { PetscCallHIP(hipFree((*hipstruct)->val)); }
31: if ((*hipstruct)->sliidx) { PetscCallHIP(hipFree((*hipstruct)->sliidx)); }
32: if ((*hipstruct)->chunk_slice_map) { PetscCallHIP(hipFree((*hipstruct)->chunk_slice_map)); }
33: PetscCall(PetscFree(*hipstruct));
34: }
35: PetscFunctionReturn(PETSC_SUCCESS);
36: }
38: static PetscErrorCode MatSeqSELLHIPCopyToGPU(Mat A)
39: {
40: Mat_SeqSELLHIP *hipstruct = (Mat_SeqSELLHIP *)A->spptr;
41: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
43: PetscFunctionBegin;
44: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
45: PetscCall(PetscLogEventBegin(MAT_HIPCopyToGPU, A, 0, 0, 0));
46: if (A->assembled && A->nonzerostate == hipstruct->nonzerostate) {
47: /* copy values only */
48: PetscCallHIP(hipMemcpy(hipstruct->val, a->val, a->sliidx[a->totalslices] * sizeof(MatScalar), hipMemcpyHostToDevice));
49: PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar))));
50: } else {
51: if (hipstruct->colidx) PetscCallHIP(hipFree(hipstruct->colidx));
52: if (hipstruct->val) PetscCallHIP(hipFree(hipstruct->val));
53: if (hipstruct->sliidx) PetscCallHIP(hipFree(hipstruct->sliidx));
54: if (hipstruct->chunk_slice_map) PetscCallHIP(hipFree(hipstruct->chunk_slice_map));
55: hipstruct->maxallocmat = a->maxallocmat;
56: hipstruct->totalentries = a->sliidx[a->totalslices];
57: hipstruct->totalslices = a->totalslices;
58: hipstruct->totalchunks = a->totalchunks;
59: PetscCallHIP(hipMalloc((void **)&hipstruct->colidx, a->maxallocmat * sizeof(*hipstruct->colidx)));
60: PetscCallHIP(hipMalloc((void **)&hipstruct->val, a->maxallocmat * sizeof(*hipstruct->val)));
61: /* copy values, nz or maxallocmat? */
62: PetscCallHIP(hipMemcpy(hipstruct->colidx, a->colidx, a->sliidx[a->totalslices] * sizeof(*a->colidx), hipMemcpyHostToDevice));
63: PetscCallHIP(hipMemcpy(hipstruct->val, a->val, a->sliidx[a->totalslices] * sizeof(*a->val), hipMemcpyHostToDevice));
65: PetscCallHIP(hipMalloc((void **)&hipstruct->sliidx, (a->totalslices + 1) * sizeof(*hipstruct->sliidx)));
66: PetscCallHIP(hipMemcpy(hipstruct->sliidx, a->sliidx, (a->totalslices + 1) * sizeof(*a->sliidx), hipMemcpyHostToDevice));
67: PetscCallHIP(hipMalloc((void **)&hipstruct->chunk_slice_map, a->totalchunks * sizeof(*hipstruct->chunk_slice_map)));
68: PetscCallHIP(hipMemcpy(hipstruct->chunk_slice_map, a->chunk_slice_map, a->totalchunks * sizeof(*a->chunk_slice_map), hipMemcpyHostToDevice));
69: PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar) + sizeof(PetscInt)) + (a->totalslices + 1 + a->totalchunks) * sizeof(PetscInt)));
70: }
71: PetscCallHIP(WaitForHIP());
72: PetscCall(PetscLogEventEnd(MAT_HIPCopyToGPU, A, 0, 0, 0));
73: A->offloadmask = PETSC_OFFLOAD_BOTH;
74: }
75: PetscFunctionReturn(PETSC_SUCCESS);
76: }
78: static __global__ void matmult_seqsell_basic_kernel(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
79: {
80: PetscInt i, row, slice_id, row_in_slice;
81: MatScalar sum;
82: /* one thread per row. */
83: row = blockIdx.x * blockDim.x + threadIdx.x;
84: if (row < nrows) {
85: slice_id = row / sliceheight;
86: row_in_slice = row % sliceheight;
87: sum = 0.0;
88: for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += sliceheight) sum += aval[i] * x[acolidx[i]];
89: y[row] = sum;
90: }
91: }
93: static __global__ void matmultadd_seqsell_basic_kernel(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
94: {
95: PetscInt i, row, slice_id, row_in_slice;
96: MatScalar sum;
97: /* one thread per row. */
98: row = blockIdx.x * blockDim.x + threadIdx.x;
99: if (row < nrows) {
100: slice_id = row / sliceheight;
101: row_in_slice = row % sliceheight;
102: sum = 0.0;
103: for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += sliceheight) sum += aval[i] * x[acolidx[i]];
104: z[row] = y[row] + sum;
105: }
106: }
108: #if !defined(PETSC_USE_COMPLEX)
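/* Descriptive note (added): the kernels in this block perform __shfl_down() reductions on PetscScalar and are compiled only for real scalars */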
109: PETSC_PRAGMA_DIAGNOSTIC_IGNORED_BEGIN("-Wpass-failed")
110: /* use 1 block per slice, suitable for large slice width */
111: template <int BLOCKY>
112: __global__ void matmult_seqsell_tiled_kernel9(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
113: {
114: __shared__ MatScalar shared[WARP_SIZE][BLOCKY];
115: PetscInt i, row, slice_id = blockIdx.x;
116: int tid = threadIdx.x + threadIdx.y * WARP_SIZE;
117: /* transposed index */
118: int tidx = tid % BLOCKY;
119: int tidy = tid / BLOCKY;
120: PetscScalar t = 0.0;
122: row = slice_id * sliceheight + threadIdx.x % sliceheight;
123: if (row < nrows) {
124: for (i = sliidx[slice_id] + threadIdx.x + WARP_SIZE * threadIdx.y; i < sliidx[slice_id + 1]; i += WARP_SIZE * BLOCKY) t += aval[i] * x[acolidx[i]];
125: }
126: #pragma unroll
127: for (int offset = WARP_SIZE / 2; offset >= sliceheight; offset /= 2) { t += __shfl_down(t, offset); }
128: /* transpose layout to reduce each row using warp shfl */
129: if (threadIdx.x < sliceheight) shared[threadIdx.x][threadIdx.y] = t;
130: __syncthreads();
131: if (tidy < sliceheight) t = shared[tidy][tidx];
132: #pragma unroll
133: for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down(t, offset, BLOCKY); }
134: if (tidx == 0 && tidy < sliceheight) { shared[0][tidy] = t; }
135: __syncthreads();
136: if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) { y[row] = shared[0][threadIdx.x]; }
137: }
139: /* use 1 block per slice, suitable for large slice width */
140: template <int BLOCKY>
141: __global__ void matmultadd_seqsell_tiled_kernel9(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
142: {
143: __shared__ MatScalar shared[WARP_SIZE][BLOCKY];
144: PetscInt i, row, slice_id = blockIdx.x;
145: int tid = threadIdx.x + threadIdx.y * WARP_SIZE;
146: /* transposed index */
147: int tidx = tid % BLOCKY;
148: int tidy = tid / BLOCKY;
149: PetscScalar t = 0.0;
151: row = slice_id * sliceheight + threadIdx.x % sliceheight;
152: if (row < nrows) {
153: for (i = sliidx[slice_id] + threadIdx.x + WARP_SIZE * threadIdx.y; i < sliidx[slice_id + 1]; i += WARP_SIZE * BLOCKY) t += aval[i] * x[acolidx[i]];
154: }
155: #pragma unroll
156: for (int offset = WARP_SIZE / 2; offset >= sliceheight; offset /= 2) { t += __shfl_down(t, offset); }
157: /* transpose layout to reduce each row using warp shfl */
158: if (threadIdx.x < sliceheight) shared[threadIdx.x][threadIdx.y] = t;
159: __syncthreads();
160: if (tidy < sliceheight) t = shared[tidy][tidx];
161: #pragma unroll
162: for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down(t, offset, BLOCKY); }
163: if (tidx == 0 && tidy < sliceheight) { shared[0][tidy] = t; }
164: __syncthreads();
165: if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) { z[row] = y[row] + shared[0][threadIdx.x]; }
166: }
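/* Descriptive note (added): segment_scan() is a segmented reduction over the per-half-warp partial sums used by the
   load-balancing kernels below. flag[] holds the slice id owned by each half-warp; partial sums belonging to the same
   slice are combined, and the function returns true only for the first half-warp of a slice, which is the one that
   writes the accumulated result to the output vector */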
168: template <int BLOCKY>
169: __device__ __forceinline__ static bool segment_scan(PetscInt flag[], MatScalar shared[], PetscScalar *val)
170: {
171: bool head = true;
172: #pragma unroll
173: for (int i = 1; i < BLOCKY * 2; i <<= 1) {
174: int halfwarpid = threadIdx.y * 2 + threadIdx.x / (WARP_SIZE / 2);
175: shared[threadIdx.x + threadIdx.y * WARP_SIZE] = 0;
176: if (halfwarpid >= i && flag[halfwarpid - i] == flag[halfwarpid]) {
177: shared[threadIdx.x + threadIdx.y * WARP_SIZE] = *val;
178: if (i == 1) head = false;
179: }
180: __syncthreads();
181: if (halfwarpid < BLOCKY * 2 - i) *val += shared[threadIdx.x + threadIdx.y * WARP_SIZE + i * WARP_SIZE];
182: __syncthreads();
183: }
184: return head;
185: }
187: /* load-balancing version. Chunksize is equal to the number of threads per block */
188: template <int BLOCKY>
189: __global__ void matmult_seqsell_tiled_kernel8(PetscInt nrows, PetscInt sliceheight, PetscInt chunksperblock, PetscInt totalchunks, const PetscInt *chunk_slice_map, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
190: {
191: __shared__ MatScalar shared[BLOCKY * WARP_SIZE];
192: PetscInt gid, row, start_slice, cid;
193: PetscScalar t = 0.0;
194: AtomicAdd<MatScalar> atomAdd;
195: /* zero out y */
196: for (int iter = 0; iter < 1 + (nrows - 1) / (gridDim.x * WARP_SIZE * BLOCKY); iter++) {
197: gid = gridDim.x * WARP_SIZE * BLOCKY * iter + blockIdx.x * BLOCKY * WARP_SIZE + threadIdx.y * WARP_SIZE + threadIdx.x;
198: if (gid < nrows) y[gid] = 0.0;
199: }
200: for (int iter = 0; iter < chunksperblock; iter++) {
201: cid = blockIdx.x * chunksperblock + iter; /* chunk id */
202: if (cid < totalchunks) {
203: start_slice = chunk_slice_map[cid]; /* starting slice at each iteration */
204: gid = cid * BLOCKY * WARP_SIZE + threadIdx.y * WARP_SIZE + threadIdx.x;
205: if ((cid + 1) * BLOCKY * WARP_SIZE > sliidx[start_slice + 1]) { /* this iteration covers more than one slice */
206: __shared__ PetscInt flag[BLOCKY * 2];
207: bool write;
208: PetscInt slice_id = start_slice, totalslices = PetscCeilIntMacro(nrows, sliceheight), totalentries = sliidx[totalslices];
209: /* find out the slice that this element belongs to */
210: while (gid < totalentries && gid >= sliidx[slice_id + 1]) slice_id++;
211: if (threadIdx.x % (WARP_SIZE / 2) == 0) flag[threadIdx.y * 2 + threadIdx.x / (WARP_SIZE / 2)] = slice_id;
212: row = slice_id * sliceheight + threadIdx.x % sliceheight;
213: if (row < nrows && gid < totalentries) t = aval[gid] * x[acolidx[gid]];
214: __syncthreads();
215: write = segment_scan<BLOCKY>(flag, shared, &t);
216: if (row < nrows && gid < totalentries && write) atomAdd(y[row], t);
217: t = 0.0;
218: } else { /* this iteration covers only one slice */
219: row = start_slice * sliceheight + threadIdx.x % sliceheight;
220: if (row < nrows) t += aval[gid] * x[acolidx[gid]];
221: if (iter == chunksperblock - 1 || (cid + 2) * BLOCKY * WARP_SIZE > sliidx[start_slice + 1]) { /* last iteration or next iteration covers more than one slice */
222: int tid = threadIdx.x + threadIdx.y * WARP_SIZE, tidx = tid % BLOCKY, tidy = tid / BLOCKY;
223: /* reduction and write to output vector */
224: #pragma unroll
225: for (int offset = WARP_SIZE / 2; offset >= sliceheight; offset /= 2) { t += __shfl_down(t, offset); }
226: /* transpose layout to reduce each row using warp shfl */
227: if (threadIdx.x < sliceheight) shared[threadIdx.x * BLOCKY + threadIdx.y] = t; /* shared[threadIdx.x][threadIdx.y] = t */
228: __syncthreads();
229: if (tidy < sliceheight) t = shared[tidy * BLOCKY + tidx]; /* shared[tidy][tidx] */
230: #pragma unroll
231: for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down(t, offset, BLOCKY); }
232: if (tidx == 0 && tidy < sliceheight) { shared[tidy] = t; /* shared[0][tidy] = t */ }
233: __syncthreads();
234: if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) atomAdd(y[row], shared[threadIdx.x]); /* shared[0][threadIdx.x] */
235: t = 0.0;
236: }
237: }
238: }
239: }
240: }
242: /* load-balancing version. Chunksize is equal to the number of threads per block */
243: template <int BLOCKY>
244: __global__ void matmultadd_seqsell_tiled_kernel8(PetscInt nrows, PetscInt sliceheight, PetscInt chunksperblock, PetscInt totalchunks, const PetscInt *chunk_slice_map, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
245: {
246: __shared__ MatScalar shared[BLOCKY * WARP_SIZE];
247: PetscInt gid, row, start_slice, cid;
248: PetscScalar t = 0.0;
249: AtomicAdd<MatScalar> atomAdd;
250: /* copy y to z */
251: for (int iter = 0; iter < 1 + (nrows - 1) / (gridDim.x * WARP_SIZE * BLOCKY); iter++) {
252: gid = gridDim.x * WARP_SIZE * BLOCKY * iter + blockIdx.x * BLOCKY * WARP_SIZE + threadIdx.y * WARP_SIZE + threadIdx.x;
253: if (gid < nrows) z[gid] = y[gid];
254: }
255: for (int iter = 0; iter < chunksperblock; iter++) {
256: cid = blockIdx.x * chunksperblock + iter; /* chunk id */
257: if (cid < totalchunks) {
258: start_slice = chunk_slice_map[cid]; /* starting slice at each iteration */
259: gid = cid * BLOCKY * WARP_SIZE + threadIdx.y * WARP_SIZE + threadIdx.x;
260: if ((cid + 1) * BLOCKY * WARP_SIZE > sliidx[start_slice + 1]) { /* this iteration covers more than one slice */
261: __shared__ PetscInt flag[BLOCKY * 2];
262: bool write;
263: PetscInt slice_id = start_slice, totalslices = PetscCeilIntMacro(nrows, sliceheight), totalentries = sliidx[totalslices];
264: /* find out the slice that this element belongs to */
265: while (gid < totalentries && gid >= sliidx[slice_id + 1]) slice_id++;
266: if (threadIdx.x % (WARP_SIZE / 2) == 0) flag[threadIdx.y * 2 + threadIdx.x / (WARP_SIZE / 2)] = slice_id;
267: row = slice_id * sliceheight + threadIdx.x % sliceheight;
268: if (row < nrows && gid < totalentries) t = aval[gid] * x[acolidx[gid]];
269: __syncthreads();
270: write = segment_scan<BLOCKY>(flag, shared, &t);
271: if (row < nrows && gid < totalentries && write) atomAdd(z[row], t);
272: t = 0.0;
273: } else { /* this iteration covers only one slice */
274: row = start_slice * sliceheight + threadIdx.x % sliceheight;
275: if (row < nrows) t += aval[gid] * x[acolidx[gid]];
276: if (iter == chunksperblock - 1 || (cid + 2) * BLOCKY * WARP_SIZE > sliidx[start_slice + 1]) { /* last iteration or next iteration covers more than one slice */
277: int tid = threadIdx.x + threadIdx.y * WARP_SIZE, tidx = tid % BLOCKY, tidy = tid / BLOCKY;
278: /* reduction and write to output vector */
279: #pragma unroll
280: for (int offset = WARP_SIZE / 2; offset >= sliceheight; offset /= 2) { t += __shfl_down(t, offset); }
281: /* transpose layout to reduce each row using warp shfl */
282: if (threadIdx.x < sliceheight) shared[threadIdx.x * BLOCKY + threadIdx.y] = t; /* shared[threadIdx.x][threadIdx.y] = t */
283: __syncthreads();
284: if (tidy < sliceheight) t = shared[tidy * BLOCKY + tidx]; /* shared[tidy][tidx] */
285: #pragma unroll
286: for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down(t, offset, BLOCKY); }
287: if (tidx == 0 && tidy < sliceheight) { shared[tidy] = t; /* shared[0][tidy] = t */ }
288: __syncthreads();
289: if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) atomAdd(z[row], shared[threadIdx.x]); /* shared[0][threadIdx.x] */
290: t = 0.0;
291: }
292: }
293: }
294: }
295: }
297: /* use 1 warp per slice, suitable for small slice width */
298: static __global__ void matmult_seqsell_tiled_kernel7(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
299: {
300: PetscInt i, row, slice_id;
301: slice_id = blockIdx.x * blockDim.y + threadIdx.y;
302: row = slice_id * sliceheight + threadIdx.x % sliceheight;
303: PetscScalar t = 0.0;
304: if (row < nrows) {
305: for (i = sliidx[slice_id] + threadIdx.x; i < sliidx[slice_id + 1]; i += WARP_SIZE) t += aval[i] * x[acolidx[i]];
306: }
307: #pragma unroll
308: for (int offset = WARP_SIZE / 2; offset >= sliceheight; offset /= 2) { t += __shfl_down(t, offset); }
309: if (row < nrows && threadIdx.x < sliceheight) { y[row] = t; }
310: }
312: /* use 1 warp per slice, suitable for small slice width */
313: static __global__ void matmultadd_seqsell_tiled_kernel7(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
314: {
315: PetscInt i, row, slice_id;
316: slice_id = blockIdx.x * blockDim.y + threadIdx.y;
317: row = slice_id * sliceheight + threadIdx.x % sliceheight;
318: PetscScalar t = 0.0;
319: if (row < nrows) {
320: for (i = sliidx[slice_id] + threadIdx.x; i < sliidx[slice_id + 1]; i += WARP_SIZE) t += aval[i] * x[acolidx[i]];
321: }
322: #pragma unroll
323: for (int offset = WARP_SIZE / 2; offset >= sliceheight; offset /= 2) { t += __shfl_down(t, offset); }
324: if (row < nrows && threadIdx.x < sliceheight) { z[row] = y[row] + t; }
325: }
326: PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END()
327: #endif
329: /*********** Kernels 2-6 require a slice height smaller than 512, 256, 128, 64, 32, respectively. They are kept only for performance comparison **********/
331: static __global__ void matmult_seqsell_tiled_kernel6(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
332: {
333: __shared__ MatScalar shared[32 * 16];
334: PetscInt i, row, slice_id, row_in_slice;
335: /* multiple threads per row. */
336: row = blockIdx.x * blockDim.x + threadIdx.x;
337: if (row < nrows) {
338: slice_id = row / sliceheight;
339: row_in_slice = row % sliceheight;
341: shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
342: for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
343: __syncthreads();
344: if (threadIdx.y < 16) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x]; }
345: __syncthreads();
346: if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
347: __syncthreads();
348: if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
349: __syncthreads();
350: if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
351: __syncthreads();
352: if (threadIdx.y < 1) {
353: shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
354: y[row] = shared[threadIdx.x];
355: }
356: }
357: }
359: static __global__ void matmult_seqsell_tiled_kernel5(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
360: {
361: __shared__ MatScalar shared[32 * 16];
362: PetscInt i, row, slice_id, row_in_slice;
363: /* multiple threads per row. */
364: row = blockIdx.x * blockDim.x + threadIdx.x;
365: if (row < nrows) {
366: slice_id = row / sliceheight;
367: row_in_slice = row % sliceheight;
369: shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
370: for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
371: __syncthreads();
372: if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
373: __syncthreads();
374: if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
375: __syncthreads();
376: if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
377: __syncthreads();
378: if (threadIdx.y < 1) {
379: shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
380: y[row] = shared[threadIdx.x];
381: }
382: }
383: }
385: static __global__ void matmult_seqsell_tiled_kernel4(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
386: {
387: __shared__ MatScalar shared[32 * 16];
388: PetscInt i, row, slice_id, row_in_slice;
389: /* multiple threads per row. */
390: row = blockIdx.x * blockDim.x + threadIdx.x;
391: if (row < nrows) {
392: slice_id = row / sliceheight;
393: row_in_slice = row % sliceheight;
395: shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
396: for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
397: __syncthreads();
398: if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
399: __syncthreads();
400: if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
401: __syncthreads();
402: if (threadIdx.y < 1) {
403: shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
404: y[row] = shared[threadIdx.x];
405: }
406: }
407: }
409: static __global__ void matmult_seqsell_tiled_kernel3(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
410: {
411: __shared__ MatScalar shared[32 * 16];
412: PetscInt i, row, slice_id, row_in_slice;
413: /* multiple threads per row. */
414: row = blockIdx.x * blockDim.x + threadIdx.x;
415: if (row < nrows) {
416: slice_id = row / sliceheight;
417: row_in_slice = row % sliceheight;
419: shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
420: for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
421: __syncthreads();
422: if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
423: __syncthreads();
424: if (threadIdx.y < 1) {
425: shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
426: y[row] = shared[threadIdx.x];
427: }
428: }
429: }
431: static __global__ void matmult_seqsell_tiled_kernel2(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
432: {
433: __shared__ MatScalar shared[32 * 16];
434: PetscInt i, row, slice_id, row_in_slice;
435: /* multiple threads per row. */
436: row = blockIdx.x * blockDim.x + threadIdx.x;
437: if (row < nrows) {
438: slice_id = row / sliceheight;
439: row_in_slice = row % sliceheight;
441: shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
442: for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
443: __syncthreads();
444: if (threadIdx.y < 1) {
445: shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
446: y[row] = shared[threadIdx.x];
447: }
448: }
449: }
451: static __global__ void matmultadd_seqsell_tiled_kernel6(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
452: {
453: __shared__ MatScalar shared[32 * 16];
454: PetscInt i, row, slice_id, row_in_slice;
455: /* multiple threads per row. */
456: row = blockIdx.x * blockDim.x + threadIdx.x;
457: if (row < nrows) {
458: slice_id = row / sliceheight;
459: row_in_slice = row % sliceheight;
461: shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
462: for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
463: __syncthreads();
464: if (threadIdx.y < 16) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x]; }
465: __syncthreads();
466: if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
467: __syncthreads();
468: if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
469: __syncthreads();
470: if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
471: __syncthreads();
472: if (threadIdx.y < 1) {
473: shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
474: z[row] = y[row] + shared[threadIdx.x];
475: }
476: }
477: }
479: static __global__ void matmultadd_seqsell_tiled_kernel5(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
480: {
481: __shared__ MatScalar shared[32 * 16];
482: PetscInt i, row, slice_id, row_in_slice;
483: /* multiple threads per row. */
484: row = blockIdx.x * blockDim.x + threadIdx.x;
485: if (row < nrows) {
486: slice_id = row / sliceheight;
487: row_in_slice = row % sliceheight;
489: shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
490: for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
491: __syncthreads();
492: if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
493: __syncthreads();
494: if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
495: __syncthreads();
496: if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
497: __syncthreads();
498: if (threadIdx.y < 1) {
499: shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
500: z[row] = y[row] + shared[threadIdx.x];
501: }
502: }
503: }
505: static __global__ void matmultadd_seqsell_tiled_kernel4(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
506: {
507: __shared__ MatScalar shared[32 * 16];
508: PetscInt i, row, slice_id, row_in_slice;
509: /* multiple threads per row. */
510: row = blockIdx.x * blockDim.x + threadIdx.x;
511: if (row < nrows) {
512: slice_id = row / sliceheight;
513: row_in_slice = row % sliceheight;
515: shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
516: for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
517: __syncthreads();
518: if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
519: __syncthreads();
520: if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
521: __syncthreads();
522: if (threadIdx.y < 1) {
523: shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
524: z[row] = y[row] + shared[threadIdx.x];
525: }
526: }
527: }
529: static __global__ void matmultadd_seqsell_tiled_kernel3(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
530: {
531: __shared__ MatScalar shared[32 * 16];
532: PetscInt i, row, slice_id, row_in_slice;
533: /* multiple threads per row. */
534: row = blockIdx.x * blockDim.x + threadIdx.x;
535: if (row < nrows) {
536: slice_id = row / sliceheight;
537: row_in_slice = row % sliceheight;
539: shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
540: for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
541: __syncthreads();
542: if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
543: __syncthreads();
544: if (threadIdx.y < 1) {
545: shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
546: z[row] = y[row] + shared[threadIdx.x];
547: }
548: }
549: }
551: static __global__ void matmultadd_seqsell_tiled_kernel2(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
552: {
553: __shared__ MatScalar shared[32 * 16];
554: PetscInt i, row, slice_id, row_in_slice;
555: /* multiple threads per row. */
556: row = blockIdx.x * blockDim.x + threadIdx.x;
557: if (row < nrows) {
558: slice_id = row / sliceheight;
559: row_in_slice = row % sliceheight;
561: shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
562: for (i = sliidx[slice_id] + row_in_slice + sliceheight * threadIdx.y; i < sliidx[slice_id + 1]; i += sliceheight * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
563: __syncthreads();
564: if (threadIdx.y < 1) {
565: shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
566: z[row] = y[row] + shared[threadIdx.x];
567: }
568: }
569: }
571: static PetscErrorCode MatMult_SeqSELLHIP(Mat A, Vec xx, Vec yy)
572: {
573: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
574: Mat_SeqSELLHIP *hipstruct = (Mat_SeqSELLHIP *)A->spptr;
575: PetscScalar *y;
576: const PetscScalar *x;
577: PetscInt nrows = A->rmap->n, sliceheight = a->sliceheight;
578: MatScalar *aval;
579: PetscInt *acolidx;
580: PetscInt *sliidx;
581: PetscInt nblocks, blocksize = 512; /* blocksize is fixed to be 512 */
582: dim3 block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32);
583: #if !defined(PETSC_USE_COMPLEX)
584: PetscInt chunksperblock, nchunks, *chunk_slice_map;
585: PetscReal maxoveravg;
586: #endif
588: PetscFunctionBegin;
589: PetscCheck(WARP_SIZE % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires the slice height to be a divisor of WARP_SIZE, but the input matrix has a slice height of %" PetscInt_FMT, sliceheight);
590: PetscCheck(!(hipstruct->kernelchoice >= 2 && hipstruct->kernelchoice <= 6 && sliceheight > 32), PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Kernel choices {2-6} require the slice height of the matrix to be at most 32, but the current slice height is %" PetscInt_FMT, sliceheight);
591: PetscCall(MatSeqSELLHIPCopyToGPU(A));
592: /* the device arrays in hipstruct may not be allocated until MatSeqSELLHIPCopyToGPU() is called */
593: aval = hipstruct->val;
594: acolidx = hipstruct->colidx;
595: sliidx = hipstruct->sliidx;
597: PetscCall(VecHIPGetArrayRead(xx, &x));
598: PetscCall(VecHIPGetArrayWrite(yy, &y));
599: PetscCall(PetscLogGpuTimeBegin());
601: switch (hipstruct->kernelchoice) {
602: #if !defined(PETSC_USE_COMPLEX)
603: case 9: /* 1 slice per block */
604: nblocks = 1 + (nrows - 1) / sliceheight;
605: if (hipstruct->blocky == 2) {
606: matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
607: } else if (hipstruct->blocky == 4) {
608: matmult_seqsell_tiled_kernel9<4><<<nblocks, dim3(WARP_SIZE, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
609: } else if (hipstruct->blocky == 8) {
610: matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
611: } else if (hipstruct->blocky == 16) {
612: matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
613: } else {
614: matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
615: }
616: break;
617: case 7: /* each block handles blocky slices */
618: nblocks = 1 + (nrows - 1) / (hipstruct->blocky * sliceheight);
619: if (hipstruct->blocky == 2) {
620: matmult_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
621: } else if (hipstruct->blocky == 4) {
622: matmult_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
623: } else if (hipstruct->blocky == 8) {
624: matmult_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
625: } else if (hipstruct->blocky == 16) {
626: matmult_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
627: } else {
628: nblocks = 1 + (nrows - 1) / (2 * sliceheight);
629: matmult_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
630: }
631: break;
632: #endif
633: case 6:
634: nblocks = 1 + (nrows - 1) / (blocksize / 32); /* 1 slice per block if sliceheight=32 */
635: matmult_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
636: break;
637: case 5:
638: nblocks = 1 + (nrows - 1) / (blocksize / 16); /* 2 slices per block if sliceheight=32*/
639: matmult_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
640: break;
641: case 4:
642: nblocks = 1 + (nrows - 1) / (blocksize / 8); /* 4 slices per block if sliceheight=32 */
643: matmult_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
644: break;
645: case 3:
646: nblocks = 1 + (nrows - 1) / (blocksize / 4); /* 8 slices per block if sliceheight=32 */
647: matmult_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
648: break;
649: case 2: /* 16 slices per block if sliceheight=32 */
650: nblocks = 1 + (nrows - 1) / (blocksize / 2);
651: matmult_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
652: break;
653: case 1: /* 32 slices per block if sliceheight=32 */
654: nblocks = 1 + (nrows - 1) / blocksize;
655: matmult_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
656: break;
657: #if !defined(PETSC_USE_COMPLEX)
658: case 0:
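/* Descriptive note (added): kernel 0 selects a kernel heuristically; a large max-to-average slice-width ratio indicates
   an irregular matrix and triggers the load-balanced kernel 8, otherwise kernel 7 or kernel 9 is chosen based on the
   average slice size */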
659: maxoveravg = a->maxslicewidth / a->avgslicewidth;
660: if (maxoveravg > 12.0 && maxoveravg / nrows > 0.001) { /* important threshold */
661: /* each block handles approximately one slice */
662: PetscInt blocky = a->chunksize / 32;
663: nchunks = hipstruct->totalchunks;
664: chunksperblock = hipstruct->chunksperblock ? hipstruct->chunksperblock : 1 + (hipstruct->totalentries / hipstruct->totalslices - 1) / a->chunksize;
665: nblocks = 1 + (nchunks - 1) / chunksperblock;
666: chunk_slice_map = hipstruct->chunk_slice_map;
667: if (blocky == 2) {
668: matmult_seqsell_tiled_kernel8<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
669: } else if (blocky == 4) {
670: matmult_seqsell_tiled_kernel8<4><<<nblocks, dim3(WARP_SIZE, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
671: } else if (blocky == 8) {
672: matmult_seqsell_tiled_kernel8<8><<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
673: } else if (blocky == 16) {
674: matmult_seqsell_tiled_kernel8<16><<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
675: } else {
676: matmult_seqsell_tiled_kernel8<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
677: }
678: } else {
679: PetscInt avgslicesize = sliceheight * a->avgslicewidth;
680: if (avgslicesize <= 432) {
681: if (sliceheight * a->maxslicewidth < 2048 && nrows > 100000) {
682: nblocks = 1 + (nrows - 1) / (2 * sliceheight); /* two slices per block */
683: matmult_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
684: } else {
685: nblocks = 1 + (nrows - 1) / sliceheight;
686: matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
687: }
688: } else if (avgslicesize <= 2400) {
689: nblocks = 1 + (nrows - 1) / sliceheight;
690: matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
691: } else {
692: nblocks = 1 + (nrows - 1) / sliceheight;
693: matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
694: }
695: }
696: break;
697: #endif
698: default:
699: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported kernel choice %" PetscInt_FMT " for MatMult_SeqSELLHIP.", hipstruct->kernelchoice);
700: }
701: PetscCall(PetscLogGpuTimeEnd());
702: PetscCall(VecHIPRestoreArrayRead(xx, &x));
703: PetscCall(VecHIPRestoreArrayWrite(yy, &y));
704: PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
705: PetscFunctionReturn(PETSC_SUCCESS);
706: }
708: static PetscErrorCode MatMultAdd_SeqSELLHIP(Mat A, Vec xx, Vec yy, Vec zz)
709: {
710: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
711: Mat_SeqSELLHIP *hipstruct = (Mat_SeqSELLHIP *)A->spptr;
712: PetscScalar *z;
713: const PetscScalar *y, *x;
714: PetscInt nrows = A->rmap->n, sliceheight = a->sliceheight;
715: MatScalar *aval;
716: PetscInt *acolidx;
717: PetscInt *sliidx;
718: #if !defined(PETSC_USE_COMPLEX)
719: PetscReal maxoveravg;
720: PetscInt chunksperblock, nchunks, *chunk_slice_map;
721: PetscInt blocky = hipstruct->blocky;
722: #endif
724: PetscFunctionBegin;
725: PetscCheck(WARP_SIZE % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires the slice height to be a divisor of WARP_SIZE, but the input matrix has a slice height of %" PetscInt_FMT, sliceheight);
726: PetscCheck(!(hipstruct->kernelchoice >= 2 && hipstruct->kernelchoice <= 6 && sliceheight > 32), PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Kernel choices {2-6} require the slice height of the matrix to be at most 32, but the current slice height is %" PetscInt_FMT, sliceheight);
727: PetscCall(MatSeqSELLHIPCopyToGPU(A));
/* the device arrays in hipstruct may be reallocated by MatSeqSELLHIPCopyToGPU(), so read the pointers only after the copy */
aval    = hipstruct->val;
acolidx = hipstruct->colidx;
sliidx  = hipstruct->sliidx;
728: if (a->nz) {
729: PetscInt nblocks, blocksize = 512;
730: dim3 block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32);
731: PetscCall(VecHIPGetArrayRead(xx, &x));
732: PetscCall(VecHIPGetArrayRead(yy, &y));
733: PetscCall(VecHIPGetArrayWrite(zz, &z));
734: PetscCall(PetscLogGpuTimeBegin());
736: switch (hipstruct->kernelchoice) {
737: #if !defined(PETSC_USE_COMPLEX)
738: case 9:
739: nblocks = 1 + (nrows - 1) / sliceheight;
740: if (blocky == 2) {
741: matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
742: } else if (blocky == 4) {
743: matmultadd_seqsell_tiled_kernel9<4><<<nblocks, dim3(WARP_SIZE, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
744: } else if (blocky == 8) {
745: matmultadd_seqsell_tiled_kernel9<8><<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
746: } else if (blocky == 16) {
747: matmultadd_seqsell_tiled_kernel9<16><<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
748: } else {
749: matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
750: }
751: break;
752: case 8:
753: /* each block handles approximately one slice */
754: nchunks = hipstruct->totalchunks;
755: blocky = a->chunksize / 32;
756: chunksperblock = hipstruct->chunksperblock ? hipstruct->chunksperblock : 1 + (hipstruct->totalentries / hipstruct->totalslices - 1) / a->chunksize;
757: nblocks = 1 + (nchunks - 1) / chunksperblock;
758: chunk_slice_map = hipstruct->chunk_slice_map;
759: if (blocky == 2) {
760: matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
761: } else if (blocky == 4) {
762: matmultadd_seqsell_tiled_kernel8<4><<<nblocks, dim3(WARP_SIZE, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
763: } else if (blocky == 8) {
764: matmultadd_seqsell_tiled_kernel8<8><<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
765: } else if (blocky == 16) {
766: matmultadd_seqsell_tiled_kernel8<16><<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
767: } else {
768: matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
769: }
770: break;
771: case 7:
772: nblocks = 1 + (nrows - 1) / (blocky * sliceheight);
773: if (blocky == 2) {
774: matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
775: } else if (blocky == 4) {
776: matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
777: } else if (blocky == 8) {
778: matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
779: } else if (blocky == 16) {
780: matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
781: } else {
782: nblocks = 1 + (nrows - 1) / (2 * sliceheight);
783: matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
784: }
785: break;
786: #endif
787: case 6:
788: nblocks = 1 + (nrows - 1) / (blocksize / 32);
789: matmultadd_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
790: break;
791: case 5:
792: nblocks = 1 + (nrows - 1) / (blocksize / 16);
793: matmultadd_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
794: break;
795: case 4:
796: nblocks = 1 + (nrows - 1) / (blocksize / 8);
797: matmultadd_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
798: break;
799: case 3:
800: nblocks = 1 + (nrows - 1) / (blocksize / 4);
801: matmultadd_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
802: break;
803: case 2:
804: nblocks = 1 + (nrows - 1) / (blocksize / 2);
805: matmultadd_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
806: break;
807: case 1:
808: nblocks = 1 + (nrows - 1) / blocksize;
809: matmultadd_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
810: break;
811: #if !defined(PETSC_USE_COMPLEX)
812: case 0:
813: maxoveravg = a->maxslicewidth / a->avgslicewidth;
814: if (maxoveravg > 12.0 && maxoveravg / nrows > 0.001) { /* important threshold */
815: /* each block handles approximately one slice */
816: nchunks = hipstruct->totalchunks;
817: blocky = a->chunksize / 32;
818: chunksperblock = hipstruct->chunksperblock ? hipstruct->chunksperblock : 1 + (hipstruct->totalentries / hipstruct->totalslices - 1) / a->chunksize;
819: nblocks = 1 + (nchunks - 1) / chunksperblock;
820: chunk_slice_map = hipstruct->chunk_slice_map;
821: if (blocky == 2) {
822: matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
823: } else if (blocky == 4) {
824: matmultadd_seqsell_tiled_kernel8<4><<<nblocks, dim3(WARP_SIZE, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
825: } else if (blocky == 8) {
826: matmultadd_seqsell_tiled_kernel8<8><<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
827: } else if (blocky == 16) {
828: matmultadd_seqsell_tiled_kernel8<16><<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
829: } else {
830: matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
831: }
832: } else {
833: PetscInt avgslicesize = sliceheight * a->avgslicewidth;
834: if (avgslicesize <= 432) {
835: if (sliceheight * a->maxslicewidth < 2048 && nrows > 100000) {
836: nblocks = 1 + (nrows - 1) / (2 * sliceheight); /* two slices per block */
837: matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
838: } else {
839: nblocks = 1 + (nrows - 1) / sliceheight;
840: matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(WARP_SIZE, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
841: }
842: } else if (avgslicesize <= 2400) {
843: nblocks = 1 + (nrows - 1) / sliceheight;
844: matmultadd_seqsell_tiled_kernel9<8><<<nblocks, dim3(WARP_SIZE, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
845: } else {
846: nblocks = 1 + (nrows - 1) / sliceheight;
847: matmultadd_seqsell_tiled_kernel9<16><<<nblocks, dim3(WARP_SIZE, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
848: }
849: }
850: break;
851: #endif
852: default:
853: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported kernel choice %" PetscInt_FMT " for MatMultAdd_SeqSELLHIP.", hipstruct->kernelchoice);
854: }
855: PetscCall(PetscLogGpuTimeEnd());
856: PetscCall(VecHIPRestoreArrayRead(xx, &x));
857: PetscCall(VecHIPRestoreArrayRead(yy, &y));
858: PetscCall(VecHIPRestoreArrayWrite(zz, &z));
859: PetscCall(PetscLogGpuFlops(2.0 * a->nz));
860: } else {
861: PetscCall(VecCopy(yy, zz));
862: }
863: PetscFunctionReturn(PETSC_SUCCESS);
864: }
866: static PetscErrorCode MatSetFromOptions_SeqSELLHIP(Mat A, PetscOptionItems *PetscOptionsObject)
867: {
868: Mat_SeqSELLHIP *hipstruct = (Mat_SeqSELLHIP *)A->spptr;
869: PetscInt kernel, blocky;
870: PetscBool flg;
872: PetscFunctionBegin;
873: PetscOptionsHeadBegin(PetscOptionsObject, "SeqSELLHIP options");
874: PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_hip_blocky", &blocky, &flg));
875: if (flg) {
876: PetscCheck(blocky == 2 || blocky == 4 || blocky == 8 || blocky == 16 || blocky == 32, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Unsupported blocky: %" PetscInt_FMT ", it should be in {2,4,8,16,32}", blocky);
877: hipstruct->blocky = blocky;
878: }
879: PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_hip_kernel", &kernel, &flg));
880: if (flg) {
881: PetscCheck(kernel >= 0 && kernel <= 9, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Wrong kernel choice: %" PetscInt_FMT ", it should be in [0,9]", kernel);
882: hipstruct->kernelchoice = kernel;
883: if (kernel == 8) { PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_hip_chunksperblock", &hipstruct->chunksperblock, &flg)); }
884: }
885: PetscOptionsHeadEnd();
886: PetscFunctionReturn(PETSC_SUCCESS);
887: }
889: PETSC_INTERN PetscErrorCode MatAssemblyEnd_SpMV_Preprocessing_Private(Mat A)
890: {
891: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
893: PetscFunctionBegin;
894: PetscCall(MatSeqSELLGetAvgSliceWidth(A, &a->avgslicewidth));
895: PetscCall(MatSeqSELLGetMaxSliceWidth(A, &a->maxslicewidth));
896: PetscCall(MatSeqSELLGetFillRatio(A, &a->fillratio));
897: PetscFunctionReturn(PETSC_SUCCESS);
898: }
900: static PetscErrorCode MatAssemblyEnd_SeqSELLHIP(Mat A, MatAssemblyType mode)
901: {
902: PetscFunctionBegin;
903: PetscCall(MatAssemblyEnd_SeqSELL(A, mode));
904: PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(A));
905: if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
906: if (A->factortype == MAT_FACTOR_NONE) { PetscCall(MatSeqSELLHIPCopyToGPU(A)); }
907: A->ops->mult = MatMult_SeqSELLHIP;
908: A->ops->multadd = MatMultAdd_SeqSELLHIP;
909: PetscFunctionReturn(PETSC_SUCCESS);
910: }
912: static PetscErrorCode MatZeroEntries_SeqSELLHIP(Mat A)
913: {
914: PetscBool both = PETSC_FALSE;
915: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
917: PetscFunctionBegin;
918: if (A->factortype == MAT_FACTOR_NONE) {
919: Mat_SeqSELLHIP *hipstruct = (Mat_SeqSELLHIP *)A->spptr;
920: if (hipstruct->val) {
921: both = PETSC_TRUE;
922: PetscCallHIP(hipMemset(hipstruct->val, 0, a->sliidx[a->totalslices] * sizeof(*hipstruct->val)));
923: }
924: }
925: PetscCall(PetscArrayzero(a->val, a->sliidx[a->totalslices]));
926: PetscCall(MatSeqSELLInvalidateDiagonal(A));
927: if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
928: else A->offloadmask = PETSC_OFFLOAD_CPU;
929: PetscFunctionReturn(PETSC_SUCCESS);
930: }
932: static PetscErrorCode MatDestroy_SeqSELLHIP(Mat A)
933: {
934: PetscFunctionBegin;
935: if (A->factortype == MAT_FACTOR_NONE && A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) PetscCall(MatSeqSELLHIP_Destroy((Mat_SeqSELLHIP **)&A->spptr));
936: PetscCall(MatDestroy_SeqSELL(A));
937: PetscFunctionReturn(PETSC_SUCCESS);
938: }
940: PETSC_INTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLHIP(Mat);
941: static PetscErrorCode MatDuplicate_SeqSELLHIP(Mat A, MatDuplicateOption cpvalues, Mat *B)
942: {
943: PetscFunctionBegin;
944: PetscCall(MatDuplicate_SeqSELL(A, cpvalues, B));
945: PetscCall(MatConvert_SeqSELL_SeqSELLHIP(*B));
946: PetscFunctionReturn(PETSC_SUCCESS);
947: }
949: PETSC_INTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLHIP(Mat B)
950: {
951: Mat_SeqSELLHIP *hipstruct;
953: PetscFunctionBegin;
954: PetscCall(PetscFree(B->defaultvectype));
955: PetscCall(PetscStrallocpy(VECHIP, &B->defaultvectype));
957: if (!B->spptr) {
958: if (B->factortype == MAT_FACTOR_NONE) {
959: PetscCall(PetscNew(&hipstruct));
960: B->spptr = hipstruct;
961: }
962: }
964: B->ops->assemblyend = MatAssemblyEnd_SeqSELLHIP;
965: B->ops->destroy = MatDestroy_SeqSELLHIP;
966: B->ops->setfromoptions = MatSetFromOptions_SeqSELLHIP;
967: B->ops->mult = MatMult_SeqSELLHIP;
968: B->ops->multadd = MatMultAdd_SeqSELLHIP;
969: B->ops->duplicate = MatDuplicate_SeqSELLHIP;
970: B->ops->zeroentries = MatZeroEntries_SeqSELLHIP;
972: /* No need to assemble SeqSELL, but need to do the preprocessing for SpMV */
973: PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(B));
975: PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELLHIP));
976: B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
977: PetscFunctionReturn(PETSC_SUCCESS);
978: }
980: /*MC
981: MATSEQSELLHIP - MATSELLHIP = "(seq)sellhip" - A matrix type to be used for sparse matrices on AMD GPUs.
983: Options Database Keys:
984: + -mat_type seqsellhip - sets the matrix type to "seqsellhip" during a call to `MatSetFromOptions()`
985: . -mat_sell_spmv_hip_kernel - selects an SpMV kernel for `MATSELLHIP`
986: - -mat_sell_spmv_hip_blocky - sets the y dimension of the block size of the SpMV kernels. These kernels use a 2D block with the x dimension equal to the warp size (normally 64 for AMD GPUs)
988: Level: beginner
990: .seealso: [](ch_matrices), `Mat`, `MATSELLHIP`
991: M*/
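/* Usage sketch (added; not part of the original source). It assumes a PETSc build configured with HIP and that n, x,
   and y are declared and set up elsewhere:

     Mat A;
     PetscCall(MatCreate(PETSC_COMM_SELF, &A));
     PetscCall(MatSetSizes(A, n, n, n, n));
     PetscCall(MatSetType(A, MATSEQSELLHIP));            // or run with -mat_type seqsellhip
     PetscCall(MatSeqSELLSetPreallocation(A, 5, NULL));  // rough per-row nonzero estimate
     // ... fill with MatSetValues(), then MatAssemblyBegin()/MatAssemblyEnd() ...
     PetscCall(MatMult(A, x, y));                        // dispatches to the HIP SpMV kernels above
     PetscCall(MatDestroy(&A));
*/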
993: PETSC_EXTERN PetscErrorCode MatCreate_SeqSELLHIP(Mat B)
994: {
995: PetscFunctionBegin;
996: PetscCall(MatCreate_SeqSELL(B));
997: PetscCall(MatConvert_SeqSELL_SeqSELLHIP(B));
998: PetscCall(MatSetFromOptions(B));
999: PetscFunctionReturn(PETSC_SUCCESS);
1000: }