Actual source code: sellcuda.cu
1: #include <cuda_runtime.h>
3: #include <petscdevice_cuda.h>
4: #include <petsc/private/cupmatomics.hpp>
5: #include <../src/mat/impls/sell/seq/sell.h>
7: #define SLICE_HEIGHT 16
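/* Device-side mirror of the host Mat_SeqSELL data: the column indices, values, and slice
   offsets live in GPU memory, together with the chunk-to-slice map used by the
   load-balanced kernels and the parameters that control SpMV kernel selection. */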
9: typedef struct {
10: PetscInt maxallocmat;
11: PetscInt totalentries;
12: PetscInt *colidx; /* column index array, device pointer */
13: MatScalar *val; /* value array, device pointer */
14: PetscInt totalslices;
15: PetscInt *sliidx; /* slice index array, device pointer */
16: PetscInt nonzerostate;
17: PetscInt kernelchoice;
18: PetscInt blocky;
19: PetscInt chunksperblock;
20: PetscInt totalchunks;
21: PetscInt *chunk_slice_map; /* starting slice for each chunk, device pointer */
22: } Mat_SeqSELLCUDA;
24: static PetscErrorCode MatSeqSELLCUDA_Destroy(Mat_SeqSELLCUDA **cudastruct)
25: {
26: PetscFunctionBegin;
27: if (*cudastruct) {
28: if ((*cudastruct)->colidx) { PetscCallCUDA(cudaFree((*cudastruct)->colidx)); }
29: if ((*cudastruct)->val) { PetscCallCUDA(cudaFree((*cudastruct)->val)); }
30: if ((*cudastruct)->sliidx) { PetscCallCUDA(cudaFree((*cudastruct)->sliidx)); }
31: if ((*cudastruct)->chunk_slice_map) { PetscCallCUDA(cudaFree((*cudastruct)->chunk_slice_map)); }
32: PetscCall(PetscFree(*cudastruct));
33: }
34: PetscFunctionReturn(PETSC_SUCCESS);
35: }
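/* Copy the SELL data from host to device. If the matrix is assembled and the nonzero
   pattern is unchanged, only the value array is transferred; otherwise the device arrays
   are (re)allocated and colidx, val, sliidx, and chunk_slice_map are all copied. */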
37: static PetscErrorCode MatSeqSELLCUDACopyToGPU(Mat A)
38: {
39: Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
40: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
42: PetscFunctionBegin;
43: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
44: PetscCall(PetscLogEventBegin(MAT_CUDACopyToGPU, A, 0, 0, 0));
45: if (A->assembled && A->nonzerostate == cudastruct->nonzerostate) {
46: /* copy values only */
47: PetscCallCUDA(cudaMemcpy(cudastruct->val, a->val, a->sliidx[a->totalslices] * sizeof(MatScalar), cudaMemcpyHostToDevice));
48: PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar))));
49: } else {
50: if (cudastruct->colidx) { PetscCallCUDA(cudaFree(cudastruct->colidx)); }
51: if (cudastruct->val) { PetscCallCUDA(cudaFree(cudastruct->val)); }
52: if (cudastruct->sliidx) { PetscCallCUDA(cudaFree(cudastruct->sliidx)); }
53: if (cudastruct->chunk_slice_map) { PetscCallCUDA(cudaFree(cudastruct->chunk_slice_map)); }
54: cudastruct->maxallocmat = a->maxallocmat;
55: cudastruct->totalentries = a->sliidx[a->totalslices];
56: cudastruct->totalslices = a->totalslices;
57: cudastruct->totalchunks = a->totalchunks;
58: PetscCallCUDA(cudaMalloc((void **)&cudastruct->colidx, a->maxallocmat * sizeof(*cudastruct->colidx)));
59: PetscCallCUDA(cudaMalloc((void **)&cudastruct->val, a->maxallocmat * sizeof(*cudastruct->val)));
60: /* copy only the entries actually used (a->sliidx[a->totalslices]); the device arrays are sized for maxallocmat, but the unused tail does not need to be transferred */
61: PetscCallCUDA(cudaMemcpy(cudastruct->colidx, a->colidx, a->sliidx[a->totalslices] * sizeof(*a->colidx), cudaMemcpyHostToDevice));
62: PetscCallCUDA(cudaMemcpy(cudastruct->val, a->val, a->sliidx[a->totalslices] * sizeof(*a->val), cudaMemcpyHostToDevice));
64: PetscCallCUDA(cudaMalloc((void **)&cudastruct->sliidx, (a->totalslices + 1) * sizeof(*cudastruct->sliidx)));
65: PetscCallCUDA(cudaMemcpy(cudastruct->sliidx, a->sliidx, (a->totalslices + 1) * sizeof(*a->sliidx), cudaMemcpyHostToDevice));
66: PetscCallCUDA(cudaMalloc((void **)&cudastruct->chunk_slice_map, a->totalchunks * sizeof(*cudastruct->chunk_slice_map)));
67: PetscCallCUDA(cudaMemcpy(cudastruct->chunk_slice_map, a->chunk_slice_map, a->totalchunks * sizeof(*a->chunk_slice_map), cudaMemcpyHostToDevice));
68: PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar) + sizeof(PetscInt)) + (a->totalslices + 1 + a->totalchunks) * sizeof(PetscInt)));
/* record which nonzero pattern the device copy now matches, so that later calls with an unchanged pattern can take the values-only fast path above */
cudastruct->nonzerostate = A->nonzerostate;
69: }
70: PetscCallCUDA(WaitForCUDA());
71: PetscCall(PetscLogEventEnd(MAT_CUDACopyToGPU, A, 0, 0, 0));
72: A->offloadmask = PETSC_OFFLOAD_BOTH;
73: }
74: PetscFunctionReturn(PETSC_SUCCESS);
75: }
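/* Storage convention used by all kernels below (sliced ELLPACK): rows are grouped into
   slices of `sliceheight` rows, each slice is padded to the width of its longest row,
   and entries within a slice are stored column-major, so entry j of row r in slice s
   lives at sliidx[s] + j*sliceheight + (r % sliceheight).

   Illustrative example (values chosen arbitrarily), sliceheight = 2:

     row 0: a00 a02        slice 0, width 2
     row 1: a10 a11
     row 2: a23            slice 1, width 1 (row 3 padded with an explicit zero)
     row 3: (empty)

     sliidx = {0, 4, 6}
     val    = {a00, a10, a02, a11, a23, 0}
     colidx = {0,   0,   2,   1,   3,   pad}

   Every kernel walks a row with stride `sliceheight` inside its slice. */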
77: static __global__ void matmult_seqsell_basic_kernel(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
78: {
79: PetscInt i, row, slice_id, row_in_slice;
80: MatScalar sum;
81: /* one thread per row. */
82: row = blockIdx.x * blockDim.x + threadIdx.x;
83: if (row < nrows) {
84: slice_id = row / sliceheight;
85: row_in_slice = row % sliceheight;
86: sum = 0.0;
87: for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += sliceheight) sum += aval[i] * x[acolidx[i]];
88: y[row] = sum;
89: }
90: }
92: static __global__ void matmultadd_seqsell_basic_kernel(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
93: {
94: PetscInt i, row, slice_id, row_in_slice;
95: MatScalar sum;
96: /* one thread per row. */
97: row = blockIdx.x * blockDim.x + threadIdx.x;
98: if (row < nrows) {
99: slice_id = row / sliceheight;
100: row_in_slice = row % sliceheight;
101: sum = 0.0;
102: for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += sliceheight) sum += aval[i] * x[acolidx[i]];
103: z[row] = y[row] + sum;
104: }
105: }
107: #if !defined(PETSC_USE_COMPLEX)
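/* The kernels below rely on __shfl_down_sync() reductions on PetscScalar and are
   therefore only compiled for real scalars.

   Reduction scheme of kernel 9: each block owns one slice and is launched as 32 x BLOCKY
   threads. Every thread accumulates a strided portion of the slice, warp shuffles fold
   the partial sums of lanes working on the same row onto lanes 0..sliceheight-1, the
   partial sums are transposed through shared memory, and a second shuffle reduction
   across the BLOCKY dimension produces one sum per row of the slice. */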
108: /* use 1 block per slice, suitable for large slice width */
109: template <int BLOCKY>
110: __global__ void matmult_seqsell_tiled_kernel9(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
111: {
112: __shared__ MatScalar shared[32][BLOCKY];
113: PetscInt i, row, slice_id = blockIdx.x;
114: int tid = threadIdx.x + threadIdx.y * 32;
115: /* transposed index */
116: int tidx = tid % BLOCKY;
117: int tidy = tid / BLOCKY;
118: PetscScalar t = 0.0;
120: row = slice_id * sliceheight + threadIdx.x % sliceheight;
121: if (row < nrows) {
122: for (i = sliidx[slice_id] + threadIdx.x + 32 * threadIdx.y; i < sliidx[slice_id + 1]; i += 32 * BLOCKY) t += aval[i] * x[acolidx[i]];
123: }
124: #pragma unroll
125: for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); }
126: /* transpose layout to reduce each row using warp shfl */
127: if (threadIdx.x < sliceheight) shared[threadIdx.x][threadIdx.y] = t;
128: __syncthreads();
129: if (tidy < sliceheight) t = shared[tidy][tidx];
130: #pragma unroll
131: for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY); }
132: if (tidx == 0 && tidy < sliceheight) { shared[0][tidy] = t; }
133: __syncthreads();
134: if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) { y[row] = shared[0][threadIdx.x]; }
135: }
137: /* use 1 block per slice, suitable for large slice width */
138: template <int BLOCKY>
139: __global__ void matmultadd_seqsell_tiled_kernel9(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
140: {
141: __shared__ MatScalar shared[32][BLOCKY];
142: PetscInt i, row, slice_id = blockIdx.x;
143: int tid = threadIdx.x + threadIdx.y * 32;
144: /* transposed index */
145: int tidx = tid % BLOCKY;
146: int tidy = tid / BLOCKY;
147: PetscScalar t = 0.0;
149: row = slice_id * sliceheight + threadIdx.x % sliceheight;
150: if (row < nrows) {
151: for (i = sliidx[slice_id] + threadIdx.x + 32 * threadIdx.y; i < sliidx[slice_id + 1]; i += 32 * BLOCKY) t += aval[i] * x[acolidx[i]];
152: }
153: #pragma unroll
154: for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); }
155: /* transpose layout to reduce each row using warp shfl */
156: if (threadIdx.x < sliceheight) shared[threadIdx.x][threadIdx.y] = t;
157: __syncthreads();
158: if (tidy < sliceheight) t = shared[tidy][tidx];
159: #pragma unroll
160: for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY); }
161: if (tidx == 0 && tidy < sliceheight) { shared[0][tidy] = t; }
162: __syncthreads();
163: if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) { z[row] = y[row] + shared[0][threadIdx.x]; }
164: }
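/* Segmented sum used by the load-balanced kernels when one chunk spans several slices:
   flag[] holds, per half-warp, the slice that half-warp is working on; partial sums of
   consecutive half-warps with the same flag are accumulated through shared memory, and
   the function returns true on the half-warps that start a new segment, which then hold
   the segment totals and perform the atomic update in the caller. */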
166: template <int BLOCKY>
167: __device__ __forceinline__ static bool segment_scan(PetscInt flag[], MatScalar shared[], PetscScalar *val)
168: {
169: bool head = true;
170: #pragma unroll
171: for (int i = 1; i < BLOCKY * 2; i <<= 1) {
172: int halfwarpid = threadIdx.y * 2 + threadIdx.x / 16;
173: shared[threadIdx.x + threadIdx.y * 32] = 0;
174: if (halfwarpid >= i && flag[halfwarpid - i] == flag[halfwarpid]) {
175: shared[threadIdx.x + threadIdx.y * 32] = *val;
176: if (i == 1) head = false;
177: }
178: __syncthreads();
179: if (halfwarpid < BLOCKY * 2 - i) *val += shared[threadIdx.x + threadIdx.y * 32 + i * 16];
180: __syncthreads();
181: }
182: return head;
183: }
185: /* load-balancing version. Chunksize is equal to the number of threads per block */
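/* The matrix entries are split into fixed-size chunks of 32*BLOCKY entries and each block
   processes `chunksperblock` consecutive chunks; chunk_slice_map[] records the first slice
   touched by each chunk. A chunk that stays within one slice is reduced with warp shuffles
   as in kernel 9; a chunk that crosses slice boundaries uses segment_scan() and atomic
   adds, which is why y is zeroed (or z preset to y) at the top of the kernel. */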
186: template <int BLOCKY>
187: __global__ void matmult_seqsell_tiled_kernel8(PetscInt nrows, PetscInt sliceheight, PetscInt chunksperblock, PetscInt totalchunks, const PetscInt *chunk_slice_map, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
188: {
189: __shared__ MatScalar shared[BLOCKY * 32];
190: PetscInt gid, row, start_slice, cid;
191: PetscScalar t = 0.0;
192: AtomicAdd<MatScalar> atomAdd;
193: /* zero out y */
194: for (int iter = 0; iter < 1 + (nrows - 1) / (gridDim.x * 32 * BLOCKY); iter++) {
195: gid = gridDim.x * 32 * BLOCKY * iter + blockIdx.x * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x;
196: if (gid < nrows) y[gid] = 0.0;
197: }
198: for (int iter = 0; iter < chunksperblock; iter++) {
199: cid = blockIdx.x * chunksperblock + iter; /* chunk id */
200: if (cid < totalchunks) {
201: start_slice = chunk_slice_map[cid]; /* starting slice at each iteration */
202: gid = cid * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x;
203: if ((cid + 1) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* this iteration covers more than one slice */
204: __shared__ PetscInt flag[BLOCKY * 2];
205: bool write;
206: PetscInt slice_id = start_slice, totalslices = PetscCeilIntMacro(nrows, sliceheight), totalentries = sliidx[totalslices];
207: /* find out the slice that this element belongs to */
208: while (gid < totalentries && gid >= sliidx[slice_id + 1]) slice_id++;
209: if (threadIdx.x % 16 == 0) flag[threadIdx.y * 2 + threadIdx.x / 16] = slice_id;
210: row = slice_id * sliceheight + threadIdx.x % sliceheight;
211: if (row < nrows && gid < totalentries) t = aval[gid] * x[acolidx[gid]];
212: __syncthreads();
213: write = segment_scan<BLOCKY>(flag, shared, &t);
214: if (row < nrows && gid < totalentries && write) atomAdd(y[row], t);
215: t = 0.0;
216: } else { /* this iteration covers only one slice */
217: row = start_slice * sliceheight + threadIdx.x % sliceheight;
218: if (row < nrows) t += aval[gid] * x[acolidx[gid]];
219: if (iter == chunksperblock - 1 || (cid + 2) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* last iteration or next iteration covers more than one slice */
220: int tid = threadIdx.x + threadIdx.y * 32, tidx = tid % BLOCKY, tidy = tid / BLOCKY;
221: /* reduction and write to output vector */
222: #pragma unroll
223: for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); }
224: /* transpose layout to reduce each row using warp shfl */
225: if (threadIdx.x < sliceheight) shared[threadIdx.x * BLOCKY + threadIdx.y] = t; /* shared[threadIdx.x][threadIdx.y] = t */
226: __syncthreads();
227: if (tidy < sliceheight) t = shared[tidy * BLOCKY + tidx]; /* shared[tidy][tidx] */
228: #pragma unroll
229: for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY); }
230: if (tidx == 0 && tidy < sliceheight) { shared[tidy] = t; /* shared[0][tidy] = t */ }
231: __syncthreads();
232: if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) atomAdd(y[row], shared[threadIdx.x]); /* shared[0][threadIdx.x] */
233: t = 0.0;
234: }
235: }
236: }
237: }
238: }
240: /* load-balancing version. Chunksize is equal to the number of threads per block */
241: template <int BLOCKY>
242: __global__ void matmultadd_seqsell_tiled_kernel8(PetscInt nrows, PetscInt sliceheight, PetscInt chunksperblock, PetscInt totalchunks, const PetscInt *chunk_slice_map, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
243: {
244: __shared__ MatScalar shared[BLOCKY * 32];
245: PetscInt gid, row, start_slice, cid;
246: PetscScalar t = 0.0;
247: AtomicAdd<MatScalar> atomAdd;
248: /* copy y to z */
249: for (int iter = 0; iter < 1 + (nrows - 1) / (gridDim.x * 32 * BLOCKY); iter++) {
250: gid = gridDim.x * 32 * BLOCKY * iter + blockIdx.x * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x;
251: if (gid < nrows) z[gid] = y[gid];
252: }
253: for (int iter = 0; iter < chunksperblock; iter++) {
254: cid = blockIdx.x * chunksperblock + iter; /* chunk id */
255: if (cid < totalchunks) {
256: start_slice = chunk_slice_map[cid]; /* starting slice at each iteration */
257: gid = cid * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x;
258: if ((cid + 1) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* this iteration covers more than one slice */
259: __shared__ PetscInt flag[BLOCKY * 2];
260: bool write;
261: PetscInt slice_id = start_slice, totalslices = PetscCeilIntMacro(nrows, sliceheight), totalentries = sliidx[totalslices];
262: /* find out the slice that this element belongs to */
263: while (gid < totalentries && gid >= sliidx[slice_id + 1]) slice_id++;
264: if (threadIdx.x % 16 == 0) flag[threadIdx.y * 2 + threadIdx.x / 16] = slice_id;
265: row = slice_id * sliceheight + threadIdx.x % sliceheight;
266: if (row < nrows && gid < totalentries) t = aval[gid] * x[acolidx[gid]];
267: __syncthreads();
268: write = segment_scan<BLOCKY>(flag, shared, &t);
269: if (row < nrows && gid < totalentries && write) atomAdd(z[row], t);
270: t = 0.0;
271: } else { /* this iteration covers only one slice */
272: row = start_slice * sliceheight + threadIdx.x % sliceheight;
273: if (row < nrows) t += aval[gid] * x[acolidx[gid]];
274: if (iter == chunksperblock - 1 || (cid + 2) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* last iteration or next iteration covers more than one slice */
275: int tid = threadIdx.x + threadIdx.y * 32, tidx = tid % BLOCKY, tidy = tid / BLOCKY;
276: /* reduction and write to output vector */
277: #pragma unroll
278: for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); }
279: /* transpose layout to reduce each row using warp shfl */
280: if (threadIdx.x < sliceheight) shared[threadIdx.x * BLOCKY + threadIdx.y] = t; /* shared[threadIdx.x][threadIdx.y] = t */
281: __syncthreads();
282: if (tidy < sliceheight) t = shared[tidy * BLOCKY + tidx]; /* shared[tidy][tidx] */
283: #pragma unroll
284: for (int offset = BLOCKY / 2; offset > 0; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY); }
285: if (tidx == 0 && tidy < sliceheight) { shared[tidy] = t; /* shared[0][tidy] = t */ }
286: __syncthreads();
287: if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) atomAdd(z[row], shared[threadIdx.x]); /* shared[0][threadIdx.x] */
288: t = 0.0;
289: }
290: }
291: }
292: }
293: }
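/* In the two kernels below, one warp covers one slice: lanes whose index is congruent
   modulo sliceheight work on the same row, and a shuffle reduction from offset 16 down to
   sliceheight folds their partial sums onto lanes 0..sliceheight-1, which write the
   result. blockDim.y warps, i.e. blockDim.y slices, are packed into each block. */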
295: /* use 1 warp per slice, suitable for small slice width */
296: static __global__ void matmult_seqsell_tiled_kernel7(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
297: {
298: PetscInt i, row, slice_id;
299: slice_id = blockIdx.x * blockDim.y + threadIdx.y;
300: row = slice_id * sliceheight + threadIdx.x % sliceheight;
301: double t = 0.0;
302: if (row < nrows) {
303: for (i = sliidx[slice_id] + threadIdx.x; i < sliidx[slice_id + 1]; i += 32) t += aval[i] * x[acolidx[i]];
304: }
305: #pragma unroll
306: for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); }
307: if (row < nrows && threadIdx.x < sliceheight) { y[row] = t; }
308: }
310: /* use 1 warp per slice, suitable for small slice width */
311: static __global__ void matmultadd_seqsell_tiled_kernel7(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
312: {
313: PetscInt i, row, slice_id;
314: slice_id = blockIdx.x * blockDim.y + threadIdx.y;
315: row = slice_id * sliceheight + threadIdx.x % sliceheight;
316: double t = 0.0;
317: if (row < nrows) {
318: for (i = sliidx[slice_id] + threadIdx.x; i < sliidx[slice_id + 1]; i += 32) t += aval[i] * x[acolidx[i]];
319: }
320: #pragma unroll
321: for (int offset = 16; offset >= sliceheight; offset /= 2) { t += __shfl_down_sync(0xffffffff, t, offset); }
322: if (row < nrows && threadIdx.x < sliceheight) { z[row] = y[row] + t; }
323: }
324: #endif
326: /*********** Kernels 2-6 are tied to a slice height of 16. They are kept only for performance comparison. **********/
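/* These kernels use a 2D block of blockDim.x rows by blockDim.y threads per row (512
   threads in total) and reduce the per-row partial sums with a shared-memory tree
   reduction; the number of reduction steps is what distinguishes kernels 2 through 6. */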
328: static __global__ void matmult_seqsell_tiled_kernel6(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
329: {
330: __shared__ MatScalar shared[512];
331: PetscInt i, row, slice_id, row_in_slice;
332: /* multiple threads per row. */
333: row = blockIdx.x * blockDim.x + threadIdx.x;
334: if (row < nrows) {
335: slice_id = row / SLICE_HEIGHT;
336: row_in_slice = row % SLICE_HEIGHT;
338: shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
339: for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
340: __syncthreads();
341: if (threadIdx.y < 16) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x]; }
342: __syncthreads();
343: if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
344: __syncthreads();
345: if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
346: __syncthreads();
347: if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
348: __syncthreads();
349: if (threadIdx.y < 1) {
350: shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
351: y[row] = shared[threadIdx.x];
352: }
353: }
354: }
356: static __global__ void matmult_seqsell_tiled_kernel5(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
357: {
358: __shared__ MatScalar shared[512];
359: PetscInt i, row, slice_id, row_in_slice;
360: /* multiple threads per row. */
361: row = blockIdx.x * blockDim.x + threadIdx.x;
362: if (row < nrows) {
363: slice_id = row / SLICE_HEIGHT;
364: row_in_slice = row % SLICE_HEIGHT;
366: shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
367: for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
368: __syncthreads();
369: if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
370: __syncthreads();
371: if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
372: __syncthreads();
373: if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
374: __syncthreads();
375: if (threadIdx.y < 1) {
376: shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
377: y[row] = shared[threadIdx.x];
378: }
379: }
380: }
382: static __global__ void matmult_seqsell_tiled_kernel4(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
383: {
384: __shared__ MatScalar shared[512];
385: PetscInt i, row, slice_id, row_in_slice;
386: /* multiple threads per row. */
387: row = blockIdx.x * blockDim.x + threadIdx.x;
388: if (row < nrows) {
389: slice_id = row / SLICE_HEIGHT;
390: row_in_slice = row % SLICE_HEIGHT;
392: shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
393: for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
394: __syncthreads();
395: if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
396: __syncthreads();
397: if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
398: __syncthreads();
399: if (threadIdx.y < 1) {
400: shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
401: y[row] = shared[threadIdx.x];
402: }
403: }
404: }
406: static __global__ void matmult_seqsell_tiled_kernel3(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
407: {
408: __shared__ MatScalar shared[512];
409: PetscInt i, row, slice_id, row_in_slice;
410: /* multiple threads per row. */
411: row = blockIdx.x * blockDim.x + threadIdx.x;
412: if (row < nrows) {
413: slice_id = row / SLICE_HEIGHT;
414: row_in_slice = row % SLICE_HEIGHT;
416: shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
417: for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
418: __syncthreads();
419: if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
420: __syncthreads();
421: if (threadIdx.y < 1) {
422: shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
423: y[row] = shared[threadIdx.x];
424: }
425: }
426: }
428: static __global__ void matmult_seqsell_tiled_kernel2(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
429: {
430: __shared__ MatScalar shared[512];
431: PetscInt i, row, slice_id, row_in_slice;
432: /* multiple threads per row. */
433: row = blockIdx.x * blockDim.x + threadIdx.x;
434: if (row < nrows) {
435: slice_id = row / SLICE_HEIGHT;
436: row_in_slice = row % SLICE_HEIGHT;
438: shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
439: for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
440: __syncthreads();
441: if (threadIdx.y < 1) {
442: shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
443: y[row] = shared[threadIdx.x];
444: }
445: }
446: }
448: static __global__ void matmultadd_seqsell_tiled_kernel6(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
449: {
450: __shared__ MatScalar shared[512];
451: PetscInt i, row, slice_id, row_in_slice;
452: /* multiple threads per row. */
453: row = blockIdx.x * blockDim.x + threadIdx.x;
454: if (row < nrows) {
455: slice_id = row / SLICE_HEIGHT;
456: row_in_slice = row % SLICE_HEIGHT;
458: shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
459: for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
460: __syncthreads();
461: if (threadIdx.y < 16) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x]; }
462: __syncthreads();
463: if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
464: __syncthreads();
465: if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
466: __syncthreads();
467: if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
468: __syncthreads();
469: if (threadIdx.y < 1) {
470: shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
471: z[row] = y[row] + shared[threadIdx.x];
472: }
473: }
474: }
476: static __global__ void matmultadd_seqsell_tiled_kernel5(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
477: {
478: __shared__ MatScalar shared[512];
479: PetscInt i, row, slice_id, row_in_slice;
480: /* multiple threads per row. */
481: row = blockIdx.x * blockDim.x + threadIdx.x;
482: if (row < nrows) {
483: slice_id = row / SLICE_HEIGHT;
484: row_in_slice = row % SLICE_HEIGHT;
486: shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
487: for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
488: __syncthreads();
489: if (threadIdx.y < 8) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x]; }
490: __syncthreads();
491: if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
492: __syncthreads();
493: if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
494: __syncthreads();
495: if (threadIdx.y < 1) {
496: shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
497: z[row] = y[row] + shared[threadIdx.x];
498: }
499: }
500: }
502: static __global__ void matmultadd_seqsell_tiled_kernel4(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
503: {
504: __shared__ MatScalar shared[512];
505: PetscInt i, row, slice_id, row_in_slice;
506: /* multiple threads per row. */
507: row = blockIdx.x * blockDim.x + threadIdx.x;
508: if (row < nrows) {
509: slice_id = row / SLICE_HEIGHT;
510: row_in_slice = row % SLICE_HEIGHT;
512: shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
513: for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
514: __syncthreads();
515: if (threadIdx.y < 4) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x]; }
516: __syncthreads();
517: if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
518: __syncthreads();
519: if (threadIdx.y < 1) {
520: shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
521: z[row] = y[row] + shared[threadIdx.x];
522: }
523: }
524: }
526: static __global__ void matmultadd_seqsell_tiled_kernel3(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
527: {
528: __shared__ MatScalar shared[512];
529: PetscInt i, row, slice_id, row_in_slice;
530: /* multiple threads per row. */
531: row = blockIdx.x * blockDim.x + threadIdx.x;
532: if (row < nrows) {
533: slice_id = row / SLICE_HEIGHT;
534: row_in_slice = row % SLICE_HEIGHT;
536: shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
537: for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
538: __syncthreads();
539: if (threadIdx.y < 2) { shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x]; }
540: __syncthreads();
541: if (threadIdx.y < 1) {
542: shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
543: z[row] = y[row] + shared[threadIdx.x];
544: }
545: }
546: }
548: static __global__ void matmultadd_seqsell_tiled_kernel2(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
549: {
550: __shared__ MatScalar shared[512];
551: PetscInt i, row, slice_id, row_in_slice;
552: /* multiple threads per row. */
553: row = blockIdx.x * blockDim.x + threadIdx.x;
554: if (row < nrows) {
555: slice_id = row / SLICE_HEIGHT;
556: row_in_slice = row % SLICE_HEIGHT;
558: shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
559: for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
560: __syncthreads();
561: if (threadIdx.y < 1) {
562: shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
563: z[row] = y[row] + shared[threadIdx.x];
564: }
565: }
566: }
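/* y = A*x. kernelchoice 0 (the default) picks a kernel from the slice-width statistics:
   the chunk-based kernel 8 for strongly irregular matrices, otherwise kernel 7 or kernel 9
   with a BLOCKY chosen from the average slice size. Choices 1-7 and 9 force a specific
   kernel (kernel 8 is reached here only through the heuristic): 1 is the one-thread-per-row
   basic kernel, 2-6 are the legacy slice-height-16 kernels, and 7-9 are the shuffle-based
   kernels, which are available for real scalars only. */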
568: static PetscErrorCode MatMult_SeqSELLCUDA(Mat A, Vec xx, Vec yy)
569: {
570: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
571: Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
572: PetscScalar *y;
573: const PetscScalar *x;
574: PetscInt nrows = A->rmap->n, sliceheight = a->sliceheight;
575: MatScalar *aval;
576: PetscInt *acolidx;
577: PetscInt *sliidx;
578: PetscInt nblocks, blocksize = 512; /* blocksize must be multiple of SLICE_HEIGHT*32 */
579: dim3 block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32);
580: #if !defined(PETSC_USE_COMPLEX)
581: PetscInt chunksperblock, nchunks, *chunk_slice_map;
582: PetscReal maxoveravg;
583: #endif
585: PetscFunctionBegin;
586: PetscCheck(32 % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires the slice height to be a divisor of 32, but the input matrix has a slice height of %" PetscInt_FMT, sliceheight);
587: PetscCheck(!(cudastruct->kernelchoice >= 2 && cudastruct->kernelchoice <= 6 && sliceheight != SLICE_HEIGHT), PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Kernel choices 2-6 require the slice height of the matrix to be 16, but the current slice height is %" PetscInt_FMT, sliceheight);
588: PetscCall(MatSeqSELLCUDACopyToGPU(A));
589: /* the device arrays in cudastruct may not be allocated until MatSeqSELLCUDACopyToGPU() has been called, so load the pointers only now */
590: aval = cudastruct->val;
591: acolidx = cudastruct->colidx;
592: sliidx = cudastruct->sliidx;
594: PetscCall(VecCUDAGetArrayRead(xx, &x));
595: PetscCall(VecCUDAGetArrayWrite(yy, &y));
596: PetscCall(PetscLogGpuTimeBegin());
598: switch (cudastruct->kernelchoice) {
599: #if !defined(PETSC_USE_COMPLEX)
600: case 9:
601: nblocks = 1 + (nrows - 1) / sliceheight;
602: if (cudastruct->blocky == 2) {
603: matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
604: } else if (cudastruct->blocky == 4) {
605: matmult_seqsell_tiled_kernel9<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
606: } else if (cudastruct->blocky == 8) {
607: matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
608: } else if (cudastruct->blocky == 16) {
609: matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
610: } else if (cudastruct->blocky == 32) {
611: matmult_seqsell_tiled_kernel9<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
612: } else {
613: matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
614: }
615: break;
616: case 7:
617: nblocks = 1 + (nrows - 1) / (2 * sliceheight);
618: if (cudastruct->blocky == 2) {
619: matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
620: } else if (cudastruct->blocky == 4) {
621: matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
622: } else if (cudastruct->blocky == 8) {
623: matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
624: } else if (cudastruct->blocky == 16) {
625: matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
626: } else if (cudastruct->blocky == 32) {
627: matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
628: } else {
629: matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
630: }
631: break;
632: #endif
633: case 6:
634: nblocks = 1 + (nrows - 1) / (blocksize / 32); /* 1 slice per block if blocksize=512 */
635: matmult_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, acolidx, aval, sliidx, x, y);
636: break;
637: case 5:
638: nblocks = 1 + (nrows - 1) / (blocksize / 16); /* 2 slices per block if blocksize=512*/
639: matmult_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, acolidx, aval, sliidx, x, y);
640: break;
641: case 4:
642: nblocks = 1 + (nrows - 1) / (blocksize / 8); /* 4 slices per block if blocksize=512 */
643: matmult_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, acolidx, aval, sliidx, x, y);
644: break;
645: case 3:
646: nblocks = 1 + (nrows - 1) / (blocksize / 4); /* 8 slices per block if blocksize=512 */
647: matmult_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, acolidx, aval, sliidx, x, y);
648: break;
649: case 2: /* 16 slices per block if blocksize=512 */
650: nblocks = 1 + (nrows - 1) / (blocksize / 2);
651: matmult_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, acolidx, aval, sliidx, x, y);
652: break;
653: case 1: /* 32 slices per block if blocksize=512 */
654: nblocks = 1 + (nrows - 1) / blocksize;
655: matmult_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
656: break;
657: #if !defined(PETSC_USE_COMPLEX)
658: case 0:
659: maxoveravg = a->maxslicewidth / a->avgslicewidth;
660: if (maxoveravg > 12.0 && maxoveravg / nrows > 0.001) { /* important threshold */
661: /* each block handles approximately one slice */
662: PetscInt blocky = a->chunksize / 32;
663: nchunks = cudastruct->totalchunks;
664: chunksperblock = cudastruct->chunksperblock ? cudastruct->chunksperblock : 1 + (cudastruct->totalentries / cudastruct->totalslices - 1) / a->chunksize;
665: nblocks = 1 + (nchunks - 1) / chunksperblock;
666: chunk_slice_map = cudastruct->chunk_slice_map;
667: if (blocky == 2) {
668: matmult_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
669: } else if (blocky == 4) {
670: matmult_seqsell_tiled_kernel8<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
671: } else if (blocky == 8) {
672: matmult_seqsell_tiled_kernel8<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
673: } else if (blocky == 16) {
674: matmult_seqsell_tiled_kernel8<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
675: } else if (blocky == 32) {
676: matmult_seqsell_tiled_kernel8<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
677: } else {
678: matmult_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
679: }
680: } else {
681: PetscInt avgslicesize = sliceheight * a->avgslicewidth;
682: if (avgslicesize <= 432) {
683: if (sliceheight * a->maxslicewidth < 2048 && nrows > 100000) {
684: nblocks = 1 + (nrows - 1) / (2 * sliceheight); /* two slices per block */
685: matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
686: } else {
687: nblocks = 1 + (nrows - 1) / sliceheight;
688: matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
689: }
690: } else if (avgslicesize <= 2400) {
691: nblocks = 1 + (nrows - 1) / sliceheight;
692: matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
693: } else {
694: nblocks = 1 + (nrows - 1) / sliceheight;
695: matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
696: }
697: }
698: break;
699: #endif
700: default:
701: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported kernel choice %" PetscInt_FMT " for MatMult_SeqSELLCUDA.", cudastruct->kernelchoice);
702: }
703: PetscCall(PetscLogGpuTimeEnd());
704: PetscCall(VecCUDARestoreArrayRead(xx, &x));
705: PetscCall(VecCUDARestoreArrayWrite(yy, &y));
706: PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
707: PetscFunctionReturn(PETSC_SUCCESS);
708: }
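/* z = A*x + y, with the same kernel selection as MatMult_SeqSELLCUDA (kernel 8 can also
   be requested directly here). When the matrix has no nonzeros the product reduces to a
   vector copy. */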
710: static PetscErrorCode MatMultAdd_SeqSELLCUDA(Mat A, Vec xx, Vec yy, Vec zz)
711: {
712: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
713: Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
714: PetscScalar *z;
715: const PetscScalar *y, *x;
716: PetscInt nrows = A->rmap->n, sliceheight = a->sliceheight;
717: MatScalar *aval;
718: PetscInt *acolidx;
719: PetscInt *sliidx;
720: #if !defined(PETSC_USE_COMPLEX)
721: PetscReal maxoveravg;
722: PetscInt chunksperblock, nchunks, *chunk_slice_map;
723: PetscInt blocky = cudastruct->blocky;
724: #endif
726: PetscFunctionBegin;
727: PetscCheck(32 % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires the slice height to be a divisor of 32, but the input matrix has a slice height of %" PetscInt_FMT, sliceheight);
728: PetscCheck(!(cudastruct->kernelchoice >= 2 && cudastruct->kernelchoice <= 6 && sliceheight != SLICE_HEIGHT), PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Kernel choices 2-6 require the slice height of the matrix to be 16, but the current slice height is %" PetscInt_FMT, sliceheight);
729: PetscCall(MatSeqSELLCUDACopyToGPU(A));
/* the device arrays in cudastruct may have been (re)allocated by MatSeqSELLCUDACopyToGPU(), so load the pointers only now */
aval = cudastruct->val;
acolidx = cudastruct->colidx;
sliidx = cudastruct->sliidx;
730: if (a->nz) {
731: PetscInt nblocks, blocksize = 512;
732: dim3 block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32);
733: PetscCall(VecCUDAGetArrayRead(xx, &x));
734: PetscCall(VecCUDAGetArrayRead(yy, &y));
735: PetscCall(VecCUDAGetArrayWrite(zz, &z));
736: PetscCall(PetscLogGpuTimeBegin());
738: switch (cudastruct->kernelchoice) {
739: #if !defined(PETSC_USE_COMPLEX)
740: case 9:
741: nblocks = 1 + (nrows - 1) / sliceheight;
742: if (blocky == 2) {
743: matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
744: } else if (blocky == 4) {
745: matmultadd_seqsell_tiled_kernel9<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
746: } else if (blocky == 8) {
747: matmultadd_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
748: } else if (blocky == 16) {
749: matmultadd_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
750: } else if (blocky == 32) {
751: matmultadd_seqsell_tiled_kernel9<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
752: } else {
753: matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
754: }
755: break;
756: case 8:
757: /* each block handles approximately one slice */
758: nchunks = cudastruct->totalchunks;
759: blocky = a->chunksize / 32;
760: chunksperblock = cudastruct->chunksperblock ? cudastruct->chunksperblock : 1 + (cudastruct->totalentries / cudastruct->totalslices - 1) / a->chunksize;
761: nblocks = 1 + (nchunks - 1) / chunksperblock;
762: chunk_slice_map = cudastruct->chunk_slice_map;
763: if (blocky == 2) {
764: matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
765: } else if (blocky == 4) {
766: matmultadd_seqsell_tiled_kernel8<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
767: } else if (blocky == 8) {
768: matmultadd_seqsell_tiled_kernel8<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
769: } else if (blocky == 16) {
770: matmultadd_seqsell_tiled_kernel8<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
771: } else if (blocky == 32) {
772: matmultadd_seqsell_tiled_kernel8<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
773: } else {
774: matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
775: }
776: break;
777: case 7:
778: nblocks = 1 + (nrows - 1) / (2 * sliceheight);
779: if (blocky == 2) {
780: matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
781: } else if (blocky == 4) {
782: matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
783: } else if (blocky == 8) {
784: matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
785: } else if (blocky == 16) {
786: matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
787: } else if (blocky == 32) {
788: matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
789: } else {
790: matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
791: }
792: break;
793: #endif
794: case 6:
795: nblocks = 1 + (nrows - 1) / (blocksize / 32);
796: matmultadd_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, acolidx, aval, sliidx, x, y, z);
797: break;
798: case 5:
799: nblocks = 1 + (nrows - 1) / (blocksize / 16);
800: matmultadd_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, acolidx, aval, sliidx, x, y, z);
801: break;
802: case 4:
803: nblocks = 1 + (nrows - 1) / (blocksize / 8);
804: matmultadd_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, acolidx, aval, sliidx, x, y, z);
805: break;
806: case 3:
807: nblocks = 1 + (nrows - 1) / (blocksize / 4);
808: matmultadd_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, acolidx, aval, sliidx, x, y, z);
809: break;
810: case 2:
811: nblocks = 1 + (nrows - 1) / (blocksize / 2);
812: matmultadd_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, acolidx, aval, sliidx, x, y, z);
813: break;
814: case 1:
815: nblocks = 1 + (nrows - 1) / blocksize;
816: matmultadd_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
817: break;
818: #if !defined(PETSC_USE_COMPLEX)
819: case 0:
820: maxoveravg = a->maxslicewidth / a->avgslicewidth;
821: if (maxoveravg > 12.0 && maxoveravg / nrows > 0.001) { /* important threshold */
822: /* each block handles approximately one slice */
823: nchunks = cudastruct->totalchunks;
824: blocky = a->chunksize / 32;
825: chunksperblock = cudastruct->chunksperblock ? cudastruct->chunksperblock : 1 + (cudastruct->totalentries / cudastruct->totalslices - 1) / a->chunksize;
826: nblocks = 1 + (nchunks - 1) / chunksperblock;
827: chunk_slice_map = cudastruct->chunk_slice_map;
828: if (blocky == 2) {
829: matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
830: } else if (blocky == 4) {
831: matmultadd_seqsell_tiled_kernel8<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
832: } else if (blocky == 8) {
833: matmultadd_seqsell_tiled_kernel8<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
834: } else if (blocky == 16) {
835: matmultadd_seqsell_tiled_kernel8<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
836: } else if (blocky == 32) {
837: matmultadd_seqsell_tiled_kernel8<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
838: } else {
839: matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
840: }
841: } else {
842: PetscInt avgslicesize = sliceheight * a->avgslicewidth;
843: if (avgslicesize <= 432) {
844: if (sliceheight * a->maxslicewidth < 2048 && nrows > 100000) {
845: nblocks = 1 + (nrows - 1) / (2 * sliceheight); /* two slices per block */
846: matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
847: } else {
848: nblocks = 1 + (nrows - 1) / sliceheight;
849: matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
850: }
851: } else if (avgslicesize <= 2400) {
852: nblocks = 1 + (nrows - 1) / sliceheight;
853: matmultadd_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
854: } else {
855: nblocks = 1 + (nrows - 1) / sliceheight;
856: matmultadd_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
857: }
858: }
859: break;
860: #endif
861: default:
862: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported kernel choice %" PetscInt_FMT " for MatMultAdd_SeqSELLCUDA.", cudastruct->kernelchoice);
863: }
864: PetscCall(PetscLogGpuTimeEnd());
865: PetscCall(VecCUDARestoreArrayRead(xx, &x));
866: PetscCall(VecCUDARestoreArrayRead(yy, &y));
867: PetscCall(VecCUDARestoreArrayWrite(zz, &z));
868: PetscCall(PetscLogGpuFlops(2.0 * a->nz));
869: } else {
870: PetscCall(VecCopy(yy, zz));
871: }
872: PetscFunctionReturn(PETSC_SUCCESS);
873: }
875: static PetscErrorCode MatSetFromOptions_SeqSELLCUDA(Mat A, PetscOptionItems *PetscOptionsObject)
876: {
877: Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
878: PetscInt kernel, blocky;
879: PetscBool flg;
881: PetscFunctionBegin;
882: PetscOptionsHeadBegin(PetscOptionsObject, "SeqSELLCUDA options");
883: PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_blocky", &blocky, &flg));
884: if (flg) {
885: PetscCheck(blocky == 2 || blocky == 4 || blocky == 8 || blocky == 16 || blocky == 32, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Unsupported blocky: %" PetscInt_FMT ", it should be in {2,4,8,16,32}", blocky);
886: cudastruct->blocky = blocky;
887: }
888: PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_kernel", &kernel, &flg));
889: if (flg) {
890: PetscCheck(kernel >= 0 && kernel <= 9, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Unsupported kernel choice: %" PetscInt_FMT ", it should be in [0,9]", kernel);
891: cudastruct->kernelchoice = kernel;
892: if (kernel == 8) { PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_chunksperblock", &cudastruct->chunksperblock, &flg)); }
893: }
894: PetscOptionsHeadEnd();
895: PetscFunctionReturn(PETSC_SUCCESS);
896: }
898: PETSC_INTERN PetscErrorCode MatAssemblyEnd_SpMV_Preprocessing_Private(Mat A)
899: {
900: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
902: PetscFunctionBegin;
903: PetscCall(MatSeqSELLGetAvgSliceWidth(A, &a->avgslicewidth));
904: PetscCall(MatSeqSELLGetMaxSliceWidth(A, &a->maxslicewidth));
905: PetscCall(MatSeqSELLGetFillRatio(A, &a->fillratio));
906: PetscFunctionReturn(PETSC_SUCCESS);
907: }
909: static PetscErrorCode MatAssemblyEnd_SeqSELLCUDA(Mat A, MatAssemblyType mode)
910: {
911: PetscFunctionBegin;
912: PetscCall(MatAssemblyEnd_SeqSELL(A, mode));
913: PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(A));
914: if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
915: if (A->factortype == MAT_FACTOR_NONE) { PetscCall(MatSeqSELLCUDACopyToGPU(A)); }
916: A->ops->mult = MatMult_SeqSELLCUDA;
917: A->ops->multadd = MatMultAdd_SeqSELLCUDA;
918: PetscFunctionReturn(PETSC_SUCCESS);
919: }
921: static PetscErrorCode MatZeroEntries_SeqSELLCUDA(Mat A)
922: {
923: PetscBool both = PETSC_FALSE;
924: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
926: PetscFunctionBegin;
927: if (A->factortype == MAT_FACTOR_NONE) {
928: Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
929: if (cudastruct->val) {
930: both = PETSC_TRUE;
931: PetscCallCUDA(cudaMemset(cudastruct->val, 0, a->sliidx[a->totalslices] * sizeof(*cudastruct->val)));
932: }
933: }
934: PetscCall(PetscArrayzero(a->val, a->sliidx[a->totalslices]));
935: PetscCall(MatSeqSELLInvalidateDiagonal(A));
936: if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
937: else A->offloadmask = PETSC_OFFLOAD_CPU;
938: PetscFunctionReturn(PETSC_SUCCESS);
939: }
941: static PetscErrorCode MatDestroy_SeqSELLCUDA(Mat A)
942: {
943: PetscFunctionBegin;
944: if (A->factortype == MAT_FACTOR_NONE && A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) PetscCall(MatSeqSELLCUDA_Destroy((Mat_SeqSELLCUDA **)&A->spptr));
945: PetscCall(MatDestroy_SeqSELL(A));
946: PetscFunctionReturn(PETSC_SUCCESS);
947: }
949: PETSC_INTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat);
950: static PetscErrorCode MatDuplicate_SeqSELLCUDA(Mat A, MatDuplicateOption cpvalues, Mat *B)
951: {
952: PetscFunctionBegin;
953: PetscCall(MatDuplicate_SeqSELL(A, cpvalues, B));
954: PetscCall(MatConvert_SeqSELL_SeqSELLCUDA(*B));
955: PetscFunctionReturn(PETSC_SUCCESS);
956: }
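/* Attach the CUDA backend to a SeqSELL matrix: switch the default vector type to CUDA,
   allocate the (still empty) device structure, and install the SELLCUDA operations. The
   actual host-to-device copy happens lazily in MatSeqSELLCUDACopyToGPU(). */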
958: PETSC_INTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat B)
959: {
960: Mat_SeqSELLCUDA *cudastruct;
962: PetscFunctionBegin;
963: PetscCall(PetscFree(B->defaultvectype));
964: PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));
966: if (!B->spptr) {
967: if (B->factortype == MAT_FACTOR_NONE) {
968: PetscCall(PetscNew(&cudastruct));
969: B->spptr = cudastruct;
970: }
971: }
973: B->ops->assemblyend = MatAssemblyEnd_SeqSELLCUDA;
974: B->ops->destroy = MatDestroy_SeqSELLCUDA;
975: B->ops->setfromoptions = MatSetFromOptions_SeqSELLCUDA;
976: B->ops->mult = MatMult_SeqSELLCUDA;
977: B->ops->multadd = MatMultAdd_SeqSELLCUDA;
978: B->ops->duplicate = MatDuplicate_SeqSELLCUDA;
979: B->ops->zeroentries = MatZeroEntries_SeqSELLCUDA;
981: /* The SeqSELL data itself does not need to be reassembled here, but the slice statistics used for SpMV kernel selection must be computed */
982: PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(B));
984: PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELLCUDA));
985: B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
986: PetscFunctionReturn(PETSC_SUCCESS);
987: }
989: /*MC
990: MATSEQSELLCUDA - MATSELLCUDA = "(seq)sellcuda" - A matrix type for sparse matrices stored in the sliced ELLPACK (SELL) format on NVIDIA GPUs.
992: Options Database Keys:
993: + -mat_type seqsellcuda - sets the matrix type to "seqsellcuda" during a call to `MatSetFromOptions()`
994: . -mat_sell_spmv_cuda_kernel - selects the SpMV kernel (0-9) for MATSELLCUDA; 0, the default, chooses a kernel from the matrix's slice statistics, and kernel 8 additionally accepts -mat_sell_spmv_cuda_chunksperblock
995: - -mat_sell_spmv_cuda_blocky - sets the y dimension of the 2D thread block used by the SpMV kernels; the x dimension is fixed at 32, so blocky must be one of {2,4,8,16,32}
997: Level: beginner
999: .seealso: [](ch_matrices), `Mat`, `MATSELLCUDA`
1000: M*/
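/* Illustrative usage sketch (not part of the PETSc documentation above); the type and
   option names are the ones registered in this file, while the application code and the
   size `n` are assumptions:

     Mat A;
     PetscCall(MatCreate(PETSC_COMM_SELF, &A));
     PetscCall(MatSetSizes(A, n, n, n, n));
     PetscCall(MatSetType(A, MATSEQSELLCUDA));
     PetscCall(MatSetFromOptions(A));
     // ... MatSetValues(), MatAssemblyBegin/End(), then MatMult() runs on the GPU

   or, equivalently, keep the default type in the code and select everything at run time:

     ./app -mat_type seqsellcuda -mat_sell_spmv_cuda_kernel 0 -mat_sell_spmv_cuda_blocky 8
*/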
1002: PETSC_EXTERN PetscErrorCode MatCreate_SeqSELLCUDA(Mat B)
1003: {
1004: PetscFunctionBegin;
1005: PetscCall(MatCreate_SeqSELL(B));
1006: PetscCall(MatConvert_SeqSELL_SeqSELLCUDA(B));
1007: PetscCall(MatSetFromOptions(B));
1008: PetscFunctionReturn(PETSC_SUCCESS);
1009: }