Actual source code: sellcuda.cu
1: #include <petscdevice_cuda.h>
2: #include <petsc/private/cupmatomics.hpp>
3: #include <../src/mat/impls/sell/seq/sell.h>
5: #define SLICE_HEIGHT 16
7: typedef struct {
8: PetscInt maxallocmat;
9: PetscInt totalentries;
10: PetscInt *colidx; /* column index array, device pointer */
11: MatScalar *val; /* value array, device pointer */
12: PetscInt totalslices;
13: PetscInt *sliidx; /* slice index array, device pointer */
14: PetscInt nonzerostate;
15: PetscInt kernelchoice;
16: PetscInt blocky;
17: PetscInt chunksperblock;
18: PetscInt totalchunks;
19: PetscInt *chunk_slice_map; /* starting slice for each chunk, device pointer */
20: } Mat_SeqSELLCUDA;
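/*
   For illustration only (not part of the library code): with sliceheight = 2, a 4x4 matrix whose rows hold the
   nonzeros {a,b}, {c,d,e}, {f} and {g} is stored as two slices. Slice 0 (rows 0-1) is padded to the width of its
   longest row (3), slice 1 (rows 2-3) has width 1, and entries within a slice are interleaved across its rows:

     sliidx = {0, 6, 8}                 offset of the first entry of each slice; sliidx[totalslices] is the total number of stored entries
     val    = {a, c, b, d, 0, e, f, g}  slice 0 interleaves rows 0 and 1 (one padding entry), followed by slice 1
     colidx holds the column indices in the same interleaved order; padding entries keep a zero value so they do not affect y = A*x

   chunk_slice_map[] records, for every fixed-size chunk of entries, the first slice that chunk touches; it is
   used only by the load-balancing kernel (kernel 8) below.
*/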
22: static PetscErrorCode MatSeqSELLCUDA_Destroy(Mat_SeqSELLCUDA **cudastruct)
23: {
24: PetscFunctionBegin;
25: if (*cudastruct) {
26: if ((*cudastruct)->colidx) PetscCallCUDA(cudaFree((*cudastruct)->colidx));
27: if ((*cudastruct)->val) PetscCallCUDA(cudaFree((*cudastruct)->val));
28: if ((*cudastruct)->sliidx) PetscCallCUDA(cudaFree((*cudastruct)->sliidx));
29: if ((*cudastruct)->chunk_slice_map) PetscCallCUDA(cudaFree((*cudastruct)->chunk_slice_map));
30: PetscCall(PetscFree(*cudastruct));
31: }
32: PetscFunctionReturn(PETSC_SUCCESS);
33: }
35: static PetscErrorCode MatSeqSELLCUDACopyToGPU(Mat A)
36: {
37: Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
38: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
40: PetscFunctionBegin;
41: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
42: PetscCall(PetscLogEventBegin(MAT_CUDACopyToGPU, A, 0, 0, 0));
43: if (A->assembled && A->nonzerostate == cudastruct->nonzerostate) {
44: /* copy values only */
45: PetscCallCUDA(cudaMemcpy(cudastruct->val, a->val, a->sliidx[a->totalslices] * sizeof(MatScalar), cudaMemcpyHostToDevice));
46: PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar))));
47: } else {
48: if (cudastruct->colidx) PetscCallCUDA(cudaFree(cudastruct->colidx));
49: if (cudastruct->val) PetscCallCUDA(cudaFree(cudastruct->val));
50: if (cudastruct->sliidx) PetscCallCUDA(cudaFree(cudastruct->sliidx));
51: if (cudastruct->chunk_slice_map) PetscCallCUDA(cudaFree(cudastruct->chunk_slice_map));
52: cudastruct->maxallocmat = a->maxallocmat;
53: cudastruct->totalentries = a->sliidx[a->totalslices];
54: cudastruct->totalslices = a->totalslices;
55: cudastruct->totalchunks = a->totalchunks;
56: PetscCallCUDA(cudaMalloc((void **)&cudastruct->colidx, a->maxallocmat * sizeof(*cudastruct->colidx)));
57: PetscCallCUDA(cudaMalloc((void **)&cudastruct->val, a->maxallocmat * sizeof(*cudastruct->val)));
58: /* the device arrays are allocated with maxallocmat entries to match the host allocation, but only the sliidx[totalslices] entries in use are copied */
59: PetscCallCUDA(cudaMemcpy(cudastruct->colidx, a->colidx, a->sliidx[a->totalslices] * sizeof(*a->colidx), cudaMemcpyHostToDevice));
60: PetscCallCUDA(cudaMemcpy(cudastruct->val, a->val, a->sliidx[a->totalslices] * sizeof(*a->val), cudaMemcpyHostToDevice));
62: PetscCallCUDA(cudaMalloc((void **)&cudastruct->sliidx, (a->totalslices + 1) * sizeof(*cudastruct->sliidx)));
63: PetscCallCUDA(cudaMemcpy(cudastruct->sliidx, a->sliidx, (a->totalslices + 1) * sizeof(*a->sliidx), cudaMemcpyHostToDevice));
64: PetscCallCUDA(cudaMalloc((void **)&cudastruct->chunk_slice_map, a->totalchunks * sizeof(*cudastruct->chunk_slice_map)));
65: PetscCallCUDA(cudaMemcpy(cudastruct->chunk_slice_map, a->chunk_slice_map, a->totalchunks * sizeof(*a->chunk_slice_map), cudaMemcpyHostToDevice));
66: PetscCall(PetscLogCpuToGpu(a->sliidx[a->totalslices] * (sizeof(MatScalar) + sizeof(PetscInt)) + (a->totalslices + 1 + a->totalchunks) * sizeof(PetscInt)));
67: }
68: PetscCallCUDA(WaitForCUDA());
69: PetscCall(PetscLogEventEnd(MAT_CUDACopyToGPU, A, 0, 0, 0));
70: A->offloadmask = PETSC_OFFLOAD_BOTH;
71: }
72: PetscFunctionReturn(PETSC_SUCCESS);
73: }
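/*
   The two "basic" kernels below use one thread per row: thread "row" belongs to slice row/sliceheight and is
   row row%sliceheight within that slice, so its entries sit at sliidx[slice_id] + row_in_slice, advancing by
   sliceheight until the start of the next slice. No shared memory or synchronization is required.
*/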
75: static __global__ void matmult_seqsell_basic_kernel(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
76: {
77: PetscInt i, row, slice_id, row_in_slice;
78: MatScalar sum;
79: /* one thread per row. */
80: row = blockIdx.x * blockDim.x + threadIdx.x;
81: if (row < nrows) {
82: slice_id = row / sliceheight;
83: row_in_slice = row % sliceheight;
84: sum = 0.0;
85: for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += sliceheight) sum += aval[i] * x[acolidx[i]];
86: y[row] = sum;
87: }
88: }
90: static __global__ void matmultadd_seqsell_basic_kernel(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
91: {
92: PetscInt i, row, slice_id, row_in_slice;
93: MatScalar sum;
94: /* one thread per row. */
95: row = blockIdx.x * blockDim.x + threadIdx.x;
96: if (row < nrows) {
97: slice_id = row / sliceheight;
98: row_in_slice = row % sliceheight;
99: sum = 0.0;
100: for (i = sliidx[slice_id] + row_in_slice; i < sliidx[slice_id + 1]; i += sliceheight) sum += aval[i] * x[acolidx[i]];
101: z[row] = y[row] + sum;
102: }
103: }
105: #if !defined(PETSC_USE_COMPLEX)
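/*
   Kernels 7, 8 and 9 below support any slice height that divides 32. They reduce the per-thread partial sums
   with warp shuffles (__shfl_down_sync), and kernel 8 additionally accumulates into the output vector with
   atomic adds; neither operation handles complex scalars directly, hence this block is compiled only for real
   scalars.
*/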
106: /* use 1 block per slice, suitable for large slice width */
107: template <int BLOCKY>
108: __global__ void matmult_seqsell_tiled_kernel9(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
109: {
110: __shared__ MatScalar shared[32][BLOCKY];
111: PetscInt i, row, slice_id = blockIdx.x;
112: int tid = threadIdx.x + threadIdx.y * 32;
113: /* transposed index */
114: int tidx = tid % BLOCKY;
115: int tidy = tid / BLOCKY;
116: PetscScalar t = 0.0;
118: row = slice_id * sliceheight + threadIdx.x % sliceheight;
119: if (row < nrows) {
120: for (i = sliidx[slice_id] + threadIdx.x + 32 * threadIdx.y; i < sliidx[slice_id + 1]; i += 32 * BLOCKY) t += aval[i] * x[acolidx[i]];
121: }
122: #pragma unroll
123: for (int offset = 16; offset >= sliceheight; offset /= 2) t += __shfl_down_sync(0xffffffff, t, offset);
124: /* transpose layout to reduce each row using warp shfl */
125: if (threadIdx.x < sliceheight) shared[threadIdx.x][threadIdx.y] = t;
126: __syncthreads();
127: if (tidy < sliceheight) t = shared[tidy][tidx];
128: #pragma unroll
129: for (int offset = BLOCKY / 2; offset > 0; offset /= 2) t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY);
130: if (tidx == 0 && tidy < sliceheight) shared[0][tidy] = t;
131: __syncthreads();
132: if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) y[row] = shared[0][threadIdx.x];
133: }
135: /* use 1 block per slice, suitable for large slice width */
136: template <int BLOCKY>
137: __global__ void matmultadd_seqsell_tiled_kernel9(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
138: {
139: __shared__ MatScalar shared[32][BLOCKY];
140: PetscInt i, row, slice_id = blockIdx.x;
141: int tid = threadIdx.x + threadIdx.y * 32;
142: /* transposed index */
143: int tidx = tid % BLOCKY;
144: int tidy = tid / BLOCKY;
145: PetscScalar t = 0.0;
147: row = slice_id * sliceheight + threadIdx.x % sliceheight;
148: if (row < nrows) {
149: for (i = sliidx[slice_id] + threadIdx.x + 32 * threadIdx.y; i < sliidx[slice_id + 1]; i += 32 * BLOCKY) t += aval[i] * x[acolidx[i]];
150: }
151: #pragma unroll
152: for (int offset = 16; offset >= sliceheight; offset /= 2) t += __shfl_down_sync(0xffffffff, t, offset);
153: /* transpose layout to reduce each row using warp shfl */
154: if (threadIdx.x < sliceheight) shared[threadIdx.x][threadIdx.y] = t;
155: __syncthreads();
156: if (tidy < sliceheight) t = shared[tidy][tidx];
157: #pragma unroll
158: for (int offset = BLOCKY / 2; offset > 0; offset /= 2) t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY);
159: if (tidx == 0 && tidy < sliceheight) shared[0][tidy] = t;
160: __syncthreads();
161: if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) z[row] = y[row] + shared[0][threadIdx.x];
162: }
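/*
   segment_scan(): segmented sum across the 2*BLOCKY half-warps of a thread block. flag[h] holds the slice id
   currently processed by half-warp h; partial sums are accumulated only across consecutive half-warps that
   share the same slice. Returns true for threads in the leading half-warp of each such segment, which end up
   holding the segment totals and are the ones that write them out.
*/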
164: template <int BLOCKY>
165: __device__ __forceinline__ static bool segment_scan(PetscInt flag[], MatScalar shared[], PetscScalar *val)
166: {
167: bool head = true;
168: #pragma unroll
169: for (int i = 1; i < BLOCKY * 2; i <<= 1) {
170: int halfwarpid = threadIdx.y * 2 + threadIdx.x / 16;
171: shared[threadIdx.x + threadIdx.y * 32] = 0;
172: if (halfwarpid >= i && flag[halfwarpid - i] == flag[halfwarpid]) {
173: shared[threadIdx.x + threadIdx.y * 32] = *val;
174: if (i == 1) head = false;
175: }
176: __syncthreads();
177: if (halfwarpid < BLOCKY * 2 - i) *val += shared[threadIdx.x + threadIdx.y * 32 + i * 16];
178: __syncthreads();
179: }
180: return head;
181: }
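/*
   Kernel 8 distributes the nonzeros, rather than the rows, evenly over the thread blocks: the entries are cut
   into chunks of 32*BLOCKY values, chunk_slice_map[cid] records the first slice touched by chunk cid, and each
   block processes chunksperblock consecutive chunks. A chunk that stays within one slice is reduced with the
   same warp-shuffle scheme as kernel 9; a chunk that spans several slices falls back to segment_scan() above.
   Because several blocks may contribute to the same rows, the output vector is initialized first and then
   updated with atomic adds.
*/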
183: /* load-balancing version: the chunk size equals the number of threads per block */
184: template <int BLOCKY>
185: __global__ void matmult_seqsell_tiled_kernel8(PetscInt nrows, PetscInt sliceheight, PetscInt chunksperblock, PetscInt totalchunks, const PetscInt *chunk_slice_map, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
186: {
187: __shared__ MatScalar shared[BLOCKY * 32];
188: PetscInt gid, row, start_slice, cid;
189: PetscScalar t = 0.0;
190: AtomicAdd<MatScalar> atomAdd;
191: /* zero out y */
192: for (int iter = 0; iter < 1 + (nrows - 1) / (gridDim.x * 32 * BLOCKY); iter++) {
193: gid = gridDim.x * 32 * BLOCKY * iter + blockIdx.x * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x;
194: if (gid < nrows) y[gid] = 0.0;
195: }
196: for (int iter = 0; iter < chunksperblock; iter++) {
197: cid = blockIdx.x * chunksperblock + iter; /* chunk id */
198: if (cid < totalchunks) {
199: start_slice = chunk_slice_map[cid]; /* first slice touched by this chunk */
200: gid = cid * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x;
201: if ((cid + 1) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* this iteration covers more than one slice */
202: __shared__ PetscInt flag[BLOCKY * 2];
203: bool write;
204: PetscInt slice_id = start_slice, totalslices = PetscCeilIntMacro(nrows, sliceheight), totalentries = sliidx[totalslices];
205: /* find out the slice that this element belongs to */
206: while (gid < totalentries && gid >= sliidx[slice_id + 1]) slice_id++;
207: if (threadIdx.x % 16 == 0) flag[threadIdx.y * 2 + threadIdx.x / 16] = slice_id;
208: row = slice_id * sliceheight + threadIdx.x % sliceheight;
209: if (row < nrows && gid < totalentries) t = aval[gid] * x[acolidx[gid]];
210: __syncthreads();
211: write = segment_scan<BLOCKY>(flag, shared, &t);
212: if (row < nrows && gid < totalentries && write) atomAdd(y[row], t);
213: t = 0.0;
214: } else { /* this iteration covers only one slice */
215: row = start_slice * sliceheight + threadIdx.x % sliceheight;
216: if (row < nrows) t += aval[gid] * x[acolidx[gid]];
217: if (iter == chunksperblock - 1 || (cid + 2) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* last iteration or next iteration covers more than one slice */
218: int tid = threadIdx.x + threadIdx.y * 32, tidx = tid % BLOCKY, tidy = tid / BLOCKY;
219: /* reduction and write to output vector */
220: #pragma unroll
221: for (int offset = 16; offset >= sliceheight; offset /= 2) t += __shfl_down_sync(0xffffffff, t, offset);
222: /* transpose layout to reduce each row using warp shfl */
223: if (threadIdx.x < sliceheight) shared[threadIdx.x * BLOCKY + threadIdx.y] = t; /* shared[threadIdx.x][threadIdx.y] = t */
224: __syncthreads();
225: if (tidy < sliceheight) t = shared[tidy * BLOCKY + tidx]; /* shared[tidy][tidx] */
226: #pragma unroll
227: for (int offset = BLOCKY / 2; offset > 0; offset /= 2) t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY);
228: if (tidx == 0 && tidy < sliceheight) shared[tidy] = t; /* shared[0][tidy] = t */
229: __syncthreads();
230: if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) atomAdd(y[row], shared[threadIdx.x]); /* shared[0][threadIdx.x] */
231: t = 0.0;
232: }
233: }
234: }
235: }
236: }
238: /* load-balancing version: the chunk size equals the number of threads per block */
239: template <int BLOCKY>
240: __global__ void matmultadd_seqsell_tiled_kernel8(PetscInt nrows, PetscInt sliceheight, PetscInt chunksperblock, PetscInt totalchunks, const PetscInt *chunk_slice_map, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
241: {
242: __shared__ MatScalar shared[BLOCKY * 32];
243: PetscInt gid, row, start_slice, cid;
244: PetscScalar t = 0.0;
245: AtomicAdd<MatScalar> atomAdd;
246: /* copy y to z */
247: for (int iter = 0; iter < 1 + (nrows - 1) / (gridDim.x * 32 * BLOCKY); iter++) {
248: gid = gridDim.x * 32 * BLOCKY * iter + blockIdx.x * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x;
249: if (gid < nrows) z[gid] = y[gid];
250: }
251: for (int iter = 0; iter < chunksperblock; iter++) {
252: cid = blockIdx.x * chunksperblock + iter; /* chunk id */
253: if (cid < totalchunks) {
254: start_slice = chunk_slice_map[cid]; /* first slice touched by this chunk */
255: gid = cid * BLOCKY * 32 + threadIdx.y * 32 + threadIdx.x;
256: if ((cid + 1) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* this iteration covers more than one slice */
257: __shared__ PetscInt flag[BLOCKY * 2];
258: bool write;
259: PetscInt slice_id = start_slice, totalslices = PetscCeilIntMacro(nrows, sliceheight), totalentries = sliidx[totalslices];
260: /* find out the slice that this element belongs to */
261: while (gid < totalentries && gid >= sliidx[slice_id + 1]) slice_id++;
262: if (threadIdx.x % 16 == 0) flag[threadIdx.y * 2 + threadIdx.x / 16] = slice_id;
263: row = slice_id * sliceheight + threadIdx.x % sliceheight;
264: if (row < nrows && gid < totalentries) t = aval[gid] * x[acolidx[gid]];
265: __syncthreads();
266: write = segment_scan<BLOCKY>(flag, shared, &t);
267: if (row < nrows && gid < totalentries && write) atomAdd(z[row], t);
268: t = 0.0;
269: } else { /* this iteration covers only one slice */
270: row = start_slice * sliceheight + threadIdx.x % sliceheight;
271: if (row < nrows) t += aval[gid] * x[acolidx[gid]];
272: if (iter == chunksperblock - 1 || (cid + 2) * BLOCKY * 32 > sliidx[start_slice + 1]) { /* last iteration or next iteration covers more than one slice */
273: int tid = threadIdx.x + threadIdx.y * 32, tidx = tid % BLOCKY, tidy = tid / BLOCKY;
274: /* reduction and write to output vector */
275: #pragma unroll
276: for (int offset = 16; offset >= sliceheight; offset /= 2) t += __shfl_down_sync(0xffffffff, t, offset);
277: /* transpose layout to reduce each row using warp shfl */
278: if (threadIdx.x < sliceheight) shared[threadIdx.x * BLOCKY + threadIdx.y] = t; /* shared[threadIdx.x][threadIdx.y] = t */
279: __syncthreads();
280: if (tidy < sliceheight) t = shared[tidy * BLOCKY + tidx]; /* shared[tidy][tidx] */
281: #pragma unroll
282: for (int offset = BLOCKY / 2; offset > 0; offset /= 2) t += __shfl_down_sync(0xffffffff, t, offset, BLOCKY);
283: if (tidx == 0 && tidy < sliceheight) shared[tidy] = t; /* shared[0][tidy] = t */
284: __syncthreads();
285: if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) atomAdd(z[row], shared[threadIdx.x]); /* shared[0][threadIdx.x] */
286: t = 0.0;
287: }
288: }
289: }
290: }
291: }
293: /* use 1 warp per slice, suitable for small slice width */
294: static __global__ void matmult_seqsell_tiled_kernel7(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
295: {
296: PetscInt i, row, slice_id;
297: slice_id = blockIdx.x * blockDim.y + threadIdx.y;
298: row = slice_id * sliceheight + threadIdx.x % sliceheight;
299: PetscScalar t = 0.0;
300: if (row < nrows) {
301: for (i = sliidx[slice_id] + threadIdx.x; i < sliidx[slice_id + 1]; i += 32) t += aval[i] * x[acolidx[i]];
302: }
303: #pragma unroll
304: for (int offset = 16; offset >= sliceheight; offset /= 2) t += __shfl_down_sync(0xffffffff, t, offset);
305: if (row < nrows && threadIdx.x < sliceheight) y[row] = t;
306: }
308: /* use 1 warp per slice, suitable for small slice width */
309: static __global__ void matmultadd_seqsell_tiled_kernel7(PetscInt nrows, PetscInt sliceheight, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
310: {
311: PetscInt i, row, slice_id;
312: slice_id = blockIdx.x * blockDim.y + threadIdx.y;
313: row = slice_id * sliceheight + threadIdx.x % sliceheight;
314: PetscScalar t = 0.0;
315: if (row < nrows) {
316: for (i = sliidx[slice_id] + threadIdx.x; i < sliidx[slice_id + 1]; i += 32) t += aval[i] * x[acolidx[i]];
317: }
318: #pragma unroll
319: for (int offset = 16; offset >= sliceheight; offset /= 2) t += __shfl_down_sync(0xffffffff, t, offset);
320: if (row < nrows && threadIdx.x < sliceheight) z[row] = y[row] + t;
321: }
322: #endif
324: /*********** Kernels 2-6 are tied to slice height 16. They are kept only for performance comparison **********/
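/*
   Kernels 2-6 assign blockDim.y threads to each row of a slice (2, 4, 8, 16 and 32 threads for kernels 2-6,
   respectively), accumulate the per-thread partial sums in a 512-entry shared-memory buffer and combine them
   with a log2(blockDim.y)-step tree reduction. The launch configurations in MatMult_SeqSELLCUDA and
   MatMultAdd_SeqSELLCUDA keep blockDim.x * blockDim.y = 512 to match that buffer, and the slice height is
   hard-wired to SLICE_HEIGHT = 16.
*/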
326: static __global__ void matmult_seqsell_tiled_kernel6(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
327: {
328: __shared__ MatScalar shared[512];
329: PetscInt i, row, slice_id, row_in_slice;
330: /* multiple threads per row. */
331: row = blockIdx.x * blockDim.x + threadIdx.x;
332: if (row < nrows) {
333: slice_id = row / SLICE_HEIGHT;
334: row_in_slice = row % SLICE_HEIGHT;
336: shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
337: for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
338: __syncthreads();
339: if (threadIdx.y < 16) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x];
340: __syncthreads();
341: if (threadIdx.y < 8) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x];
342: __syncthreads();
343: if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x];
344: __syncthreads();
345: if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x];
346: __syncthreads();
347: if (threadIdx.y < 1) {
348: shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
349: y[row] = shared[threadIdx.x];
350: }
351: }
352: }
354: static __global__ void matmult_seqsell_tiled_kernel5(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
355: {
356: __shared__ MatScalar shared[512];
357: PetscInt i, row, slice_id, row_in_slice;
358: /* multiple threads per row. */
359: row = blockIdx.x * blockDim.x + threadIdx.x;
360: if (row < nrows) {
361: slice_id = row / SLICE_HEIGHT;
362: row_in_slice = row % SLICE_HEIGHT;
364: shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
365: for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
366: __syncthreads();
367: if (threadIdx.y < 8) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x];
368: __syncthreads();
369: if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x];
370: __syncthreads();
371: if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x];
372: __syncthreads();
373: if (threadIdx.y < 1) {
374: shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
375: y[row] = shared[threadIdx.x];
376: }
377: }
378: }
380: static __global__ void matmult_seqsell_tiled_kernel4(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
381: {
382: __shared__ MatScalar shared[512];
383: PetscInt i, row, slice_id, row_in_slice;
384: /* multiple threads per row. */
385: row = blockIdx.x * blockDim.x + threadIdx.x;
386: if (row < nrows) {
387: slice_id = row / SLICE_HEIGHT;
388: row_in_slice = row % SLICE_HEIGHT;
390: shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
391: for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
392: __syncthreads();
393: if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x];
394: __syncthreads();
395: if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x];
396: __syncthreads();
397: if (threadIdx.y < 1) {
398: shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
399: y[row] = shared[threadIdx.x];
400: }
401: }
402: }
404: static __global__ void matmult_seqsell_tiled_kernel3(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
405: {
406: __shared__ MatScalar shared[512];
407: PetscInt i, row, slice_id, row_in_slice;
408: /* multiple threads per row. */
409: row = blockIdx.x * blockDim.x + threadIdx.x;
410: if (row < nrows) {
411: slice_id = row / SLICE_HEIGHT;
412: row_in_slice = row % SLICE_HEIGHT;
414: shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
415: for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
416: __syncthreads();
417: if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x];
418: __syncthreads();
419: if (threadIdx.y < 1) {
420: shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
421: y[row] = shared[threadIdx.x];
422: }
423: }
424: }
426: static __global__ void matmult_seqsell_tiled_kernel2(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, PetscScalar *y)
427: {
428: __shared__ MatScalar shared[512];
429: PetscInt i, row, slice_id, row_in_slice;
430: /* multiple threads per row. */
431: row = blockIdx.x * blockDim.x + threadIdx.x;
432: if (row < nrows) {
433: slice_id = row / SLICE_HEIGHT;
434: row_in_slice = row % SLICE_HEIGHT;
436: shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
437: for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
438: __syncthreads();
439: if (threadIdx.y < 1) {
440: shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
441: y[row] = shared[threadIdx.x];
442: }
443: }
444: }
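/* The matmultadd_* kernels below mirror kernels 6-2 above but compute z = y + A*x instead of y = A*x. */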
446: static __global__ void matmultadd_seqsell_tiled_kernel6(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
447: {
448: __shared__ MatScalar shared[512];
449: PetscInt i, row, slice_id, row_in_slice;
450: /* multiple threads per row. */
451: row = blockIdx.x * blockDim.x + threadIdx.x;
452: if (row < nrows) {
453: slice_id = row / SLICE_HEIGHT;
454: row_in_slice = row % SLICE_HEIGHT;
456: shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
457: for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
458: __syncthreads();
459: if (threadIdx.y < 16) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 16) * blockDim.x + threadIdx.x];
460: __syncthreads();
461: if (threadIdx.y < 8) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x];
462: __syncthreads();
463: if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x];
464: __syncthreads();
465: if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x];
466: __syncthreads();
467: if (threadIdx.y < 1) {
468: shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
469: z[row] = y[row] + shared[threadIdx.x];
470: }
471: }
472: }
474: static __global__ void matmultadd_seqsell_tiled_kernel5(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
475: {
476: __shared__ MatScalar shared[512];
477: PetscInt i, row, slice_id, row_in_slice;
478: /* multiple threads per row. */
479: row = blockIdx.x * blockDim.x + threadIdx.x;
480: if (row < nrows) {
481: slice_id = row / SLICE_HEIGHT;
482: row_in_slice = row % SLICE_HEIGHT;
484: shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
485: for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
486: __syncthreads();
487: if (threadIdx.y < 8) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 8) * blockDim.x + threadIdx.x];
488: __syncthreads();
489: if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x];
490: __syncthreads();
491: if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x];
492: __syncthreads();
493: if (threadIdx.y < 1) {
494: shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
495: z[row] = y[row] + shared[threadIdx.x];
496: }
497: }
498: }
500: static __global__ void matmultadd_seqsell_tiled_kernel4(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
501: {
502: __shared__ MatScalar shared[512];
503: PetscInt i, row, slice_id, row_in_slice;
504: /* multiple threads per row. */
505: row = blockIdx.x * blockDim.x + threadIdx.x;
506: if (row < nrows) {
507: slice_id = row / SLICE_HEIGHT;
508: row_in_slice = row % SLICE_HEIGHT;
510: shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
511: for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
512: __syncthreads();
513: if (threadIdx.y < 4) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 4) * blockDim.x + threadIdx.x];
514: __syncthreads();
515: if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x];
516: __syncthreads();
517: if (threadIdx.y < 1) {
518: shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
519: z[row] = y[row] + shared[threadIdx.x];
520: }
521: }
522: }
524: static __global__ void matmultadd_seqsell_tiled_kernel3(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
525: {
526: __shared__ MatScalar shared[512];
527: PetscInt i, row, slice_id, row_in_slice;
528: /* multiple threads per row. */
529: row = blockIdx.x * blockDim.x + threadIdx.x;
530: if (row < nrows) {
531: slice_id = row / SLICE_HEIGHT;
532: row_in_slice = row % SLICE_HEIGHT;
534: shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
535: for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
536: __syncthreads();
537: if (threadIdx.y < 2) shared[threadIdx.y * blockDim.x + threadIdx.x] += shared[(threadIdx.y + 2) * blockDim.x + threadIdx.x];
538: __syncthreads();
539: if (threadIdx.y < 1) {
540: shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
541: z[row] = y[row] + shared[threadIdx.x];
542: }
543: }
544: }
546: static __global__ void matmultadd_seqsell_tiled_kernel2(PetscInt nrows, const PetscInt *acolidx, const MatScalar *aval, const PetscInt *sliidx, const PetscScalar *x, const PetscScalar *y, PetscScalar *z)
547: {
548: __shared__ MatScalar shared[512];
549: PetscInt i, row, slice_id, row_in_slice;
550: /* multiple threads per row. */
551: row = blockIdx.x * blockDim.x + threadIdx.x;
552: if (row < nrows) {
553: slice_id = row / SLICE_HEIGHT;
554: row_in_slice = row % SLICE_HEIGHT;
556: shared[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;
557: for (i = sliidx[slice_id] + row_in_slice + SLICE_HEIGHT * threadIdx.y; i < sliidx[slice_id + 1]; i += SLICE_HEIGHT * blockDim.y) shared[threadIdx.y * blockDim.x + threadIdx.x] += aval[i] * x[acolidx[i]];
558: __syncthreads();
559: if (threadIdx.y < 1) {
560: shared[threadIdx.x] += shared[blockDim.x + threadIdx.x];
561: z[row] = y[row] + shared[threadIdx.x];
562: }
563: }
564: }
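/*
   MatMult_SeqSELLCUDA computes y = A*x. cudastruct->kernelchoice (settable with -mat_sell_spmv_cuda_kernel)
   selects the kernel: 1 is the basic one-thread-per-row kernel, 2-6 are the fixed slice-height-16 kernels,
   7-9 are the general warp-shuffle kernels (real scalars only), and 0 (the default) picks a kernel from the
   slice-width statistics gathered at assembly time: the load-balancing kernel 8 when the maximum slice width
   is much larger than the average, kernel 7 or 9 otherwise.
*/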
566: static PetscErrorCode MatMult_SeqSELLCUDA(Mat A, Vec xx, Vec yy)
567: {
568: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
569: Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
570: PetscScalar *y;
571: const PetscScalar *x;
572: PetscInt nrows = A->rmap->n, sliceheight = a->sliceheight;
573: MatScalar *aval;
574: PetscInt *acolidx;
575: PetscInt *sliidx;
576: PetscInt nblocks, blocksize = 512; /* blocksize must be a multiple of SLICE_HEIGHT*32 */
577: dim3 block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32);
578: #if !defined(PETSC_USE_COMPLEX)
579: PetscInt chunksperblock, nchunks, *chunk_slice_map;
580: PetscReal maxoveravg;
581: #endif
583: PetscFunctionBegin;
584: PetscCheck(32 % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernels require the slice height to be a divisor of 32, but the input matrix has a slice height of %" PetscInt_FMT, sliceheight);
585: PetscCheck(!(cudastruct->kernelchoice >= 2 && cudastruct->kernelchoice <= 6 && sliceheight != SLICE_HEIGHT), PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Kernel choices {2-6} require the slice height of the matrix to be 16, but the current slice height is %" PetscInt_FMT, sliceheight);
586: PetscCall(MatSeqSELLCUDACopyToGPU(A));
587: /* the device arrays in cudastruct are not guaranteed to be valid until MatSeqSELLCUDACopyToGPU() has been called */
588: aval = cudastruct->val;
589: acolidx = cudastruct->colidx;
590: sliidx = cudastruct->sliidx;
592: PetscCall(VecCUDAGetArrayRead(xx, &x));
593: PetscCall(VecCUDAGetArrayWrite(yy, &y));
594: PetscCall(PetscLogGpuTimeBegin());
596: switch (cudastruct->kernelchoice) {
597: #if !defined(PETSC_USE_COMPLEX)
598: case 9:
599: nblocks = 1 + (nrows - 1) / sliceheight;
600: if (cudastruct->blocky == 2) {
601: matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
602: } else if (cudastruct->blocky == 4) {
603: matmult_seqsell_tiled_kernel9<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
604: } else if (cudastruct->blocky == 8) {
605: matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
606: } else if (cudastruct->blocky == 16) {
607: matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
608: } else if (cudastruct->blocky == 32) {
609: matmult_seqsell_tiled_kernel9<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
610: } else {
611: matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
612: }
613: break;
614: case 7:
615: nblocks = 1 + (nrows - 1) / (2 * sliceheight);
616: if (cudastruct->blocky == 2) {
617: matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
618: } else if (cudastruct->blocky == 4) {
619: matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
620: } else if (cudastruct->blocky == 8) {
621: matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
622: } else if (cudastruct->blocky == 16) {
623: matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
624: } else if (cudastruct->blocky == 32) {
625: matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
626: } else {
627: matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
628: }
629: break;
630: #endif
631: case 6:
632: nblocks = 1 + (nrows - 1) / (blocksize / 32); /* 1 slice per block if blocksize=512 */
633: matmult_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, acolidx, aval, sliidx, x, y);
634: break;
635: case 5:
636: nblocks = 1 + (nrows - 1) / (blocksize / 16); /* 2 slices per block if blocksize=512 */
637: matmult_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, acolidx, aval, sliidx, x, y);
638: break;
639: case 4:
640: nblocks = 1 + (nrows - 1) / (blocksize / 8); /* 4 slices per block if blocksize=512 */
641: matmult_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, acolidx, aval, sliidx, x, y);
642: break;
643: case 3:
644: nblocks = 1 + (nrows - 1) / (blocksize / 4); /* 8 slices per block if blocksize=512 */
645: matmult_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, acolidx, aval, sliidx, x, y);
646: break;
647: case 2: /* 16 slices per block if blocksize=512 */
648: nblocks = 1 + (nrows - 1) / (blocksize / 2);
649: matmult_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, acolidx, aval, sliidx, x, y);
650: break;
651: case 1: /* 32 slices per block if blocksize=512 */
652: nblocks = 1 + (nrows - 1) / blocksize;
653: matmult_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
654: break;
655: #if !defined(PETSC_USE_COMPLEX)
656: case 0:
657: maxoveravg = a->maxslicewidth / a->avgslicewidth;
658: if (maxoveravg > 12.0 && maxoveravg / nrows > 0.001) { /* slice widths are highly irregular: use the load-balancing kernel */
659: /* each block handles approximately one slice */
660: PetscInt blocky = a->chunksize / 32;
661: nchunks = cudastruct->totalchunks;
662: chunksperblock = cudastruct->chunksperblock ? cudastruct->chunksperblock : 1 + (cudastruct->totalentries / cudastruct->totalslices - 1) / a->chunksize;
663: nblocks = 1 + (nchunks - 1) / chunksperblock;
664: chunk_slice_map = cudastruct->chunk_slice_map;
665: if (blocky == 2) {
666: matmult_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
667: } else if (blocky == 4) {
668: matmult_seqsell_tiled_kernel8<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
669: } else if (blocky == 8) {
670: matmult_seqsell_tiled_kernel8<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
671: } else if (blocky == 16) {
672: matmult_seqsell_tiled_kernel8<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
673: } else if (blocky == 32) {
674: matmult_seqsell_tiled_kernel8<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
675: } else {
676: matmult_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y);
677: }
678: } else {
679: PetscInt avgslicesize = sliceheight * a->avgslicewidth;
680: if (avgslicesize <= 432) {
681: if (sliceheight * a->maxslicewidth < 2048 && nrows > 100000) {
682: nblocks = 1 + (nrows - 1) / (2 * sliceheight); /* two slices per block */
683: matmult_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
684: } else {
685: nblocks = 1 + (nrows - 1) / sliceheight;
686: matmult_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
687: }
688: } else if (avgslicesize <= 2400) {
689: nblocks = 1 + (nrows - 1) / sliceheight;
690: matmult_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
691: } else {
692: nblocks = 1 + (nrows - 1) / sliceheight;
693: matmult_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y);
694: }
695: }
696: break;
697: #endif
698: default:
699: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported kernel choice %" PetscInt_FMT " for MatMult_SeqSELLCUDA.", cudastruct->kernelchoice);
700: }
701: PetscCall(PetscLogGpuTimeEnd());
702: PetscCall(VecCUDARestoreArrayRead(xx, &x));
703: PetscCall(VecCUDARestoreArrayWrite(yy, &y));
704: PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
705: PetscFunctionReturn(PETSC_SUCCESS);
706: }
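/*
   MatMultAdd_SeqSELLCUDA computes z = y + A*x using the matmultadd_* variants of the kernels above and the
   same kernel-selection logic as MatMult_SeqSELLCUDA. If the matrix has no stored nonzeros it reduces to a
   vector copy.
*/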
708: static PetscErrorCode MatMultAdd_SeqSELLCUDA(Mat A, Vec xx, Vec yy, Vec zz)
709: {
710: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
711: Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
712: PetscScalar *z;
713: const PetscScalar *y, *x;
714: PetscInt nrows = A->rmap->n, sliceheight = a->sliceheight;
715: MatScalar *aval = cudastruct->val;
716: PetscInt *acolidx = cudastruct->colidx;
717: PetscInt *sliidx = cudastruct->sliidx;
718: #if !defined(PETSC_USE_COMPLEX)
719: PetscReal maxoveravg;
720: PetscInt chunksperblock, nchunks, *chunk_slice_map;
721: PetscInt blocky = cudastruct->blocky;
722: #endif
724: PetscFunctionBegin;
725: PetscCheck(32 % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernels require the slice height to be a divisor of 32, but the input matrix has a slice height of %" PetscInt_FMT, sliceheight);
726: PetscCheck(!(cudastruct->kernelchoice >= 2 && cudastruct->kernelchoice <= 6 && sliceheight != SLICE_HEIGHT), PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Kernel choices {2-6} require the slice height of the matrix to be 16, but the current slice height is %" PetscInt_FMT, sliceheight);
727: PetscCall(MatSeqSELLCUDACopyToGPU(A));
728: if (a->nz) {
729: PetscInt nblocks, blocksize = 512;
730: dim3 block2(256, 2), block4(128, 4), block8(64, 8), block16(32, 16), block32(16, 32);
731: PetscCall(VecCUDAGetArrayRead(xx, &x));
732: PetscCall(VecCUDAGetArrayRead(yy, &y));
733: PetscCall(VecCUDAGetArrayWrite(zz, &z));
734: PetscCall(PetscLogGpuTimeBegin());
736: switch (cudastruct->kernelchoice) {
737: #if !defined(PETSC_USE_COMPLEX)
738: case 9:
739: nblocks = 1 + (nrows - 1) / sliceheight;
740: if (blocky == 2) {
741: matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
742: } else if (blocky == 4) {
743: matmultadd_seqsell_tiled_kernel9<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
744: } else if (blocky == 8) {
745: matmultadd_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
746: } else if (blocky == 16) {
747: matmultadd_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
748: } else if (blocky == 32) {
749: matmultadd_seqsell_tiled_kernel9<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
750: } else {
751: matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
752: }
753: break;
754: case 8:
755: /* each block handles approximately one slice */
756: nchunks = cudastruct->totalchunks;
757: blocky = a->chunksize / 32;
758: chunksperblock = cudastruct->chunksperblock ? cudastruct->chunksperblock : 1 + (cudastruct->totalentries / cudastruct->totalslices - 1) / a->chunksize;
759: nblocks = 1 + (nchunks - 1) / chunksperblock;
760: chunk_slice_map = cudastruct->chunk_slice_map;
761: if (blocky == 2) {
762: matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
763: } else if (blocky == 4) {
764: matmultadd_seqsell_tiled_kernel8<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
765: } else if (blocky == 8) {
766: matmultadd_seqsell_tiled_kernel8<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
767: } else if (blocky == 16) {
768: matmultadd_seqsell_tiled_kernel8<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
769: } else if (blocky == 32) {
770: matmultadd_seqsell_tiled_kernel8<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
771: } else {
772: matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
773: }
774: break;
775: case 7:
776: nblocks = 1 + (nrows - 1) / (2 * sliceheight);
777: if (blocky == 2) {
778: matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
779: } else if (blocky == 4) {
780: matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
781: } else if (blocky == 8) {
782: matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
783: } else if (blocky == 16) {
784: matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
785: } else if (blocky == 32) {
786: matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
787: } else {
788: matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
789: }
790: break;
791: #endif
792: case 6:
793: nblocks = 1 + (nrows - 1) / (blocksize / 32);
794: matmultadd_seqsell_tiled_kernel6<<<nblocks, block32>>>(nrows, acolidx, aval, sliidx, x, y, z);
795: break;
796: case 5:
797: nblocks = 1 + (nrows - 1) / (blocksize / 16);
798: matmultadd_seqsell_tiled_kernel5<<<nblocks, block16>>>(nrows, acolidx, aval, sliidx, x, y, z);
799: break;
800: case 4:
801: nblocks = 1 + (nrows - 1) / (blocksize / 8);
802: matmultadd_seqsell_tiled_kernel4<<<nblocks, block8>>>(nrows, acolidx, aval, sliidx, x, y, z);
803: break;
804: case 3:
805: nblocks = 1 + (nrows - 1) / (blocksize / 4);
806: matmultadd_seqsell_tiled_kernel3<<<nblocks, block4>>>(nrows, acolidx, aval, sliidx, x, y, z);
807: break;
808: case 2:
809: nblocks = 1 + (nrows - 1) / (blocksize / 2);
810: matmultadd_seqsell_tiled_kernel2<<<nblocks, block2>>>(nrows, acolidx, aval, sliidx, x, y, z);
811: break;
812: case 1:
813: nblocks = 1 + (nrows - 1) / blocksize;
814: matmultadd_seqsell_basic_kernel<<<nblocks, blocksize>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
815: break;
816: #if !defined(PETSC_USE_COMPLEX)
817: case 0:
818: maxoveravg = a->maxslicewidth / a->avgslicewidth;
819: if (maxoveravg > 12.0 && maxoveravg / nrows > 0.001) { /* slice widths are highly irregular: use the load-balancing kernel */
820: /* each block handles approximately one slice */
821: nchunks = cudastruct->totalchunks;
822: blocky = a->chunksize / 32;
823: chunksperblock = cudastruct->chunksperblock ? cudastruct->chunksperblock : 1 + (cudastruct->totalentries / cudastruct->totalslices - 1) / a->chunksize;
824: nblocks = 1 + (nchunks - 1) / chunksperblock;
825: chunk_slice_map = cudastruct->chunk_slice_map;
826: if (blocky == 2) {
827: matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
828: } else if (blocky == 4) {
829: matmultadd_seqsell_tiled_kernel8<4><<<nblocks, dim3(32, 4)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
830: } else if (blocky == 8) {
831: matmultadd_seqsell_tiled_kernel8<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
832: } else if (blocky == 16) {
833: matmultadd_seqsell_tiled_kernel8<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
834: } else if (blocky == 32) {
835: matmultadd_seqsell_tiled_kernel8<32><<<nblocks, dim3(32, 32)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
836: } else {
837: matmultadd_seqsell_tiled_kernel8<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, chunksperblock, nchunks, chunk_slice_map, acolidx, aval, sliidx, x, y, z);
838: }
839: } else {
840: PetscInt avgslicesize = sliceheight * a->avgslicewidth;
841: if (avgslicesize <= 432) {
842: if (sliceheight * a->maxslicewidth < 2048 && nrows > 100000) {
843: nblocks = 1 + (nrows - 1) / (2 * sliceheight); /* two slices per block */
844: matmultadd_seqsell_tiled_kernel7<<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
845: } else {
846: nblocks = 1 + (nrows - 1) / sliceheight;
847: matmultadd_seqsell_tiled_kernel9<2><<<nblocks, dim3(32, 2)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
848: }
849: } else if (avgslicesize <= 2400) {
850: nblocks = 1 + (nrows - 1) / sliceheight;
851: matmultadd_seqsell_tiled_kernel9<8><<<nblocks, dim3(32, 8)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
852: } else {
853: nblocks = 1 + (nrows - 1) / sliceheight;
854: matmultadd_seqsell_tiled_kernel9<16><<<nblocks, dim3(32, 16)>>>(nrows, sliceheight, acolidx, aval, sliidx, x, y, z);
855: }
856: }
857: break;
858: #endif
859: default:
860: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported kernel choice %" PetscInt_FMT " for MatMultAdd_SeqSELLCUDA.", cudastruct->kernelchoice);
861: }
862: PetscCall(PetscLogGpuTimeEnd());
863: PetscCall(VecCUDARestoreArrayRead(xx, &x));
864: PetscCall(VecCUDARestoreArrayRead(yy, &y));
865: PetscCall(VecCUDARestoreArrayWrite(zz, &z));
866: PetscCall(PetscLogGpuFlops(2.0 * a->nz));
867: } else {
868: PetscCall(VecCopy(yy, zz));
869: }
870: PetscFunctionReturn(PETSC_SUCCESS);
871: }
873: static PetscErrorCode MatSetFromOptions_SeqSELLCUDA(Mat A, PetscOptionItems PetscOptionsObject)
874: {
875: Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
876: PetscInt kernel, blocky;
877: PetscBool flg;
879: PetscFunctionBegin;
880: PetscOptionsHeadBegin(PetscOptionsObject, "SeqSELLCUDA options");
881: PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_blocky", &blocky, &flg));
882: if (flg) {
883: PetscCheck(blocky == 2 || blocky == 4 || blocky == 8 || blocky == 16 || blocky == 32, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Unsupported blocky: %" PetscInt_FMT ", it should be in {2,4,8,16,32}", blocky);
884: cudastruct->blocky = blocky;
885: }
886: PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_kernel", &kernel, &flg));
887: if (flg) {
888: PetscCheck(kernel >= 0 && kernel <= 9, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Wrong kernel choice: %" PetscInt_FMT ", it should be in [0,9]", kernel);
889: cudastruct->kernelchoice = kernel;
890: if (kernel == 8) PetscCall(PetscOptionsGetInt(NULL, NULL, "-mat_sell_spmv_cuda_chunksperblock", &cudastruct->chunksperblock, &flg));
891: }
892: PetscOptionsHeadEnd();
893: PetscFunctionReturn(PETSC_SUCCESS);
894: }
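/* Gather the slice statistics (average slice width, maximum slice width, fill ratio) that the kernel-selection
   heuristic in MatMult_SeqSELLCUDA and MatMultAdd_SeqSELLCUDA relies on. */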
896: PETSC_INTERN PetscErrorCode MatAssemblyEnd_SpMV_Preprocessing_Private(Mat A)
897: {
898: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
900: PetscFunctionBegin;
901: PetscCall(MatSeqSELLGetAvgSliceWidth(A, &a->avgslicewidth));
902: PetscCall(MatSeqSELLGetMaxSliceWidth(A, &a->maxslicewidth));
903: PetscCall(MatSeqSELLGetFillRatio(A, &a->fillratio));
904: PetscFunctionReturn(PETSC_SUCCESS);
905: }
907: static PetscErrorCode MatAssemblyEnd_SeqSELLCUDA(Mat A, MatAssemblyType mode)
908: {
909: PetscFunctionBegin;
910: PetscCall(MatAssemblyEnd_SeqSELL(A, mode));
911: PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(A));
912: if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
913: if (A->factortype == MAT_FACTOR_NONE) PetscCall(MatSeqSELLCUDACopyToGPU(A));
914: A->ops->mult = MatMult_SeqSELLCUDA;
915: A->ops->multadd = MatMultAdd_SeqSELLCUDA;
916: PetscFunctionReturn(PETSC_SUCCESS);
917: }
919: static PetscErrorCode MatZeroEntries_SeqSELLCUDA(Mat A)
920: {
921: PetscBool both = PETSC_FALSE;
922: Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
924: PetscFunctionBegin;
925: if (A->factortype == MAT_FACTOR_NONE) {
926: Mat_SeqSELLCUDA *cudastruct = (Mat_SeqSELLCUDA *)A->spptr;
927: if (cudastruct->val) {
928: both = PETSC_TRUE;
929: PetscCallCUDA(cudaMemset(cudastruct->val, 0, a->sliidx[a->totalslices] * sizeof(*cudastruct->val)));
930: }
931: }
932: PetscCall(PetscArrayzero(a->val, a->sliidx[a->totalslices]));
933: PetscCall(MatSeqSELLInvalidateDiagonal(A));
934: if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
935: else A->offloadmask = PETSC_OFFLOAD_CPU;
936: PetscFunctionReturn(PETSC_SUCCESS);
937: }
939: static PetscErrorCode MatDestroy_SeqSELLCUDA(Mat A)
940: {
941: PetscFunctionBegin;
942: if (A->factortype == MAT_FACTOR_NONE && A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) PetscCall(MatSeqSELLCUDA_Destroy((Mat_SeqSELLCUDA **)&A->spptr));
943: PetscCall(MatDestroy_SeqSELL(A));
944: PetscFunctionReturn(PETSC_SUCCESS);
945: }
947: PETSC_INTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat);
948: static PetscErrorCode MatDuplicate_SeqSELLCUDA(Mat A, MatDuplicateOption cpvalues, Mat *B)
949: {
950: PetscFunctionBegin;
951: PetscCall(MatDuplicate_SeqSELL(A, cpvalues, B));
952: PetscCall(MatConvert_SeqSELL_SeqSELLCUDA(*B));
953: PetscFunctionReturn(PETSC_SUCCESS);
954: }
956: PETSC_INTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat B)
957: {
958: Mat_SeqSELLCUDA *cudastruct;
960: PetscFunctionBegin;
961: PetscCall(PetscFree(B->defaultvectype));
962: PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));
964: if (!B->spptr) {
965: if (B->factortype == MAT_FACTOR_NONE) {
966: PetscCall(PetscNew(&cudastruct));
967: B->spptr = cudastruct;
968: }
969: }
971: B->ops->assemblyend = MatAssemblyEnd_SeqSELLCUDA;
972: B->ops->destroy = MatDestroy_SeqSELLCUDA;
973: B->ops->setfromoptions = MatSetFromOptions_SeqSELLCUDA;
974: B->ops->mult = MatMult_SeqSELLCUDA;
975: B->ops->multadd = MatMultAdd_SeqSELLCUDA;
976: B->ops->duplicate = MatDuplicate_SeqSELLCUDA;
977: B->ops->zeroentries = MatZeroEntries_SeqSELLCUDA;
979: /* no need to reassemble the SeqSELL data, but the SpMV preprocessing must still be done */
980: PetscCall(MatAssemblyEnd_SpMV_Preprocessing_Private(B));
982: PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELLCUDA));
983: B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
984: PetscFunctionReturn(PETSC_SUCCESS);
985: }
987: /*MC
988: MATSEQSELLCUDA - MATSELLCUDA = "(seq)sellcuda" - A matrix type to be used for sparse matrices on NVIDIA GPUs.
990: Options Database Keys:
991: + -mat_type seqsellcuda - sets the matrix type to "seqsellcuda" during a call to `MatSetFromOptions()`
992: . -mat_sell_spmv_cuda_kernel - selects an SpMV kernel for `MATSELLCUDA`
993: - -mat_sell_spmv_cuda_blocky - sets the y dimension of the thread blocks used by the SpMV kernels; these kernels use a 2D block whose x dimension is 32
995: Level: beginner
997: .seealso: [](ch_matrices), `Mat`, `MATSELLCUDA`
998: M*/
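/*
   Minimal usage sketch (illustrative only, assuming PETSc was configured with CUDA): assemble a small
   tridiagonal matrix directly as MATSEQSELLCUDA and apply it once. Any code that calls MatSetFromOptions()
   can instead select the type at run time with -mat_type seqsellcuda and tune the kernels with the options
   listed above.

     #include <petscmat.h>

     int main(int argc, char **argv)
     {
       Mat         A;
       Vec         x, y;
       PetscInt    i, n = 100, col[3];
       PetscScalar v[3] = {-1.0, 2.0, -1.0};

       PetscCall(PetscInitialize(&argc, &argv, NULL, NULL));
       PetscCall(MatCreate(PETSC_COMM_SELF, &A));
       PetscCall(MatSetSizes(A, n, n, n, n));
       PetscCall(MatSetType(A, MATSEQSELLCUDA));
       PetscCall(MatSetFromOptions(A)); // picks up -mat_sell_spmv_cuda_kernel, -mat_sell_spmv_cuda_blocky
       PetscCall(MatSetUp(A));
       for (i = 0; i < n; i++) {
         col[0] = i - 1; col[1] = i; col[2] = i + 1;
         if (i == 0) PetscCall(MatSetValues(A, 1, &i, 2, col + 1, v + 1, INSERT_VALUES));
         else if (i == n - 1) PetscCall(MatSetValues(A, 1, &i, 2, col, v, INSERT_VALUES));
         else PetscCall(MatSetValues(A, 1, &i, 3, col, v, INSERT_VALUES));
       }
       PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
       PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY)); // runs MatAssemblyEnd_SeqSELLCUDA and copies data to the GPU
       PetscCall(MatCreateVecs(A, &x, &y));
       PetscCall(VecSet(x, 1.0));
       PetscCall(MatMult(A, x, y)); // dispatches to MatMult_SeqSELLCUDA
       PetscCall(VecDestroy(&x));
       PetscCall(VecDestroy(&y));
       PetscCall(MatDestroy(&A));
       PetscCall(PetscFinalize());
       return 0;
     }
*/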
1000: PETSC_EXTERN PetscErrorCode MatCreate_SeqSELLCUDA(Mat B)
1001: {
1002: PetscFunctionBegin;
1003: PetscCall(MatCreate_SeqSELL(B));
1004: PetscCall(MatConvert_SeqSELL_SeqSELLCUDA(B));
1005: PetscCall(MatSetFromOptions(B));
1006: PetscFunctionReturn(PETSC_SUCCESS);
1007: }