Actual source code: ex5cu.cu

  1: static char help[] = "Test of CUDA matrix assemble with simple matrix.\n\n";

  3: // This a minimal example of the use of the CUDA MatAIJ metadata for assembly.
  4: //
  5: // The matrix must be a type 'aijcusparse' and must first be assembled on the CPU to provide the nonzero pattern.
  6: // Next, get a pointer to a simple CSR mirror (PetscSplitCSRDataStructure) of the matrix data on
  7: //    the GPU with MatCUSPARSEGetDeviceMatWrite().
  8: // Then use this object to populate the matrix on the GPU with MatSetValuesDevice().
  9: // Finally call MatAssemblyBegin/End() and the matrix is ready to use on the GPU without matrix data movement between the
 10: //    host and GPU.

 12: #include <petscconf.h>
 13: #include <petscmat.h>
 14: #include <petscdevice_cuda.h>
 15: #include <assert.h>

 17: #include <petscaijdevice.h>
 18: __global__ void assemble_on_gpu(PetscSplitCSRDataStructure d_mat, PetscInt start, PetscInt end, PetscInt N, PetscMPIInt rank)
 19: {
 20:   const PetscInt inc = blockDim.x, my0 = threadIdx.x;
 21:   PetscInt       i;

 24:   for (i = start + my0; i < end + 1; i += inc) {
 25:     PetscInt    js[] = {i - 1, i}, nn = (i == N) ? 1 : 2; // negative indices are igored but >= N are not, so clip end
 26:     PetscScalar values[] = {1, 1, 1, 1};
 27:     MatSetValuesDevice(d_mat, nn, js, nn, js, values, ADD_VALUES);
 28:     if (ierr) assert(0);
 29:   }
 30: }

 32: PetscErrorCode assemble_on_cpu(Mat A, PetscInt start, PetscInt end, PetscInt N, PetscMPIInt rank)
 33: {
 35:   for (PetscInt i = start; i < end + 1; i++) {
 36:     PetscInt    js[] = {i - 1, i}, nn = (i == N) ? 1 : 2;
 37:     PetscScalar values[] = {1, 1, 1, 1};
 38:     MatSetValues(A, nn, js, nn, js, values, ADD_VALUES);
 39:   }
 40:   return 0;
 41: }

 43: int main(int argc, char **args)
 44: {
 45:   Mat                        A;
 46:   PetscInt                   N = 11, nz = 3, Istart, Iend, num_threads = 128;
 47:   PetscSplitCSRDataStructure d_mat;
 48:   PetscLogEvent              event;
 49:   PetscMPIInt                rank, size;
 50:   PetscBool                  testmpiseq = PETSC_FALSE;
 51:   Vec                        x, y;

 54:   PetscInitialize(&argc, &args, (char *)0, help);
 55:   PetscOptionsGetInt(NULL, NULL, "-n", &N, NULL);
 56:   PetscOptionsGetInt(NULL, NULL, "-num_threads", &num_threads, NULL);
 57:   PetscOptionsGetInt(NULL, NULL, "-nz_row", &nz, NULL);
 58:   PetscOptionsGetBool(NULL, NULL, "-testmpiseq", &testmpiseq, NULL);
 59:   if (nz < 3) nz = 3;
 60:   if (nz > N + 1) nz = N + 1;
 61:   MPI_Comm_rank(PETSC_COMM_WORLD, &rank);
 62:   MPI_Comm_size(PETSC_COMM_WORLD, &size);

 64:   PetscLogEventRegister("GPU operator", MAT_CLASSID, &event);
 65:   MatCreateAIJCUSPARSE(PETSC_COMM_WORLD, PETSC_DECIDE, PETSC_DECIDE, N, N, nz, NULL, nz - 1, NULL, &A);
 66:   MatSetFromOptions(A);
 67:   MatSetOption(A, MAT_IGNORE_OFF_PROC_ENTRIES, PETSC_TRUE);
 68:   MatCreateVecs(A, &x, &y);
 69:   MatGetOwnershipRange(A, &Istart, &Iend);
 70:   /* current GPU assembly code does not support offprocessor values insertion */
 71:   assemble_on_cpu(A, Istart, Iend, N, rank);
 72:   MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);
 73:   MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);

 75:   // test
 76:   VecSet(x, 1.0);
 77:   MatMult(A, x, y);
 78:   VecViewFromOptions(y, NULL, "-ex5_vec_view");

 80:   if (testmpiseq && size == 1) {
 81:     MatConvert(A, MATSEQAIJ, MAT_INPLACE_MATRIX, &A);
 82:     MatConvert(A, MATMPIAIJCUSPARSE, MAT_INPLACE_MATRIX, &A);
 83:   }
 84:   PetscLogEventBegin(event, 0, 0, 0, 0);
 85:   MatCUSPARSEGetDeviceMatWrite(A, &d_mat);
 86:   assemble_on_gpu<<<1, num_threads>>>(d_mat, Istart, Iend, N, rank);
 87:   cudaDeviceSynchronize();
 88:   MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);
 89:   MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);
 90:   PetscLogEventEnd(event, 0, 0, 0, 0);

 92:   // test
 93:   VecSet(x, 1.0);
 94:   MatMult(A, x, y);
 95:   VecViewFromOptions(y, NULL, "-ex5_vec_view");

 97:   MatDestroy(&A);
 98:   VecDestroy(&x);
 99:   VecDestroy(&y);
100:   PetscFinalize();
101:   return 0;
102: }

104: /*TEST

106:    build:
107:       requires: cuda

109:    test:
110:       suffix: 0
111:       diff_args: -j
112:       args: -n 11 -ex5_vec_view
113:       nsize: 1

115:    test:
116:       suffix: 1
117:       diff_args: -j
118:       args: -n 11 -ex5_vec_view
119:       nsize: 2

121:    test:
122:       suffix: 2
123:       diff_args: -j
124:       args: -n 11 -testmpiseq -ex5_vec_view
125:       nsize: 1

127: TEST*/