Actual source code: aijcusparse.cu

  1: /*
  2:   Defines the basic matrix operations for the AIJ (compressed row)
  3:   matrix storage format using the CUSPARSE library.
  4: */
  5: #define PETSC_SKIP_SPINLOCK
  6: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

  8: #include <petscconf.h>
  9: #include <../src/mat/impls/aij/seq/aij.h>
 10: #include <../src/mat/impls/sbaij/seq/sbaij.h>
 11: #include <../src/vec/vec/impls/dvecimpl.h>
 12: #include <petsc/private/vecimpl.h>
 13: #undef VecType
 14: #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
 15: #include <thrust/async/for_each.h>

 17: const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
 18: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
 19:   /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc., we list them in
 20:     0-based integer value order, since PetscOptionsEnum() is used to parse the user's command line options for them.

 22:   typedef enum {
 23:       CUSPARSE_MV_ALG_DEFAULT = 0,
 24:       CUSPARSE_COOMV_ALG      = 1,
 25:       CUSPARSE_CSRMV_ALG1     = 2,
 26:       CUSPARSE_CSRMV_ALG2     = 3
 27:   } cusparseSpMVAlg_t;

 29:   typedef enum {
 30:       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
 31:       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
 32:       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
 33:       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
 34:       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
 35:       CUSPARSE_SPMM_ALG_DEFAULT = 0,
 36:       CUSPARSE_SPMM_COO_ALG1    = 1,
 37:       CUSPARSE_SPMM_COO_ALG2    = 2,
 38:       CUSPARSE_SPMM_COO_ALG3    = 3,
 39:       CUSPARSE_SPMM_COO_ALG4    = 5,
 40:       CUSPARSE_SPMM_CSR_ALG1    = 4,
 41:       CUSPARSE_SPMM_CSR_ALG2    = 6,
 42:   } cusparseSpMMAlg_t;

 44:   typedef enum {
 45:       CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
 46:       CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
 47:   } cusparseCsr2CscAlg_t;
 48:   */
 49:   const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
 50:   const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
 51:   const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
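  /* Illustration (not part of the original source): PetscOptionsEnum() returns the 0-based index of
     the matched string, so "-mat_cusparse_spmv_alg csrmv_alg1" selects index 2 of
     MatCUSPARSESpMVAlgorithms[], which by construction equals CUSPARSE_CSRMV_ALG1 = 2 above. Keeping
     these arrays in enum-value order is what makes the cast to the cuSPARSE enums safe. */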
 52: #endif

 54: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
 55: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
 56: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

 58: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
 59: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
 60: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

 62: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
 63: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
 64: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
 65: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
 66: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
 67: static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
 68: static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
 69: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
 70: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
 71: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
 72: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
 73: static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
 74: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
 75: static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

 77: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
 78: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
 79: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
 80: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
 81: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

 83: PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
 84: static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
 85: static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);

 87: PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
 88: PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);

 90: static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);

 92: PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
 93: {
 94:   cusparseStatus_t   stat;
 95:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

 98:   if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
 99:   cusparsestruct->stream = stream;
100:   stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
101:   return(0);
102: }
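/* Example (a hedged sketch, not in the original source): attach a user-created stream so that
   cuSPARSE kernels launched for the matrix A are queued on it.

     cudaStream_t stream;
     cudaError_t  cerr;
     cerr = cudaStreamCreate(&stream);CHKERRCUDA(cerr);
     MatCUSPARSESetStream(A,stream);
*/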

104: PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
105: {
106:   cusparseStatus_t   stat;
107:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

110:   if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
111:   if (cusparsestruct->handle != handle) {
112:     if (cusparsestruct->handle) {
113:       stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
114:     }
115:     cusparsestruct->handle = handle;
116:   }
117:   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
118:   return(0);
119: }

121: PetscErrorCode MatCUSPARSEClearHandle(Mat A)
122: {
123:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
124:   PetscBool          flg;
125:   PetscErrorCode     ierr;

128:   PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);
129:   if (!flg || !cusparsestruct) return(0);
130:   if (cusparsestruct->handle) cusparsestruct->handle = 0;
131:   return(0);
132: }

134: PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
135: {
137:   *type = MATSOLVERCUSPARSE;
138:   return(0);
139: }

141: /*MC
142:   MATSOLVERCUSPARSE = "cusparse" - A matrix solver type providing triangular solvers for sequential matrices
143:   on a single GPU, for matrices of type seqaijcusparse, aijcusparse, seqaijcusp, or aijcusp. The currently supported
144:   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
145:   performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
146:   CUSPARSE triangular solve algorithm. However, their performance can be quite poor, so these
147:   algorithms are not recommended. This class does NOT support direct solver operations.

149:   Level: beginner

151: .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
152: M*/
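/* Example usage (a minimal sketch, not in the original source; assumes an existing KSP object ksp
   and an application binary ./app). The solver is selected through the standard factorization
   interface, either from the command line

     ./app -mat_type aijcusparse -pc_type ilu -pc_factor_mat_solver_type cusparse

   or programmatically:

     PC pc;
     KSPGetPC(ksp,&pc);
     PCSetType(pc,PCILU);
     PCFactorSetMatSolverType(pc,MATSOLVERCUSPARSE);
*/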

154: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
155: {
157:   PetscInt       n = A->rmap->n;

160:   MatCreate(PetscObjectComm((PetscObject)A),B);
161:   MatSetSizes(*B,n,n,n,n);
162:   (*B)->factortype = ftype;
163:   MatSetType(*B,MATSEQAIJCUSPARSE);

165:   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
166:     MatSetBlockSizesFromMats(*B,A,A);
167:     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
168:     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
169:     PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);
170:     PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]);
171:     PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]);
172:   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
173:     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
174:     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
175:     PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]);
176:     PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]);
177:   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

179:   MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);
180:   (*B)->canuseordering = PETSC_TRUE;
181:   PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);
182:   return(0);
183: }

185: PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
186: {
187:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

190:   switch (op) {
191:   case MAT_CUSPARSE_MULT:
192:     cusparsestruct->format = format;
193:     break;
194:   case MAT_CUSPARSE_ALL:
195:     cusparsestruct->format = format;
196:     break;
197:   default:
198:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
199:   }
200:   return(0);
201: }

203: /*@
204:    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
205:    operation. Only the MatMult operation can use different GPU storage formats
206:    for MPIAIJCUSPARSE matrices.

207:    Not Collective

209:    Input Parameters:
210: +  A - Matrix of type SEQAIJCUSPARSE
211: .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
212: -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

216:    Level: intermediate

218: .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
219: @*/
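/* Example (a sketch, not in the original source): store the matrix used by MatMult() in ELL format,
   equivalent to the command line option -mat_cusparse_mult_storage_format ell:

     MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,MAT_CUSPARSE_ELL);
*/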
220: PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
221: {

226:   PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));
227:   return(0);
228: }

230: PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
231: {

235:   switch (op) {
236:     case MAT_FORM_EXPLICIT_TRANSPOSE:
237:       /* need to destroy the transpose matrix if present, to prevent logic errors if flg is later set back to true */
238:       if (A->form_explicit_transpose && !flg) {MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);}
239:       A->form_explicit_transpose = flg;
240:       break;
241:     default:
242:       MatSetOption_SeqAIJ(A,op,flg);
243:       break;
244:   }
245:   return(0);
246: }

248: static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);

250: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
251: {
252:   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
253:   IS             isrow = b->row,iscol = b->col;
254:   PetscBool      row_identity,col_identity;

258:   MatSeqAIJCUSPARSECopyFromGPU(A);
259:   MatLUFactorNumeric_SeqAIJ(B,A,info);
260:   B->offloadmask = PETSC_OFFLOAD_CPU;
261:   /* determine which version of MatSolve needs to be used. */
262:   ISIdentity(isrow,&row_identity);
263:   ISIdentity(iscol,&col_identity);
264:   if (row_identity && col_identity) {
265:     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
266:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
267:     B->ops->matsolve = NULL;
268:     B->ops->matsolvetranspose = NULL;
269:   } else {
270:     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
271:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
272:     B->ops->matsolve = NULL;
273:     B->ops->matsolvetranspose = NULL;
274:   }

276:   /* get the triangular factors */
277:   MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);
278:   return(0);
279: }

281: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
282: {
283:   PetscErrorCode           ierr;
284:   MatCUSPARSEStorageFormat format;
285:   PetscBool                flg;
286:   Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

289:   PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");
290:   if (A->factortype == MAT_FACTOR_NONE) {
291:     PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
292:                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);
293:     if (flg) {MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);}

295:     PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
296:                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);
297:     if (flg) {MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);}
298:    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
299:     PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
300:                             "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);
301:     /* If the user set this option, check its consistency with cuSPARSE, since PetscOptionsEnum() assigns enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
302: #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
303:     if (flg && CUSPARSE_SPMV_CSR_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
304: #else
305:     if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
306: #endif
307:     PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
308:                             "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);
309:     if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

311:     PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
312:                             "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);
313:     if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
314:    #endif
315:   }
316:   PetscOptionsTail();
317:   return(0);
318: }
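/* Example (a sketch of the options registered above): running with

     -mat_cusparse_storage_format csr -mat_cusparse_spmv_alg csrmv_alg2 -mat_cusparse_csr2csc_alg alg1

   selects CSR storage for SpMV and TriSolve, the CSRMV_ALG2 SpMV algorithm, and the ALG1 csr2csc
   algorithm (the latter two options exist only when built against CUDA 11 or later, per the guards
   above). */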

320: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
321: {
322:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
323:   PetscErrorCode               ierr;

326:   MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);
327:   MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
328:   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
329:   return(0);
330: }

332: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
333: {
334:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
335:   PetscErrorCode               ierr;

338:   MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);
339:   MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
340:   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
341:   return(0);
342: }

344: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
345: {
346:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
347:   PetscErrorCode               ierr;

350:   MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);
351:   MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);
352:   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
353:   return(0);
354: }

356: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
357: {
358:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
359:   PetscErrorCode               ierr;

362:   MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);
363:   MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);
364:   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
365:   return(0);
366: }

368: static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
369: {
370:   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
371:   PetscInt                          n = A->rmap->n;
372:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
373:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
374:   cusparseStatus_t                  stat;
375:   const PetscInt                    *ai = a->i,*aj = a->j,*vi;
376:   const MatScalar                   *aa = a->a,*v;
377:   PetscInt                          *AiLo, *AjLo;
378:   PetscInt                          i,nz, nzLower, offset, rowOffset;
379:   PetscErrorCode                    ierr;
380:   cudaError_t                       cerr;

383:   if (!n) return(0);
384:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
385:     try {
386:       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
387:       nzLower=n+ai[n]-ai[1];
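      /* Row 0 of L holds only its unit diagonal, and each row i>0 contributes its ai[i+1]-ai[i]
         strictly lower entries plus a unit diagonal, giving n + (ai[n]-ai[1]) entries in total. */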
388:       if (!loTriFactor) {
389:         PetscScalar                       *AALo;

391:         cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

393:         /* Allocate Space for the lower triangular matrix */
394:         cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
395:         cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);

397:         /* Fill the lower triangular matrix */
398:         AiLo[0]  = (PetscInt) 0;
399:         AiLo[n]  = nzLower;
400:         AjLo[0]  = (PetscInt) 0;
401:         AALo[0]  = (MatScalar) 1.0;
402:         v        = aa;
403:         vi       = aj;
404:         offset   = 1;
405:         rowOffset= 1;
406:         for (i=1; i<n; i++) {
407:           nz = ai[i+1] - ai[i];
408:           /* additional 1 for the term on the diagonal */
409:           AiLo[i]    = rowOffset;
410:           rowOffset += nz+1;

412:           PetscArraycpy(&(AjLo[offset]), vi, nz);
413:           PetscArraycpy(&(AALo[offset]), v, nz);

415:           offset      += nz;
416:           AjLo[offset] = (PetscInt) i;
417:           AALo[offset] = (MatScalar) 1.0;
418:           offset      += 1;

420:           v  += nz;
421:           vi += nz;
422:         }

424:         /* allocate space for the triangular factor information */
425:         PetscNew(&loTriFactor);
426:         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
427:         /* Create the matrix description */
428:         stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
429:         stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
430:        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
431:         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
432:        #else
433:         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
434:        #endif
435:         stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
436:         stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

438:         /* set the operation */
439:         loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

441:         /* set the matrix */
442:         loTriFactor->csrMat = new CsrMatrix;
443:         loTriFactor->csrMat->num_rows = n;
444:         loTriFactor->csrMat->num_cols = n;
445:         loTriFactor->csrMat->num_entries = nzLower;

447:         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
448:         loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

450:         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
451:         loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

453:         loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
454:         loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

456:         /* Create the solve analysis information */
457:         PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);
458:         stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
459:       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
460:         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
461:                                        loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
462:                                        loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
463:                                        loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
464:                                        &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
465:         cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
466:       #endif

468:         /* perform the solve analysis */
469:         stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
470:                                  loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
471:                                  loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
472:                                  loTriFactor->csrMat->column_indices->data().get(),
473:                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
474:                                  loTriFactor->solveInfo,
475:                                  loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
476:                                #else
477:                                  loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
478:                                #endif
479:         cerr = WaitForCUDA();CHKERRCUDA(cerr);
480:         PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);

482:         /* assign the pointer */
483:         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
484:         loTriFactor->AA_h = AALo;
485:         cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
486:         cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
487:         PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));
488:       } else { /* update values only */
489:         if (!loTriFactor->AA_h) {
490:           cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
491:         }
492:         /* Fill the lower triangular matrix */
493:         loTriFactor->AA_h[0]  = 1.0;
494:         v        = aa;
495:         vi       = aj;
496:         offset   = 1;
497:         for (i=1; i<n; i++) {
498:           nz = ai[i+1] - ai[i];
499:           PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);
500:           offset      += nz;
501:           loTriFactor->AA_h[offset] = 1.0;
502:           offset      += 1;
503:           v  += nz;
504:         }
505:         loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
506:         PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));
507:       }
508:     } catch(char *ex) {
509:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
510:     }
511:   }
512:   return(0);
513: }

515: static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
516: {
517:   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
518:   PetscInt                          n = A->rmap->n;
519:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
520:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
521:   cusparseStatus_t                  stat;
522:   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
523:   const MatScalar                   *aa = a->a,*v;
524:   PetscInt                          *AiUp, *AjUp;
525:   PetscInt                          i,nz, nzUpper, offset;
526:   PetscErrorCode                    ierr;
527:   cudaError_t                       cerr;

530:   if (!n) return(0);
531:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
532:     try {
533:       /* figure out the number of nonzeros in the upper triangular matrix. */
534:       nzUpper = adiag[0]-adiag[n];
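      /* adiag[i] locates the diagonal of row i of U, and the rows of U are stored contiguously
         between positions adiag[n] and adiag[0], so this difference counts every upper triangular
         entry, diagonals included. */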
535:       if (!upTriFactor) {
536:         PetscScalar *AAUp;

538:         cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

540:         /* Allocate Space for the upper triangular matrix */
541:         cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
542:         cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

544:         /* Fill the upper triangular matrix */
545:         AiUp[0]=(PetscInt) 0;
546:         AiUp[n]=nzUpper;
547:         offset = nzUpper;
548:         for (i=n-1; i>=0; i--) {
549:           v  = aa + adiag[i+1] + 1;
550:           vi = aj + adiag[i+1] + 1;

552:           /* number of elements NOT on the diagonal */
553:           nz = adiag[i] - adiag[i+1]-1;

555:           /* decrement the offset */
556:           offset -= (nz+1);

558:           /* first, set the diagonal elements */
559:           AjUp[offset] = (PetscInt) i;
560:           AAUp[offset] = (MatScalar)1./v[nz];
561:           AiUp[i]      = AiUp[i+1] - (nz+1);

563:           PetscArraycpy(&(AjUp[offset+1]), vi, nz);
564:           PetscArraycpy(&(AAUp[offset+1]), v, nz);
565:         }

567:         /* allocate space for the triangular factor information */
568:         PetscNew(&upTriFactor);
569:         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

571:         /* Create the matrix description */
572:         stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
573:         stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
574:        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
575:         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
576:        #else
577:         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
578:        #endif
579:         stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
580:         stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

582:         /* set the operation */
583:         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

585:         /* set the matrix */
586:         upTriFactor->csrMat = new CsrMatrix;
587:         upTriFactor->csrMat->num_rows = n;
588:         upTriFactor->csrMat->num_cols = n;
589:         upTriFactor->csrMat->num_entries = nzUpper;

591:         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
592:         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

594:         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
595:         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

597:         upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
598:         upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

600:         /* Create the solve analysis information */
601:         PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);
602:         stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
603:       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
604:         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
605:                                      upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
606:                                      upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
607:                                      upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
608:                                      &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
609:         cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
610:       #endif

612:         /* perform the solve analysis */
613:         stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
614:                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
615:                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
616:                                  upTriFactor->csrMat->column_indices->data().get(),
617:                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
618:                                  upTriFactor->solveInfo,
619:                                  upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
620:                                #else
621:                                  upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
622:                                #endif
623:         cerr = WaitForCUDA();CHKERRCUDA(cerr);
624:         PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);

626:         /* assign the pointer */
627:         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
628:         upTriFactor->AA_h = AAUp;
629:         cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
630:         cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
631:         PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));
632:       } else {
633:         if (!upTriFactor->AA_h) {
634:           cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
635:         }
636:         /* Fill the upper triangular matrix */
637:         offset = nzUpper;
638:         for (i=n-1; i>=0; i--) {
639:           v  = aa + adiag[i+1] + 1;

641:           /* number of elements NOT on the diagonal */
642:           nz = adiag[i] - adiag[i+1]-1;

644:           /* decrement the offset */
645:           offset -= (nz+1);

647:           /* first, set the diagonal elements */
648:           upTriFactor->AA_h[offset] = 1./v[nz];
649:           PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);
650:         }
651:         upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
652:         PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));
653:       }
654:     } catch(char *ex) {
655:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
656:     }
657:   }
658:   return(0);
659: }

661: static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
662: {
663:   PetscErrorCode               ierr;
664:   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
665:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
666:   IS                           isrow = a->row,iscol = a->icol;
667:   PetscBool                    row_identity,col_identity;
668:   PetscInt                     n = A->rmap->n;

671:   if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
672:   MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);
673:   MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);

675:   if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
676:   cusparseTriFactors->nnz=a->nz;

678:   A->offloadmask = PETSC_OFFLOAD_BOTH;
679:   /* lower triangular indices */
680:   ISIdentity(isrow,&row_identity);
681:   if (!row_identity && !cusparseTriFactors->rpermIndices) {
682:     const PetscInt *r;

684:     ISGetIndices(isrow,&r);
685:     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
686:     cusparseTriFactors->rpermIndices->assign(r, r+n);
687:     ISRestoreIndices(isrow,&r);
688:     PetscLogCpuToGpu(n*sizeof(PetscInt));
689:   }

691:   /* upper triangular indices */
692:   ISIdentity(iscol,&col_identity);
693:   if (!col_identity && !cusparseTriFactors->cpermIndices) {
694:     const PetscInt *c;

696:     ISGetIndices(iscol,&c);
697:     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
698:     cusparseTriFactors->cpermIndices->assign(c, c+n);
699:     ISRestoreIndices(iscol,&c);
700:     PetscLogCpuToGpu(n*sizeof(PetscInt));
701:   }
702:   return(0);
703: }

705: static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
706: {
707:   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
708:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
709:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
710:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
711:   cusparseStatus_t                  stat;
712:   PetscErrorCode                    ierr;
713:   cudaError_t                       cerr;
714:   PetscInt                          *AiUp, *AjUp;
715:   PetscScalar                       *AAUp;
716:   PetscScalar                       *AALo;
717:   PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
718:   Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data; /* the ICC/Cholesky factor stores its data in SBAIJ form */
719:   const PetscInt                    *ai = b->i,*aj = b->j,*vj;
720:   const MatScalar                   *aa = b->a,*v;

723:   if (!n) return(0);
724:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
725:     try {
726:       cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
727:       cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
728:       if (!upTriFactor && !loTriFactor) {
729:         /* Allocate Space for the upper triangular matrix */
730:         cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
731:         cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

733:         /* Fill the upper triangular matrix */
734:         AiUp[0]=(PetscInt) 0;
735:         AiUp[n]=nzUpper;
736:         offset = 0;
737:         for (i=0; i<n; i++) {
738:           /* set the pointers */
739:           v  = aa + ai[i];
740:           vj = aj + ai[i];
741:           nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

743:           /* first, set the diagonal elements */
744:           AjUp[offset] = (PetscInt) i;
745:           AAUp[offset] = (MatScalar)1.0/v[nz];
746:           AiUp[i]      = offset;
747:           AALo[offset] = (MatScalar)1.0/v[nz];

749:           offset+=1;
750:           if (nz>0) {
751:             PetscArraycpy(&(AjUp[offset]), vj, nz);
752:             PetscArraycpy(&(AAUp[offset]), v, nz);
753:             for (j=offset; j<offset+nz; j++) {
754:               AAUp[j] = -AAUp[j];
755:               AALo[j] = AAUp[j]/v[nz];
756:             }
757:             offset+=nz;
758:           }
759:         }

761:         /* allocate space for the triangular factor information */
762:         PetscNew(&upTriFactor);
763:         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

765:         /* Create the matrix description */
766:         stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
767:         stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
768:        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
769:         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
770:        #else
771:         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
772:        #endif
773:         stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
774:         stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

776:         /* set the matrix */
777:         upTriFactor->csrMat = new CsrMatrix;
778:         upTriFactor->csrMat->num_rows = A->rmap->n;
779:         upTriFactor->csrMat->num_cols = A->cmap->n;
780:         upTriFactor->csrMat->num_entries = a->nz;

782:         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
783:         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

785:         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
786:         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

788:         upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
789:         upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

791:         /* set the operation */
792:         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

794:         /* Create the solve analysis information */
795:         PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);
796:         stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
797:       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
798:         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
799:                                        upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
800:                                        upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
801:                                        upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
802:                                        &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
803:         cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
804:       #endif

806:         /* perform the solve analysis */
807:         stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
808:                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
809:                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
810:                                  upTriFactor->csrMat->column_indices->data().get(),
811:                                 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
812:                                  upTriFactor->solveInfo,
813:                                  upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
814:                                 #else
815:                                   upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
816:                                 #endif
817:         cerr = WaitForCUDA();CHKERRCUDA(cerr);
818:         PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);

820:         /* assign the pointer */
821:         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

823:         /* allocate space for the triangular factor information */
824:         PetscNew(&loTriFactor);
825:         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

827:         /* Create the matrix description */
828:         stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
829:         stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
830:        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
831:         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
832:        #else
833:         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
834:        #endif
835:         stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
836:         stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

838:         /* set the operation: the lower factor shares the upper factor's CSR pattern (AiUp/AjUp), so it is solved with the transpose operation */
839:         loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

841:         /* set the matrix */
842:         loTriFactor->csrMat = new CsrMatrix;
843:         loTriFactor->csrMat->num_rows = A->rmap->n;
844:         loTriFactor->csrMat->num_cols = A->cmap->n;
845:         loTriFactor->csrMat->num_entries = a->nz;

847:         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
848:         loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

850:         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
851:         loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

853:         loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
854:         loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

856:         /* Create the solve analysis information */
857:         PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);
858:         stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
859:       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
860:         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
861:                                        loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
862:                                        loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
863:                                        loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
864:                                        &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
865:         cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
866:       #endif

868:         /* perform the solve analysis */
869:         stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
870:                                  loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
871:                                  loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
872:                                  loTriFactor->csrMat->column_indices->data().get(),
873:                                 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
874:                                  loTriFactor->solveInfo,
875:                                  loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
876:                                 #else
877:                                  loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
878:                                 #endif
879:         cerr = WaitForCUDA();CHKERRCUDA(cerr);
880:         PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);

882:         /* assign the pointer */
883:         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

885:         PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));
886:         cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
887:         cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
888:       } else {
889:         /* Fill the upper triangular matrix */
890:         offset = 0;
891:         for (i=0; i<n; i++) {
892:           /* set the pointers */
893:           v  = aa + ai[i];
894:           nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

896:           /* first, set the diagonal elements */
897:           AAUp[offset] = 1.0/v[nz];
898:           AALo[offset] = 1.0/v[nz];

900:           offset+=1;
901:           if (nz>0) {
902:             PetscArraycpy(&(AAUp[offset]), v, nz);
903:             for (j=offset; j<offset+nz; j++) {
904:               AAUp[j] = -AAUp[j];
905:               AALo[j] = AAUp[j]/v[nz];
906:             }
907:             offset+=nz;
908:           }
909:         }
910:         if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
911:         if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
912:         upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
913:         loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
914:         PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));
915:       }
916:       cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
917:       cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
918:     } catch(char *ex) {
919:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
920:     }
921:   }
922:   return(0);
923: }

925: static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
926: {
927:   PetscErrorCode               ierr;
928:   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
929:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
930:   IS                           ip = a->row;
931:   PetscBool                    perm_identity;
932:   PetscInt                     n = A->rmap->n;

935:   if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
936:   MatSeqAIJCUSPARSEBuildICCTriMatrices(A);
937:   if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
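  /* the two triangular factors share one sparsity pattern: count the strictly triangular entries
     once per factor and the n diagonal entries once */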
938:   cusparseTriFactors->nnz=(a->nz-n)*2 + n;

940:   A->offloadmask = PETSC_OFFLOAD_BOTH;

942:   /* lower triangular indices */
943:   ISIdentity(ip,&perm_identity);
944:   if (!perm_identity) {
945:     IS             iip;
946:     const PetscInt *irip,*rip;

948:     ISInvertPermutation(ip,PETSC_DECIDE,&iip);
949:     ISGetIndices(iip,&irip);
950:     ISGetIndices(ip,&rip);
951:     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
952:     cusparseTriFactors->rpermIndices->assign(rip, rip+n);
953:     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
954:     cusparseTriFactors->cpermIndices->assign(irip, irip+n);
955:     ISRestoreIndices(iip,&irip);
956:     ISDestroy(&iip);
957:     ISRestoreIndices(ip,&rip);
958:     PetscLogCpuToGpu(2.*n*sizeof(PetscInt));
959:   }
960:   return(0);
961: }

963: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
964: {
965:   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
966:   IS             ip = b->row;
967:   PetscBool      perm_identity;

971:   MatSeqAIJCUSPARSECopyFromGPU(A);
972:   MatCholeskyFactorNumeric_SeqAIJ(B,A,info);
973:   B->offloadmask = PETSC_OFFLOAD_CPU;
974:   /* determine which version of MatSolve needs to be used. */
975:   ISIdentity(ip,&perm_identity);
976:   if (perm_identity) {
977:     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
978:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
979:     B->ops->matsolve = NULL;
980:     B->ops->matsolvetranspose = NULL;
981:   } else {
982:     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
983:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
984:     B->ops->matsolve = NULL;
985:     B->ops->matsolvetranspose = NULL;
986:   }

988:   /* get the triangular factors */
989:   MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);
990:   return(0);
991: }

993: static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
994: {
995:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
996:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
997:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
998:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
999:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
1000:   cusparseStatus_t                  stat;
1001:   cusparseIndexBase_t               indexBase;
1002:   cusparseMatrixType_t              matrixType;
1003:   cusparseFillMode_t                fillMode;
1004:   cusparseDiagType_t                diagType;
1005:   cudaError_t                       cerr;
1006:   PetscErrorCode                    ierr;

1009:   /* allocate space for the transpose of the lower triangular factor */
1010:   PetscNew(&loTriFactorT);
1011:   loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

1013:   /* set the matrix descriptors of the lower triangular factor */
1014:   matrixType = cusparseGetMatType(loTriFactor->descr);
1015:   indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
1016:   fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1017:     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1018:   diagType = cusparseGetMatDiagType(loTriFactor->descr);

1020:   /* Create the matrix description */
1021:   stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
1022:   stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
1023:   stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
1024:   stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
1025:   stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

1027:   /* set the operation */
1028:   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

1030:   /* allocate GPU space for the CSC of the lower triangular factor */
1031:   loTriFactorT->csrMat = new CsrMatrix;
1032:   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
1033:   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
1034:   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
1035:   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
1036:   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
1037:   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

1039:   /* compute the transpose of the lower triangular factor, i.e. the CSC */
1040: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1041:   stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
1042:                                        loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
1043:                                        loTriFactor->csrMat->values->data().get(),
1044:                                        loTriFactor->csrMat->row_offsets->data().get(),
1045:                                        loTriFactor->csrMat->column_indices->data().get(),
1046:                                        loTriFactorT->csrMat->values->data().get(),
1047:                                        loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1048:                                        CUSPARSE_ACTION_NUMERIC,indexBase,
1049:                                        CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1050:   cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
1051: #endif

1053:   PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1054:   stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
1055:                           loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
1056:                           loTriFactor->csrMat->values->data().get(),
1057:                           loTriFactor->csrMat->row_offsets->data().get(),
1058:                           loTriFactor->csrMat->column_indices->data().get(),
1059:                           loTriFactorT->csrMat->values->data().get(),
1060:                         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1061:                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1062:                           CUSPARSE_ACTION_NUMERIC, indexBase,
1063:                           CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat);
1064:                         #else
1065:                           loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1066:                           CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1067:                         #endif
1068:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1069:   PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);

1071:   /* Create the solve analysis information */
1072:   PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);
1073:   stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1074: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1075:   stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
1076:                                 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1077:                                 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1078:                                 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
1079:                                 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
1080:   cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
1081: #endif

1083:   /* perform the solve analysis */
1084:   stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
1085:                            loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1086:                            loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1087:                            loTriFactorT->csrMat->column_indices->data().get(),
1088:                           #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1089:                            loTriFactorT->solveInfo,
1090:                            loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
1091:                           #else
1092:                            loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1093:                           #endif
1094:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1095:   PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);

1097:   /* assign the pointer */
1098:   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

1100:   /*********************************************/
1101:   /* Now the Transpose of the Upper Tri Factor */
1102:   /*********************************************/

1104:   /* allocate space for the transpose of the upper triangular factor */
1105:   PetscNew(&upTriFactorT);
1106:   upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

1108:   /* set the matrix descriptors of the upper triangular factor */
1109:   matrixType = cusparseGetMatType(upTriFactor->descr);
1110:   indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
1111:   fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1112:     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1113:   diagType = cusparseGetMatDiagType(upTriFactor->descr);

1115:   /* Create the matrix description */
1116:   stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
1117:   stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
1118:   stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
1119:   stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
1120:   stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

1122:   /* set the operation */
1123:   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

1125:   /* allocate GPU space for the CSC of the upper triangular factor */
1126:   upTriFactorT->csrMat = new CsrMatrix;
1127:   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
1128:   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
1129:   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
1130:   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
1131:   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1132:   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

1134:   /* compute the transpose of the upper triangular factor, i.e. the CSC */
1135: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1136:   stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
1137:                                 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1138:                                 upTriFactor->csrMat->values->data().get(),
1139:                                 upTriFactor->csrMat->row_offsets->data().get(),
1140:                                 upTriFactor->csrMat->column_indices->data().get(),
1141:                                 upTriFactorT->csrMat->values->data().get(),
1142:                                 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1143:                                 CUSPARSE_ACTION_NUMERIC,indexBase,
1144:                                 CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1145:   cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
1146: #endif

1148:   PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1149:   stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
1150:                           upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1151:                           upTriFactor->csrMat->values->data().get(),
1152:                           upTriFactor->csrMat->row_offsets->data().get(),
1153:                           upTriFactor->csrMat->column_indices->data().get(),
1154:                           upTriFactorT->csrMat->values->data().get(),
1155:                         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1156:                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1157:                           CUSPARSE_ACTION_NUMERIC, indexBase,
1158:                           CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat);
1159:                         #else
1160:                           upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1161:                           CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1162:                         #endif

1164:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1165:   PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);

1167:   /* Create the solve analysis information */
1168:   PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);
1169:   stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1170: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1171:   stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
1172:                                  upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1173:                                  upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1174:                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
1175:                                  &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
1176:   cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
1177: #endif

1179:   /* perform the solve analysis */
1180:   stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
1181:                            upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1182:                            upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1183:                            upTriFactorT->csrMat->column_indices->data().get(),
1184:                           #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1185:                            upTriFactorT->solveInfo,
1186:                            upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
1187:                           #else
1188:                            upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1189:                           #endif

1191:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1192:   PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);

1194:   /* assign the pointer */
1195:   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
1196:   return(0);
1197: }

1199: struct PetscScalarToPetscInt
1200: {
1201:   __host__ __device__
1202:   PetscInt operator()(PetscScalar s)
1203:   {
1204:     return (PetscInt)PetscRealPart(s);
1205:   }
1206: };
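
  /* A unary functor for thrust::transform(): truncate the real part of a PetscScalar to a
     PetscInt. It is used in MatSeqAIJCUSPARSEFormExplicitTransposeForMult() below to turn a
     permutation that csr2csc stored in a PetscScalar array into integer indices, as in

       thrust::transform(thrust::device,vals->begin(),vals->end(),idx->begin(),PetscScalarToPetscInt());
  */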

1208: static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTransposeForMult(Mat A)
1209: {
1210:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1211:   Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
1212:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1213:   cusparseStatus_t             stat;
1214:   cusparseIndexBase_t          indexBase;
1215:   cudaError_t                  err;
1216:   PetscErrorCode               ierr;

1219:   if (!A->form_explicit_transpose || !A->rmap->n || !A->cmap->n) return(0);
1220:   MatSeqAIJCUSPARSECopyToGPU(A);
1221:   matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1222:   if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct");
1223:   matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1224:   if (A->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct");
1225:   if (A->transupdated) return(0);
1226:   PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1227:   PetscLogGpuTimeBegin();
1228:   if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
1229:     MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);
1230:   }
1231:   if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
1232:     matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
1233:     stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
1234:     indexBase = cusparseGetMatIndexBase(matstruct->descr);
1235:     stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
1236:     stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

1238:     /* set alpha and beta */
1239:     err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
1240:     err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
1241:     err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1242:     err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1243:     err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1244:     err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);

1246:     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1247:       CsrMatrix *matrixT = new CsrMatrix;
1248:       matstructT->mat = matrixT;
1249:       matrixT->num_rows = A->cmap->n;
1250:       matrixT->num_cols = A->rmap->n;
1251:       matrixT->num_entries = a->nz;
1252:       matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
1253:       matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1254:       matrixT->values = new THRUSTARRAY(a->nz);

1256:       if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
1257:       cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);

1259:      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1260:       stat = cusparseCreateCsr(&matstructT->matDescr,
1261:                                matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
1262:                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
1263:                                matrixT->values->data().get(),
1264:                                CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
1265:                                indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
1266:      #endif
1267:     } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
1268:    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1269:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1270:    #else
1271:       CsrMatrix *temp  = new CsrMatrix;
1272:       CsrMatrix *tempT = new CsrMatrix;
1273:       /* First convert HYB to CSR */
1274:       temp->num_rows = A->rmap->n;
1275:       temp->num_cols = A->cmap->n;
1276:       temp->num_entries = a->nz;
1277:       temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1278:       temp->column_indices = new THRUSTINTARRAY32(a->nz);
1279:       temp->values = new THRUSTARRAY(a->nz);

1281:       stat = cusparse_hyb2csr(cusparsestruct->handle,
1282:                               matstruct->descr, (cusparseHybMat_t)matstruct->mat,
1283:                               temp->values->data().get(),
1284:                               temp->row_offsets->data().get(),
1285:                               temp->column_indices->data().get());CHKERRCUSPARSE(stat);

1287:       /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1288:       tempT->num_rows = A->cmap->n; /* the transpose of an m x n matrix is n x m */
1289:       tempT->num_cols = A->rmap->n;
1290:       tempT->num_entries = a->nz;
1291:       tempT->row_offsets = new THRUSTINTARRAY32(tempT->num_rows+1);
1292:       tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1293:       tempT->values = new THRUSTARRAY(a->nz);

1295:       stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
1296:                               temp->num_cols, temp->num_entries,
1297:                               temp->values->data().get(),
1298:                               temp->row_offsets->data().get(),
1299:                               temp->column_indices->data().get(),
1300:                               tempT->values->data().get(),
1301:                               tempT->column_indices->data().get(),
1302:                               tempT->row_offsets->data().get(),
1303:                               CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

1305:       /* Last, convert CSC to HYB */
1306:       cusparseHybMat_t hybMat;
1307:       stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1308:       cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1309:         CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1310:       stat = cusparse_csr2hyb(cusparsestruct->handle, A->cmap->n, A->rmap->n, /* dimensions of the transpose */
1311:                               matstructT->descr, tempT->values->data().get(),
1312:                               tempT->row_offsets->data().get(),
1313:                               tempT->column_indices->data().get(),
1314:                               hybMat, 0, partition);CHKERRCUSPARSE(stat);

1316:       /* assign the pointer */
1317:       matstructT->mat = hybMat;
1318:       A->transupdated = PETSC_TRUE;
1319:       /* delete temporaries */
1320:       if (tempT) {
1321:         if (tempT->values) delete (THRUSTARRAY*) tempT->values;
1322:         if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
1323:         if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
1324:         delete (CsrMatrix*) tempT;
1325:       }
1326:       if (temp) {
1327:         if (temp->values) delete (THRUSTARRAY*) temp->values;
1328:         if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
1329:         if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
1330:         delete (CsrMatrix*) temp;
1331:       }
1332:      #endif
1333:     }
1334:   }
1335:   if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* the transpose mat struct may already be present; update its data */
1336:     CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
1337:     CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
1338:     if (!matrix) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix");
1339:     if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows");
1340:     if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols");
1341:     if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values");
1342:     if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT");
1343:     if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows");
1344:     if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols");
1345:     if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values");
1346:     if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
1347:       cusparsestruct->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
1348:       cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
1349:       PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));
1350:     }
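    /* The block below caches the csr2csc permutation so that later value-only updates of the
       transpose can skip cusparse csr2csc entirely. The trick: run csr2csc once on the
       sequence 0,1,...,nnz-1 stored as PetscScalar values; the transposed values then hold,
       for each CSC entry, the index of its source entry in the CSR values array. Converting
       them to PetscInt with PetscScalarToPetscInt() yields csr2csc_i, and every subsequent
       update reduces to the gather

          matrixT->values[k] = matrix->values[csr2csc_i[k]];

       performed by the thrust::copy over permutation iterators at the end of this scope. */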
1351:     if (!cusparsestruct->csr2csc_i) {
1352:       THRUSTARRAY csr2csc_a(matrix->num_entries);
1353:       PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

1355:       indexBase = cusparseGetMatIndexBase(matstruct->descr);
1356:      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1357:       void   *csr2cscBuffer;
1358:       size_t csr2cscBufferSize;
1359:       stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
1360:                                            A->cmap->n, matrix->num_entries,
1361:                                            matrix->values->data().get(),
1362:                                            cusparsestruct->rowoffsets_gpu->data().get(),
1363:                                            matrix->column_indices->data().get(),
1364:                                            matrixT->values->data().get(),
1365:                                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1366:                                            CUSPARSE_ACTION_NUMERIC,indexBase,
1367:                                            cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat);
1368:       err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err);
1369:      #endif

1371:       if (matrix->num_entries) {
1372:         /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
1373:            mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11, though CUDA-10 is fine.
1374:            I checked every parameter and they were all fine; I have no clue why cusparse complains.

1376:            Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
1377:            should be filled with indexBase, so we simply do that in the else branch below
1378:            instead of calling csr2csc. */
1379:         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
1380:                               A->cmap->n,matrix->num_entries,
1381:                               csr2csc_a.data().get(),
1382:                               cusparsestruct->rowoffsets_gpu->data().get(),
1383:                               matrix->column_indices->data().get(),
1384:                               matrixT->values->data().get(),
1385:                              #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1386:                               matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1387:                               CUSPARSE_ACTION_NUMERIC,indexBase,
1388:                               cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat);
1389:                              #else
1390:                               matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
1391:                               CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1392:                              #endif
1393:       } else {
1394:         matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
1395:       }

1397:       cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
1398:       PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
1399:      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1400:       err = cudaFree(csr2cscBuffer);CHKERRCUDA(err);
1401:      #endif
1402:     }
1403:     PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
1404:                                                      thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
1405:                                                      matrixT->values->begin()));
1406:   }
1407:   PetscLogGpuTimeEnd();
1408:   PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1409:   /* the compressed row indices are not used for matTranspose */
1410:   matstructT->cprowIndices = NULL;
1411:   /* assign the pointer */
1412:   ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
1413:   A->transupdated = PETSC_TRUE;
1414:   return(0);
1415: }
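
/* A note on how the routine above is reached: in the MATPRODUCT_AtB paths below, when
   A->form_explicit_transpose is set (typically via
   MatSetOption(A,MAT_FORM_EXPLICIT_TRANSPOSE,PETSC_TRUE)), A^T is built once here and cached,
   and the product is then run as a non-transpose SpMM on the cached transpose, which cusparse
   generally handles faster than op(A) = CUSPARSE_OPERATION_TRANSPOSE. */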

1417: /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
1418: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1419: {
1420:   PetscInt                              n = xx->map->n;
1421:   const PetscScalar                     *barray;
1422:   PetscScalar                           *xarray;
1423:   thrust::device_ptr<const PetscScalar> bGPU;
1424:   thrust::device_ptr<PetscScalar>       xGPU;
1425:   cusparseStatus_t                      stat;
1426:   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1427:   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1428:   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1429:   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1430:   PetscErrorCode                        ierr;

1433:   /* Analyze the matrix and create the transpose ... on the fly */
1434:   if (!loTriFactorT && !upTriFactorT) {
1435:     MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);
1436:     loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1437:     upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1438:   }

1440:   /* Get the GPU pointers */
1441:   VecCUDAGetArrayWrite(xx,&xarray);
1442:   VecCUDAGetArrayRead(bb,&barray);
1443:   xGPU = thrust::device_pointer_cast(xarray);
1444:   bGPU = thrust::device_pointer_cast(barray);

1446:   PetscLogGpuTimeBegin();
1447:   /* First, reorder with the row permutation */
1448:   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1449:                thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
1450:                xGPU);

1452:   /* First, solve U: A = LU implies A^T = U^T L^T, so the transposed system applies U^T first, then L^T */
1453:   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1454:                         upTriFactorT->csrMat->num_rows,
1455:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1456:                         upTriFactorT->csrMat->num_entries,
1457:                       #endif
1458:                         &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1459:                         upTriFactorT->csrMat->values->data().get(),
1460:                         upTriFactorT->csrMat->row_offsets->data().get(),
1461:                         upTriFactorT->csrMat->column_indices->data().get(),
1462:                         upTriFactorT->solveInfo,
1463:                         xarray,
1464:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1465:                         tempGPU->data().get(),
1466:                         upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
1467:                       #else
1468:                         tempGPU->data().get());CHKERRCUSPARSE(stat);
1469:                       #endif

1471:   /* Then, solve L */
1472:   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1473:                         loTriFactorT->csrMat->num_rows,
1474:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1475:                         loTriFactorT->csrMat->num_entries,
1476:                       #endif
1477:                         &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1478:                         loTriFactorT->csrMat->values->data().get(),
1479:                         loTriFactorT->csrMat->row_offsets->data().get(),
1480:                         loTriFactorT->csrMat->column_indices->data().get(),
1481:                         loTriFactorT->solveInfo,
1482:                         tempGPU->data().get(),
1483:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1484:                         xarray,
1485:                         loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
1486:                       #else
1487:                          xarray);CHKERRCUSPARSE(stat);
1488:                       #endif

1490:   /* Last, scatter the solution, xGPU, into a temporary using the column permutation; the permutation cannot be applied in place. */
1491:   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
1492:                thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
1493:                tempGPU->begin());

1495:   /* Copy the temporary to the full solution. */
1496:   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);

1498:   /* restore */
1499:   VecCUDARestoreArrayRead(bb,&barray);
1500:   VecCUDARestoreArrayWrite(xx,&xarray);
1501:   PetscLogGpuTimeEnd();
1502:   PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1503:   return(0);
1504: }

1506: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1507: {
1508:   const PetscScalar                 *barray;
1509:   PetscScalar                       *xarray;
1510:   cusparseStatus_t                  stat;
1511:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1512:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1513:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1514:   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1515:   PetscErrorCode                    ierr;

1518:   /* Analyze the matrix and create the transpose ... on the fly */
1519:   if (!loTriFactorT && !upTriFactorT) {
1520:     MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);
1521:     loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1522:     upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1523:   }

1525:   /* Get the GPU pointers */
1526:   VecCUDAGetArrayWrite(xx,&xarray);
1527:   VecCUDAGetArrayRead(bb,&barray);

1529:   PetscLogGpuTimeBegin();
1530:   /* First, solve U (as above, A^T = U^T L^T, so U^T comes first) */
1531:   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1532:                         upTriFactorT->csrMat->num_rows,
1533:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1534:                         upTriFactorT->csrMat->num_entries,
1535:                       #endif
1536:                         &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1537:                         upTriFactorT->csrMat->values->data().get(),
1538:                         upTriFactorT->csrMat->row_offsets->data().get(),
1539:                         upTriFactorT->csrMat->column_indices->data().get(),
1540:                         upTriFactorT->solveInfo,
1541:                         barray,
1542:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1543:                         tempGPU->data().get(),
1544:                         upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
1545:                       #else
1546:                         tempGPU->data().get());CHKERRCUSPARSE(stat);
1547:                       #endif

1549:   /* Then, solve L */
1550:   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1551:                         loTriFactorT->csrMat->num_rows,
1552:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1553:                         loTriFactorT->csrMat->num_entries,
1554:                       #endif
1555:                         &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1556:                         loTriFactorT->csrMat->values->data().get(),
1557:                         loTriFactorT->csrMat->row_offsets->data().get(),
1558:                         loTriFactorT->csrMat->column_indices->data().get(),
1559:                         loTriFactorT->solveInfo,
1560:                         tempGPU->data().get(),
1561:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1562:                         xarray,
1563:                         loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
1564:                       #else
1565:                         xarray);CHKERRCUSPARSE(stat);
1566:                       #endif

1568:   /* restore */
1569:   VecCUDARestoreArrayRead(bb,&barray);
1570:   VecCUDARestoreArrayWrite(xx,&xarray);
1571:   PetscLogGpuTimeEnd();
1572:   PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1573:   return(0);
1574: }
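
/* The natural-ordering variant above differs from MatSolveTranspose_SeqAIJCUSPARSE only in
   that no row/column permutations are needed: b feeds the U^T solve directly and the L^T
   solve writes directly into x, saving the three thrust::copy permutation passes. */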

1576: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1577: {
1578:   const PetscScalar                     *barray;
1579:   PetscScalar                           *xarray;
1580:   thrust::device_ptr<const PetscScalar> bGPU;
1581:   thrust::device_ptr<PetscScalar>       xGPU;
1582:   cusparseStatus_t                      stat;
1583:   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1584:   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1585:   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1586:   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1587:   PetscErrorCode                        ierr;

1591:   /* Get the GPU pointers */
1592:   VecCUDAGetArrayWrite(xx,&xarray);
1593:   VecCUDAGetArrayRead(bb,&barray);
1594:   xGPU = thrust::device_pointer_cast(xarray);
1595:   bGPU = thrust::device_pointer_cast(barray);

1597:   PetscLogGpuTimeBegin();
1598:   /* First, reorder with the row permutation */
1599:   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1600:                thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
1601:                tempGPU->begin());

1603:   /* Next, solve L */
1604:   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1605:                         loTriFactor->csrMat->num_rows,
1606:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1607:                         loTriFactor->csrMat->num_entries,
1608:                       #endif
1609:                         &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1610:                         loTriFactor->csrMat->values->data().get(),
1611:                         loTriFactor->csrMat->row_offsets->data().get(),
1612:                         loTriFactor->csrMat->column_indices->data().get(),
1613:                         loTriFactor->solveInfo,
1614:                         tempGPU->data().get(),
1615:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1616:                          xarray,
1617:                          loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
1618:                       #else
1619:                          xarray);CHKERRCUSPARSE(stat);
1620:                       #endif

1622:   /* Then, solve U */
1623:   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1624:                         upTriFactor->csrMat->num_rows,
1625:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1626:                         upTriFactor->csrMat->num_entries,
1627:                       #endif
1628:                         &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1629:                         upTriFactor->csrMat->values->data().get(),
1630:                         upTriFactor->csrMat->row_offsets->data().get(),
1631:                         upTriFactor->csrMat->column_indices->data().get(),
1632:                         upTriFactor->solveInfo,xarray,
1633:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1634:                         tempGPU->data().get(),
1635:                         upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
1636:                       #else
1637:                         tempGPU->data().get());CHKERRCUSPARSE(stat);
1638:                       #endif

1640:   /* Last, reorder with the column permutation */
1641:   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
1642:                thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
1643:                xGPU);

1645:   VecCUDARestoreArrayRead(bb,&barray);
1646:   VecCUDARestoreArrayWrite(xx,&xarray);
1647:   PetscLogGpuTimeEnd();
1648:   PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1649:   return(0);
1650: }

1652: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1653: {
1654:   const PetscScalar                 *barray;
1655:   PetscScalar                       *xarray;
1656:   cusparseStatus_t                  stat;
1657:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1658:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1659:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1660:   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1661:   PetscErrorCode                    ierr;

1664:   /* Get the GPU pointers */
1665:   VecCUDAGetArrayWrite(xx,&xarray);
1666:   VecCUDAGetArrayRead(bb,&barray);

1668:   PetscLogGpuTimeBegin();
1669:   /* First, solve L */
1670:   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1671:                         loTriFactor->csrMat->num_rows,
1672:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1673:                         loTriFactor->csrMat->num_entries,
1674:                       #endif
1675:                         &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1676:                         loTriFactor->csrMat->values->data().get(),
1677:                         loTriFactor->csrMat->row_offsets->data().get(),
1678:                         loTriFactor->csrMat->column_indices->data().get(),
1679:                         loTriFactor->solveInfo,
1680:                         barray,
1681:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1682:                         tempGPU->data().get(),
1683:                         loTriFactor->solvePolicy,loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
1684:                       #else
1685:                         tempGPU->data().get());CHKERRCUSPARSE(stat);
1686:                       #endif

1688:   /* Next, solve U */
1689:   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1690:                         upTriFactor->csrMat->num_rows,
1691:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1692:                         upTriFactor->csrMat->num_entries,
1693:                       #endif
1694:                         &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1695:                         upTriFactor->csrMat->values->data().get(),
1696:                         upTriFactor->csrMat->row_offsets->data().get(),
1697:                         upTriFactor->csrMat->column_indices->data().get(),
1698:                         upTriFactor->solveInfo,
1699:                         tempGPU->data().get(),
1700:                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1701:                         xarray,
1702:                         upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
1703:                       #else
1704:                         xarray);CHKERRCUSPARSE(stat);
1705:                       #endif

1707:   VecCUDARestoreArrayRead(bb,&barray);
1708:   VecCUDARestoreArrayWrite(xx,&xarray);
1709:   PetscLogGpuTimeEnd();
1710:   PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1711:   return(0);
1712: }

1714: static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
1715: {
1716:   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
1717:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
1718:   cudaError_t        cerr;
1719:   PetscErrorCode     ierr;

1722:   if (A->offloadmask == PETSC_OFFLOAD_GPU) {
1723:     CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat;

1725:     PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);
1726:     cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
1727:     cerr = WaitForCUDA();CHKERRCUDA(cerr);
1728:     PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));
1729:     PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);
1730:     A->offloadmask = PETSC_OFFLOAD_BOTH;
1731:   }
1732:   return(0);
1733: }

1735: static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
1736: {
1737:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;

1741:   MatSeqAIJCUSPARSECopyFromGPU(A);
1742:   *array = a->a;
1743:   A->offloadmask = PETSC_OFFLOAD_CPU;
1744:   return(0);
1745: }
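
/* Note that MatSeqAIJGetArray_SeqAIJCUSPARSE above marks the matrix PETSC_OFFLOAD_CPU: the
   caller gets a writable pointer to the host values, so the GPU copy must be treated as
   stale and MatSeqAIJCUSPARSECopyToGPU() will re-upload the values on the next device use. */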

1747: PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1748: {
1749:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1750:   Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
1751:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1752:   PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
1753:   PetscErrorCode               ierr;
1754:   cusparseStatus_t             stat;
1755:   PetscBool                    both = PETSC_TRUE;
1756:   cudaError_t                  err;

1759:   if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU");
1760:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
1761:     if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
1762:       CsrMatrix *matrix;
1763:       matrix = (CsrMatrix*)cusparsestruct->mat->mat;

1765:       if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values");
1766:       PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);
1767:       matrix->values->assign(a->a, a->a+a->nz);
1768:       err  = WaitForCUDA();CHKERRCUDA(err);
1769:       PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));
1770:       PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);
1771:       MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);
1772:     } else {
1773:       PetscInt nnz;
1774:       PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);
1775:       MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);
1776:       MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);
1777:       delete cusparsestruct->workVector;
1778:       delete cusparsestruct->rowoffsets_gpu;
1779:       cusparsestruct->workVector = NULL;
1780:       cusparsestruct->rowoffsets_gpu = NULL;
1781:       try {
1782:         if (a->compressedrow.use) {
1783:           m    = a->compressedrow.nrows;
1784:           ii   = a->compressedrow.i;
1785:           ridx = a->compressedrow.rindex;
1786:         } else {
1787:           m    = A->rmap->n;
1788:           ii   = a->i;
1789:           ridx = NULL;
1790:         }
1791:         if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data");
1792:         if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data");
1793:         if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
1794:         else nnz = a->nz;

1796:         /* create cusparse matrix */
1797:         cusparsestruct->nrows = m;
1798:         matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1799:         stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
1800:         stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
1801:         stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

1803:         err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
1804:         err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
1805:         err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1806:         err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1807:         err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1808:         err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1809:         stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

1811:         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1812:         if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1813:           /* set the matrix */
1814:           CsrMatrix *mat= new CsrMatrix;
1815:           mat->num_rows = m;
1816:           mat->num_cols = A->cmap->n;
1817:           mat->num_entries = nnz;
1818:           mat->row_offsets = new THRUSTINTARRAY32(m+1);
1819:           mat->row_offsets->assign(ii, ii + m+1);

1821:           mat->column_indices = new THRUSTINTARRAY32(nnz);
1822:           mat->column_indices->assign(a->j, a->j+nnz);

1824:           mat->values = new THRUSTARRAY(nnz);
1825:           if (a->a) mat->values->assign(a->a, a->a+nnz);

1827:           /* assign the pointer */
1828:           matstruct->mat = mat;
1829:          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1830:           if (mat->num_rows) { /* cusparse errors on empty matrices! */
1831:             stat = cusparseCreateCsr(&matstruct->matDescr,
1832:                                     mat->num_rows, mat->num_cols, mat->num_entries,
1833:                                     mat->row_offsets->data().get(), mat->column_indices->data().get(),
1834:                                     mat->values->data().get(),
1835:                                     CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
1836:                                     CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
1837:           }
1838:          #endif
1839:         } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1840:          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1841:           SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1842:          #else
1843:           CsrMatrix *mat= new CsrMatrix;
1844:           mat->num_rows = m;
1845:           mat->num_cols = A->cmap->n;
1846:           mat->num_entries = nnz;
1847:           mat->row_offsets = new THRUSTINTARRAY32(m+1);
1848:           mat->row_offsets->assign(ii, ii + m+1);

1850:           mat->column_indices = new THRUSTINTARRAY32(nnz);
1851:           mat->column_indices->assign(a->j, a->j+nnz);

1853:           mat->values = new THRUSTARRAY(nnz);
1854:           if (a->a) mat->values->assign(a->a, a->a+nnz);

1856:           cusparseHybMat_t hybMat;
1857:           stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1858:           cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1859:             CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1860:           stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
1861:               matstruct->descr, mat->values->data().get(),
1862:               mat->row_offsets->data().get(),
1863:               mat->column_indices->data().get(),
1864:               hybMat, 0, partition);CHKERRCUSPARSE(stat);
1865:           /* assign the pointer */
1866:           matstruct->mat = hybMat;

1868:           if (mat) {
1869:             if (mat->values) delete (THRUSTARRAY*)mat->values;
1870:             if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
1871:             if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
1872:             delete (CsrMatrix*)mat;
1873:           }
1874:          #endif
1875:         }

1877:         /* assign the compressed row indices */
1878:         if (a->compressedrow.use) {
1879:           cusparsestruct->workVector = new THRUSTARRAY(m);
1880:           matstruct->cprowIndices    = new THRUSTINTARRAY(m);
1881:           matstruct->cprowIndices->assign(ridx,ridx+m);
1882:           tmp = m;
1883:         } else {
1884:           cusparsestruct->workVector = NULL;
1885:           matstruct->cprowIndices    = NULL;
1886:           tmp = 0;
1887:         }
1888:         PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));

1890:         /* assign the pointer */
1891:         cusparsestruct->mat = matstruct;
1892:       } catch(char *ex) {
1893:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1894:       }
1895:       err  = WaitForCUDA();CHKERRCUDA(err);
1896:       PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);
1897:       cusparsestruct->nonzerostate = A->nonzerostate;
1898:     }
1899:     if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
1900:   }
1901:   return(0);
1902: }
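
/* To summarize the routine above: when the nonzero pattern is unchanged
   (A->nonzerostate == cusparsestruct->nonzerostate) and the format is CSR, only the values
   array is re-uploaded; otherwise the entire GPU representation (descriptor, index arrays,
   work vector) is destroyed and rebuilt, and the new nonzerostate is recorded. */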

1904: struct VecCUDAPlusEquals
1905: {
1906:   template <typename Tuple>
1907:   __host__ __device__
1908:   void operator()(Tuple t)
1909:   {
1910:     thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
1911:   }
1912: };

1914: struct VecCUDAEquals
1915: {
1916:   template <typename Tuple>
1917:   __host__ __device__
1918:   void operator()(Tuple t)
1919:   {
1920:     thrust::get<1>(t) = thrust::get<0>(t);
1921:   }
1922: };

1924: struct VecCUDAEqualsReverse
1925: {
1926:   template <typename Tuple>
1927:   __host__ __device__
1928:   void operator()(Tuple t)
1929:   {
1930:     thrust::get<0>(t) = thrust::get<1>(t);
1931:   }
1932: };
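
/* The three functors above are meant for thrust::for_each() over a zip_iterator pairing two
   sequences, e.g. (a sketch, not code from this file):

     thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(x.begin(),y.begin())),
                      thrust::make_zip_iterator(thrust::make_tuple(x.end(),y.end())),
                      VecCUDAPlusEquals());  // y[i] += x[i]

   They are used later in this file to scatter/gather between the compressed-row work vector
   and full-length vectors in the MatMult/MatMultAdd kernels. */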

1934: struct MatMatCusparse {
1935:   PetscBool             cisdense;
1936:   PetscScalar           *Bt;
1937:   Mat                   X;
1938:   PetscBool             reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
1939:   PetscLogDouble        flops;
1940:   CsrMatrix             *Bcsr;

1942: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1943:   cusparseSpMatDescr_t  matSpBDescr;
1944:   PetscBool             initialized;   /* C = alpha op(A) op(B) + beta C */
1945:   cusparseDnMatDescr_t  matBDescr;
1946:   cusparseDnMatDescr_t  matCDescr;
1947:   PetscInt              Blda,Clda; /* Record leading dimensions of B and C here to detect changes */
1948:  #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
1949:   void                  *dBuffer4;
1950:   void                  *dBuffer5;
1951:  #endif
1952:   size_t                mmBufferSize;
1953:   void                  *mmBuffer;
1954:   void                  *mmBuffer2; /* SpGEMM WorkEstimation buffer */
1955:   cusparseSpGEMMDescr_t spgemmDesc;
1956: #endif
1957: };
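
/* MatMatCusparse holds the state that must survive between repeated numeric phases of a
   product: scratch buffers (Bt, mmBuffer, ...), cusparse descriptors, and the intermediate
   dense matrix X used by MATPRODUCT_PtAP/MATPRODUCT_RARt. It is attached to C->product->data
   and freed by MatDestroy_MatMatCusparse() below. */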

1959: static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
1960: {
1961:   PetscErrorCode   ierr;
1962:   MatMatCusparse   *mmdata = (MatMatCusparse *)data;
1963:   cudaError_t      cerr;
1964:  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1965:   cusparseStatus_t stat;
1966:  #endif

1969:   cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr);
1970:   delete mmdata->Bcsr;
1971:  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1972:   if (mmdata->matSpBDescr) { stat = cusparseDestroySpMat(mmdata->matSpBDescr);CHKERRCUSPARSE(stat); }
1973:   if (mmdata->matBDescr)   { stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); }
1974:   if (mmdata->matCDescr)   { stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); }
1975:   if (mmdata->spgemmDesc)  { stat = cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);CHKERRCUSPARSE(stat); }
1976:  #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
1977:   if (mmdata->dBuffer4)  { cerr = cudaFree(mmdata->dBuffer4);CHKERRCUDA(cerr); }
1978:   if (mmdata->dBuffer5)  { cerr = cudaFree(mmdata->dBuffer5);CHKERRCUDA(cerr); }
1979:  #endif
1980:   if (mmdata->mmBuffer)  { cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); }
1981:   if (mmdata->mmBuffer2) { cerr = cudaFree(mmdata->mmBuffer2);CHKERRCUDA(cerr); }
1982:  #endif
1983:   MatDestroy(&mmdata->X);
1984:   PetscFree(data);
1985:   return(0);
1986: }

1988: PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);

1990: static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
1991: {
1992:   Mat_Product                  *product = C->product;
1993:   Mat                          A,B;
1994:   PetscInt                     m,n,blda,clda;
1995:   PetscBool                    flg,biscuda;
1996:   Mat_SeqAIJCUSPARSE           *cusp;
1997:   cusparseStatus_t             stat;
1998:   cusparseOperation_t          opA;
1999:   const PetscScalar            *barray;
2000:   PetscScalar                  *carray;
2001:   PetscErrorCode               ierr;
2002:   MatMatCusparse               *mmdata;
2003:   Mat_SeqAIJCUSPARSEMultStruct *mat;
2004:   CsrMatrix                    *csrmat;

2007:   MatCheckProduct(C,1);
2008:   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
2009:   mmdata = (MatMatCusparse*)product->data;
2010:   A    = product->A;
2011:   B    = product->B;
2012:   PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);
2013:   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
2014:   /* currently CopyToGpu does not copy if the matrix is bound to the CPU.
2015:      Instead of silently accepting a wrong answer, we prefer to raise an error */
2016:   if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2017:   MatSeqAIJCUSPARSECopyToGPU(A);
2018:   cusp   = (Mat_SeqAIJCUSPARSE*)A->spptr;
2019:   switch (product->type) {
2020:   case MATPRODUCT_AB:
2021:   case MATPRODUCT_PtAP:
2022:     mat = cusp->mat;
2023:     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2024:     m   = A->rmap->n;
2025:     n   = B->cmap->n;
2026:     break;
2027:   case MATPRODUCT_AtB:
2028:     if (!A->form_explicit_transpose) {
2029:       mat = cusp->mat;
2030:       opA = CUSPARSE_OPERATION_TRANSPOSE;
2031:     } else {
2032:       MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);
2033:       mat  = cusp->matTranspose;
2034:       opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
2035:     }
2036:     m = A->cmap->n;
2037:     n = B->cmap->n;
2038:     break;
2039:   case MATPRODUCT_ABt:
2040:   case MATPRODUCT_RARt:
2041:     mat = cusp->mat;
2042:     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2043:     m   = A->rmap->n;
2044:     n   = B->rmap->n;
2045:     break;
2046:   default:
2047:     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2048:   }
2049:   if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct");
2050:   csrmat = (CsrMatrix*)mat->mat;
2051:   /* if the user passed a CPU matrix, copy the data to the GPU */
2052:   PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);
2053:   if (!biscuda) {MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);}
2054:   MatDenseCUDAGetArrayRead(B,&barray);

2056:   MatDenseGetLDA(B,&blda);
2057:   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2058:     MatDenseCUDAGetArrayWrite(mmdata->X,&carray);
2059:     MatDenseGetLDA(mmdata->X,&clda);
2060:   } else {
2061:     MatDenseCUDAGetArrayWrite(C,&carray);
2062:     MatDenseGetLDA(C,&clda);
2063:   }
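
  /* For MATPRODUCT_RARt and MATPRODUCT_PtAP, the sparse kernel below computes the
     intermediate dense product X = op(A) op(B) into mmdata->X; the final C = B X (RARt) or
     C = B^T X (PtAP) is a dense GEMM performed at the end of this routine by
     MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(). */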

2065:   PetscLogGpuTimeBegin();
2066:  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2067:   cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2068:   /* (re)allocate mmBuffer if not initialized or LDAs are different */
2069:   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2070:     size_t mmBufferSize;
2071:     if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
2072:     if (!mmdata->matBDescr) {
2073:       stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
2074:       mmdata->Blda = blda;
2075:     }

2077:     if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
2078:     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
2079:       stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
2080:       mmdata->Clda = clda;
2081:     }

2083:     if (!mat->matDescr) {
2084:       stat = cusparseCreateCsr(&mat->matDescr,
2085:                                csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
2086:                                csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
2087:                                csrmat->values->data().get(),
2088:                                CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2089:                                CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
2090:     }
2091:     stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
2092:                                    mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2093:                                    mmdata->matCDescr,cusparse_scalartype,
2094:                                    cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
2095:     if (!mmdata->mmBuffer || mmdata->mmBufferSize < mmBufferSize) { /* note cudaFree(NULL) below is a no-op */
2096:       cudaError_t cerr;
2097:       cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr);
2098:       cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr);
2099:       mmdata->mmBufferSize = mmBufferSize;
2100:     }
2101:     mmdata->initialized = PETSC_TRUE;
2102:   } else {
2103:     /* to be safe, always update pointers of the mats */
2104:     stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
2105:     stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
2106:     stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
2107:   }

2109:   /* do cusparseSpMM, which supports transpose on B */
2110:   stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
2111:                       mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2112:                       mmdata->matCDescr,cusparse_scalartype,
2113:                       cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2114:  #else
2115:   PetscInt k;
2116:   /* cusparseXcsrmm does not support transpose on B */
2117:   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2118:     cublasHandle_t cublasv2handle;
2119:     cublasStatus_t cerr;

2121:     PetscCUBLASGetHandle(&cublasv2handle);
2122:     cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
2123:                        B->cmap->n,B->rmap->n,
2124:                        &PETSC_CUSPARSE_ONE ,barray,blda,
2125:                        &PETSC_CUSPARSE_ZERO,barray,blda,
2126:                        mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
2127:     blda = B->cmap->n;
2128:     k    = B->cmap->n;
2129:   } else {
2130:     k    = B->rmap->n;
2131:   }
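
  /* In the branch above, cublasXgeam() with alpha = 1, beta = 0 and both operands transposed
     acts as an out-of-place transpose: it writes B^T (leading dimension B->cmap->n) into
     mmdata->Bt, allocated in MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA() below. */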

2133:   /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
2134:   stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
2135:                            csrmat->num_entries,mat->alpha_one,mat->descr,
2136:                            csrmat->values->data().get(),
2137:                            csrmat->row_offsets->data().get(),
2138:                            csrmat->column_indices->data().get(),
2139:                            mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
2140:                            carray,clda);CHKERRCUSPARSE(stat);
2141:  #endif
2142:   PetscLogGpuTimeEnd();
2143:   PetscLogGpuFlops(n*2.0*csrmat->num_entries);
2144:   MatDenseCUDARestoreArrayRead(B,&barray);
2145:   if (product->type == MATPRODUCT_RARt) {
2146:     MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);
2147:     MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);
2148:   } else if (product->type == MATPRODUCT_PtAP) {
2149:     MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);
2150:     MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);
2151:   } else {
2152:     MatDenseCUDARestoreArrayWrite(C,&carray);
2153:   }
2154:   if (mmdata->cisdense) {
2155:     MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);
2156:   }
2157:   if (!biscuda) {
2158:     MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);
2159:   }
2160:   return(0);
2161: }

2163: static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2164: {
2165:   Mat_Product        *product = C->product;
2166:   Mat                A,B;
2167:   PetscInt           m,n;
2168:   PetscBool          cisdense,flg;
2169:   PetscErrorCode     ierr;
2170:   MatMatCusparse     *mmdata;
2171:   Mat_SeqAIJCUSPARSE *cusp;

2174:   MatCheckProduct(C,1);
2175:   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
2176:   A    = product->A;
2177:   B    = product->B;
2178:   PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);
2179:   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
2180:   cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2181:   if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
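  /* result dimensions: AB -> mA x nB, AtB -> nA x nB, ABt -> mA x mB, PtAP -> nP x nP (B plays the role of P), RARt -> mR x mR (B plays the role of R) */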
2182:   switch (product->type) {
2183:   case MATPRODUCT_AB:
2184:     m = A->rmap->n;
2185:     n = B->cmap->n;
2186:     break;
2187:   case MATPRODUCT_AtB:
2188:     m = A->cmap->n;
2189:     n = B->cmap->n;
2190:     break;
2191:   case MATPRODUCT_ABt:
2192:     m = A->rmap->n;
2193:     n = B->rmap->n;
2194:     break;
2195:   case MATPRODUCT_PtAP:
2196:     m = B->cmap->n;
2197:     n = B->cmap->n;
2198:     break;
2199:   case MATPRODUCT_RARt:
2200:     m = B->rmap->n;
2201:     n = B->rmap->n;
2202:     break;
2203:   default:
2204:     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2205:   }
2206:   MatSetSizes(C,m,n,m,n);
2207:   /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy the result back to the CPU */
2208:   PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);
2209:   MatSetType(C,MATSEQDENSECUDA);

2211:   /* product data */
2212:   PetscNew(&mmdata);
2213:   mmdata->cisdense = cisdense;
2214:  #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
2215:   /* cusparseXcsrmm does not support transpose on B, so we allocate a buffer to store B^T */
2216:   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2217:     cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
2218:   }
2219:  #endif
2220:   /* for these products we need intermediate storage */
2221:   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2222:     MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);
2223:     MatSetType(mmdata->X,MATSEQDENSECUDA);
2224:     if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
2225:       MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);
2226:     } else {
2227:       MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);
2228:     }
2229:   }
2230:   C->product->data    = mmdata;
2231:   C->product->destroy = MatDestroy_MatMatCusparse;

2233:   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
2234:   return(0);
2235: }

2237: static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2238: {
2239:   Mat_Product                  *product = C->product;
2240:   Mat                          A,B;
2241:   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
2242:   Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
2243:   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2244:   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
2245:   PetscBool                    flg;
2246:   PetscErrorCode               ierr;
2247:   cusparseStatus_t             stat;
2248:   cudaError_t                  cerr;
2249:   MatProductType               ptype;
2250:   MatMatCusparse               *mmdata;
2251: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2252:   cusparseSpMatDescr_t         BmatSpDescr;
2253: #endif
2254:   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

2257:   MatCheckProduct(C,1);
2258:   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
2259:   PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);
2260:   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name);
2261:   mmdata = (MatMatCusparse*)C->product->data;
2262:   A = product->A;
2263:   B = product->B;
2264:   if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have already been computed in the MatProductSymbolic phase */
2265:     mmdata->reusesym = PETSC_FALSE;
2266:     Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2267:     if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2268:     Cmat = Ccusp->mat;
2269:     if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
2270:     Ccsr = (CsrMatrix*)Cmat->mat;
2271:     if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
2272:     goto finalize;
2273:   }
2274:   if (!c->nz) goto finalize;
2275:   PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);
2276:   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
2277:   PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);
2278:   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
2279:   if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2280:   if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2281:   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2282:   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2283:   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2284:   if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2285:   if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2286:   if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2287:   MatSeqAIJCUSPARSECopyToGPU(A);
2288:   MatSeqAIJCUSPARSECopyToGPU(B);

2290:   ptype = product->type;
2291:   if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
2292:   if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
2293:   switch (ptype) {
2294:   case MATPRODUCT_AB:
2295:     Amat = Acusp->mat;
2296:     Bmat = Bcusp->mat;
2297:     break;
2298:   case MATPRODUCT_AtB:
2299:     Amat = Acusp->matTranspose;
2300:     Bmat = Bcusp->mat;
2301:     break;
2302:   case MATPRODUCT_ABt:
2303:     Amat = Acusp->mat;
2304:     Bmat = Bcusp->matTranspose;
2305:     break;
2306:   default:
2307:     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2308:   }
2309:   Cmat = Ccusp->mat;
2310:   if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
2311:   if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
2312:   if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
2313:   Acsr = (CsrMatrix*)Amat->mat;
2314:   Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
2315:   Ccsr = (CsrMatrix*)Cmat->mat;
2316:   if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
2317:   if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
2318:   if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
2319:   PetscLogGpuTimeBegin();
2320: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2321:   BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
2322:   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2323:   #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2324:     stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
2325:                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2326:                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2327:                                mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2328:   #else
2329:     stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
2330:                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2331:                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2332:                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2333:     stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
2334:                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2335:                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2336:   #endif
2337: #else
2338:   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
2339:                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2340:                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2341:                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2342:                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
2343: #endif
2344:   PetscLogGpuFlops(mmdata->flops);
2345:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
2346:   PetscLogGpuTimeEnd();
2347:   C->offloadmask = PETSC_OFFLOAD_GPU;
2348: finalize:
2349:   /* shorter version of MatAssemblyEnd_SeqAIJ */
2350:   PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);
2351:   PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");
2352:   PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);
2353:   c->reallocs         = 0;
2354:   C->info.mallocs    += 0;
2355:   C->info.nz_unneeded = 0;
2356:   C->assembled = C->was_assembled = PETSC_TRUE;
2357:   C->num_ass++;
2358:   return(0);
2359: }

2361: static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2362: {
2363:   Mat_Product                  *product = C->product;
2364:   Mat                          A,B;
2365:   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
2366:   Mat_SeqAIJ                   *a,*b,*c;
2367:   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2368:   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
2369:   PetscInt                     i,j,m,n,k;
2370:   PetscBool                    flg;
2371:   PetscErrorCode               ierr;
2372:   cusparseStatus_t             stat;
2373:   cudaError_t                  cerr;
2374:   MatProductType               ptype;
2375:   MatMatCusparse               *mmdata;
2376:   PetscLogDouble               flops;
2377:   PetscBool                    biscompressed,ciscompressed;
2378: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2379:   int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
2380:   cusparseSpMatDescr_t         BmatSpDescr;
2381: #else
2382:   int                          cnz;
2383: #endif
2384:   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

2387:   MatCheckProduct(C,1);
2388:   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
2389:   A    = product->A;
2390:   B    = product->B;
2391:   PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);
2392:   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
2393:   PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);
2394:   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
2395:   a = (Mat_SeqAIJ*)A->data;
2396:   b = (Mat_SeqAIJ*)B->data;
2397:   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2398:   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2399:   if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2400:   if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");

2402:   /* product data */
2403:   PetscNew(&mmdata);
2404:   C->product->data    = mmdata;
2405:   C->product->destroy = MatDestroy_MatMatCusparse;

2407:   MatSeqAIJCUSPARSECopyToGPU(A);
2408:   MatSeqAIJCUSPARSECopyToGPU(B);
2409:   ptype = product->type;
2410:   if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
2411:   if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
2412:   biscompressed = PETSC_FALSE;
2413:   ciscompressed = PETSC_FALSE;
2414:   switch (ptype) {
2415:   case MATPRODUCT_AB:
2416:     m = A->rmap->n;
2417:     n = B->cmap->n;
2418:     k = A->cmap->n;
2419:     Amat = Acusp->mat;
2420:     Bmat = Bcusp->mat;
2421:     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2422:     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2423:     break;
2424:   case MATPRODUCT_AtB:
2425:     m = A->cmap->n;
2426:     n = B->cmap->n;
2427:     k = A->rmap->n;
2428:     MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);
2429:     Amat = Acusp->matTranspose;
2430:     Bmat = Bcusp->mat;
2431:     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2432:     break;
2433:   case MATPRODUCT_ABt:
2434:     m = A->rmap->n;
2435:     n = B->rmap->n;
2436:     k = A->cmap->n;
2437:     MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);
2438:     Amat = Acusp->mat;
2439:     Bmat = Bcusp->matTranspose;
2440:     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2441:     break;
2442:   default:
2443:     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2444:   }

2446:   /* create cusparse matrix */
2447:   MatSetSizes(C,m,n,m,n);
2448:   MatSetType(C,MATSEQAIJCUSPARSE);
2449:   c     = (Mat_SeqAIJ*)C->data;
2450:   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2451:   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
2452:   Ccsr  = new CsrMatrix;

2454:   c->compressedrow.use = ciscompressed;
2455:   if (c->compressedrow.use) { /* if A is stored in compressed row format, then C will be too */
2456:     c->compressedrow.nrows = a->compressedrow.nrows;
2457:     PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);
2458:     PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);
2459:     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
2460:     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
2461:     Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
2462:   } else {
2463:     c->compressedrow.nrows  = 0;
2464:     c->compressedrow.i      = NULL;
2465:     c->compressedrow.rindex = NULL;
2466:     Ccusp->workVector       = NULL;
2467:     Cmat->cprowIndices      = NULL;
2468:   }
2469:   Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
2470:   Ccusp->mat      = Cmat;
2471:   Ccusp->mat->mat = Ccsr;
2472:   Ccsr->num_rows    = Ccusp->nrows;
2473:   Ccsr->num_cols    = n;
2474:   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
2475:   stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
2476:   stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
2477:   stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
2478:   cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
2479:   cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
2480:   cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
2481:   cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2482:   cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2483:   cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2484:   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cuSPARSE raises errors in various calls when matrices have zero rows/columns! */
2485:     thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
2486:     c->nz = 0;
2487:     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2488:     Ccsr->values = new THRUSTARRAY(c->nz);
2489:     goto finalizesym;
2490:   }

2492:   if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
2493:   if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
2494:   Acsr = (CsrMatrix*)Amat->mat;
2495:   if (!biscompressed) {
2496:     Bcsr = (CsrMatrix*)Bmat->mat;
2497: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2498:     BmatSpDescr = Bmat->matDescr;
2499: #endif
2500:   } else { /* we need to use row offsets for the full matrix */
2501:     CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
2502:     Bcsr = new CsrMatrix;
2503:     Bcsr->num_rows       = B->rmap->n;
2504:     Bcsr->num_cols       = cBcsr->num_cols;
2505:     Bcsr->num_entries    = cBcsr->num_entries;
2506:     Bcsr->column_indices = cBcsr->column_indices;
2507:     Bcsr->values         = cBcsr->values;
2508:     if (!Bcusp->rowoffsets_gpu) {
2509:       Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
2510:       Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
2511:       PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));
2512:     }
2513:     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
2514:     mmdata->Bcsr = Bcsr;
2515: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2516:     if (Bcsr->num_rows && Bcsr->num_cols) {
2517:       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
2518:                                Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2519:                                Bcsr->values->data().get(),
2520:                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2521:                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
2522:     }
2523:     BmatSpDescr = mmdata->matSpBDescr;
2524: #endif
2525:   }
2526:   if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
2527:   if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
2528:   /* precompute flops count */
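  /* AB: each nonzero a(i,j) multiplies row j of B, costing one multiply and one add per entry of that row;
     AtB: each of the anzi entries of row i of A pairs with the bnzi entries of row i of B */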
2529:   if (ptype == MATPRODUCT_AB) {
2530:     for (i=0, flops = 0; i<A->rmap->n; i++) {
2531:       const PetscInt st = a->i[i];
2532:       const PetscInt en = a->i[i+1];
2533:       for (j=st; j<en; j++) {
2534:         const PetscInt brow = a->j[j];
2535:         flops += 2.*(b->i[brow+1] - b->i[brow]);
2536:       }
2537:     }
2538:   } else if (ptype == MATPRODUCT_AtB) {
2539:     for (i=0, flops = 0; i<A->rmap->n; i++) {
2540:       const PetscInt anzi = a->i[i+1] - a->i[i];
2541:       const PetscInt bnzi = b->i[i+1] - b->i[i];
2542:       flops += (2.*anzi)*bnzi;
2543:     }
2544:   } else { /* TODO */
2545:     flops = 0.;
2546:   }

2548:   mmdata->flops = flops;
2549:   PetscLogGpuTimeBegin();

2551: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2552:   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2553:   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
2554:                           NULL, NULL, NULL,
2555:                           CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2556:                           CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
2557:   stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2558:  #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2559:  {
2560:   /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
2561:      We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
2562:   */
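  /* The reuse workflow below is: workEstimation (query size, then fill) -> nnz (query sizes, then fill) -> allocate C arrays and set its pointers -> copy -> compute.
     dBuffer1..3 are temporary and freed here; dBuffer4 and dBuffer5 must outlive this call for later compute phases. */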
2563:   void*  dBuffer1 = NULL;
2564:   void*  dBuffer2 = NULL;
2565:   void*  dBuffer3 = NULL;
2566:   /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
2567:   size_t bufferSize1 = 0;
2568:   size_t bufferSize2 = 0;
2569:   size_t bufferSize3 = 0;
2570:   size_t bufferSize4 = 0;
2571:   size_t bufferSize5 = 0;

2573:   /*----------------------------------------------------------------------*/
2574:   /* query how many bytes of external memory (bufferSize1) are needed */
2575:   stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2576:                                             CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2577:                                             &bufferSize1, NULL);CHKERRCUSPARSE(stat);
2578:   cerr = cudaMalloc((void**) &dBuffer1, bufferSize1);CHKERRCUDA(cerr);
2579:   /* inspect the matrices A and B to understand the memory requirement for the next step */
2580:   stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2581:                                             CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2582:                                             &bufferSize1, dBuffer1);CHKERRCUSPARSE(stat);

2584:   /*----------------------------------------------------------------------*/
2585:   stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2586:                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2587:                                  &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);CHKERRCUSPARSE(stat);
2588:   cerr = cudaMalloc((void**) &dBuffer2, bufferSize2);CHKERRCUDA(cerr);
2589:   cerr = cudaMalloc((void**) &dBuffer3, bufferSize3);CHKERRCUDA(cerr);
2590:   cerr = cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4);CHKERRCUDA(cerr);
2591:   stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2592:                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2593:                                  &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);CHKERRCUSPARSE(stat);
2594:   cerr = cudaFree(dBuffer1);CHKERRCUDA(cerr);
2595:   cerr = cudaFree(dBuffer2);CHKERRCUDA(cerr);

2597:   /*----------------------------------------------------------------------*/
2598:   /* get matrix C non-zero entries C_nnz1 */
2599:   stat  = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
2600:   c->nz = (PetscInt) C_nnz1;
2601:   /* allocate matrix C */
2602:   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2603:   Ccsr->values         = new THRUSTARRAY(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2604:   /* update matC with the new pointers */
2605:   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
2606:                                 Ccsr->values->data().get());CHKERRCUSPARSE(stat);

2608:   /*----------------------------------------------------------------------*/
2609:   stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2610:                                   CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2611:                                   &bufferSize5, NULL);CHKERRCUSPARSE(stat);
2612:   cerr = cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5);CHKERRCUDA(cerr);
2613:   stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2614:                                   CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2615:                                   &bufferSize5, mmdata->dBuffer5);CHKERRCUSPARSE(stat);
2616:   cerr = cudaFree(dBuffer3);CHKERRCUDA(cerr);
2617:   stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
2618:                                      Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2619:                                      cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2620:                                      mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2621:   PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024);
2622:  }
2623:  #else // ~PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2624:   size_t bufSize2;
2625:   /* query how many bytes of external memory (bufSize2) are needed */
2626:   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
2627:                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2628:                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2629:                                        mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat);
2630:   cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr);
2631:   /* inspect the matrices A and B to understand the memory requirement for the next step */
2632:   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
2633:                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2634:                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2635:                                        mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat);
2636:   /* query again: this time the buffer size needed by the compute step */
2637:   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
2638:                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2639:                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2640:                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat);
2641:   /* Neither the CUSPARSE documentation nor the API is clear:
2642:      we need both buffers to perform the operations properly!
2643:      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API;
2644:      it only appears in the workEstimation calls, yet it seems to be needed in compute as well, so probably the address
2645:      is stored in the descriptor! What a messy API... */
2646:   cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr);
2647:   /* compute the intermediate product of A * B */
2648:   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
2649:                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2650:                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2651:                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2652:   /* get matrix C non-zero entries C_nnz1 */
2653:   stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
2654:   c->nz = (PetscInt) C_nnz1;
2655:   PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);
2656:   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2657:   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2658:   Ccsr->values = new THRUSTARRAY(c->nz);
2659:   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2660:   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
2661:                                 Ccsr->values->data().get());CHKERRCUSPARSE(stat);
2662:   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
2663:                              Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2664:                              cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2665:  #endif
2666: #else
2667:   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
2668:   stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB,
2669:                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2670:                              Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2671:                              Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2672:                              Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
2673:   c->nz = cnz;
2674:   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2675:   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2676:   Ccsr->values = new THRUSTARRAY(c->nz);
2677:   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */

2679:   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2680:   /* with the old gemm interface (removed as of CUDA 11.0) we cannot compute the symbolic factorization only.
2681:      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows doing the symbolic phase by passing NULL for values, but it seems quite buggy when
2682:      D is NULL, despite the fact that the CUSPARSE documentation claims this case is supported! */
2683:   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
2684:                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2685:                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2686:                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2687:                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
2688: #endif
2689:   PetscLogGpuFlops(mmdata->flops);
2690:   PetscLogGpuTimeEnd();
2691: finalizesym:
2692:   c->singlemalloc = PETSC_FALSE;
2693:   c->free_a       = PETSC_TRUE;
2694:   c->free_ij      = PETSC_TRUE;
2695:   PetscMalloc1(m+1,&c->i);
2696:   PetscMalloc1(c->nz,&c->j);
2697:   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
2698:     PetscInt *d_i = c->i;
2699:     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
2700:     THRUSTINTARRAY jj(Ccsr->column_indices->size());
2701:     ii   = *Ccsr->row_offsets;
2702:     jj   = *Ccsr->column_indices;
2703:     if (ciscompressed) d_i = c->compressedrow.i;
2704:     cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2705:     cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2706:   } else {
2707:     PetscInt *d_i = c->i;
2708:     if (ciscompressed) d_i = c->compressedrow.i;
2709:     cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2710:     cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2711:   }
2712:   if (ciscompressed) { /* need to expand host row offsets */
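    /* rows not present in the compressed structure are empty; they inherit the previous offset so that c->i stays nondecreasing */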
2713:     PetscInt r = 0;
2714:     c->i[0] = 0;
2715:     for (k = 0; k < c->compressedrow.nrows; k++) {
2716:       const PetscInt next = c->compressedrow.rindex[k];
2717:       const PetscInt old = c->compressedrow.i[k];
2718:       for (; r < next; r++) c->i[r+1] = old;
2719:     }
2720:     for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
2721:   }
2722:   PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));
2723:   PetscMalloc1(m,&c->ilen);
2724:   PetscMalloc1(m,&c->imax);
2725:   c->maxnz = c->nz;
2726:   c->nonzerorowcnt = 0;
2727:   c->rmax = 0;
2728:   for (k = 0; k < m; k++) {
2729:     const PetscInt nn = c->i[k+1] - c->i[k];
2730:     c->ilen[k] = c->imax[k] = nn;
2731:     c->nonzerorowcnt += (PetscInt)!!nn;
2732:     c->rmax = PetscMax(c->rmax,nn);
2733:   }
2734:   MatMarkDiagonal_SeqAIJ(C);
2735:   PetscMalloc1(c->nz,&c->a);
2736:   Ccsr->num_entries = c->nz;

2738:   C->nonzerostate++;
2739:   PetscLayoutSetUp(C->rmap);
2740:   PetscLayoutSetUp(C->cmap);
2741:   Ccusp->nonzerostate = C->nonzerostate;
2742:   C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
2743:   C->preallocated  = PETSC_TRUE;
2744:   C->assembled     = PETSC_FALSE;
2745:   C->was_assembled = PETSC_FALSE;
2746:   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
2747:     mmdata->reusesym = PETSC_TRUE;
2748:     C->offloadmask   = PETSC_OFFLOAD_GPU;
2749:   }
2750:   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2751:   return(0);
2752: }

2754: PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

2756: /* handles sparse or dense B */
2757: static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
2758: {
2759:   Mat_Product    *product = mat->product;
2761:   PetscBool      isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;

2764:   MatCheckProduct(mat,1);
2765:   PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);
2766:   if (!product->A->boundtocpu && !product->B->boundtocpu) {
2767:     PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);
2768:   }
2769:   if (product->type == MATPRODUCT_ABC) {
2770:     Ciscusp = PETSC_FALSE;
2771:     if (!product->C->boundtocpu) {
2772:       PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);
2773:     }
2774:   }
2775:   if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
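    /* option names depend on the entry point: -mat<op>_backend_cpu when called through the user-facing MatXxx() APIs (api_user), -matproduct_<type>_backend_cpu otherwise */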
2776:     PetscBool usecpu = PETSC_FALSE;
2777:     switch (product->type) {
2778:     case MATPRODUCT_AB:
2779:       if (product->api_user) {
2780:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat");
2781:         PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);
2782:         PetscOptionsEnd();
2783:       } else {
2784:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat");
2785:         PetscOptionsBool("-matproduct_ab_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);
2786:         PetscOptionsEnd();
2787:       }
2788:       break;
2789:     case MATPRODUCT_AtB:
2790:       if (product->api_user) {
2791:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat");
2792:         PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);
2793:         PetscOptionsEnd();
2794:       } else {
2795:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat");
2796:         PetscOptionsBool("-matproduct_atb_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);
2797:         PetscOptionsEnd();
2798:       }
2799:       break;
2800:     case MATPRODUCT_PtAP:
2801:       if (product->api_user) {
2802:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat");
2803:         PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);
2804:         PetscOptionsEnd();
2805:       } else {
2806:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat");
2807:         PetscOptionsBool("-matproduct_ptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);
2808:         PetscOptionsEnd();
2809:       }
2810:       break;
2811:     case MATPRODUCT_RARt:
2812:       if (product->api_user) {
2813:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatRARt","Mat");
2814:         PetscOptionsBool("-matrart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);
2815:         PetscOptionsEnd();
2816:       } else {
2817:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_RARt","Mat");
2818:         PetscOptionsBool("-matproduct_rart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);
2819:         PetscOptionsEnd();
2820:       }
2821:       break;
2822:     case MATPRODUCT_ABC:
2823:       if (product->api_user) {
2824:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMatMult","Mat");
2825:         PetscOptionsBool("-matmatmatmult_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);
2826:         PetscOptionsEnd();
2827:       } else {
2828:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_ABC","Mat");
2829:         PetscOptionsBool("-matproduct_abc_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);
2830:         PetscOptionsEnd();
2831:       }
2832:       break;
2833:     default:
2834:       break;
2835:     }
2836:     if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
2837:   }
2838:   /* dispatch */
2839:   if (isdense) {
2840:     switch (product->type) {
2841:     case MATPRODUCT_AB:
2842:     case MATPRODUCT_AtB:
2843:     case MATPRODUCT_ABt:
2844:     case MATPRODUCT_PtAP:
2845:     case MATPRODUCT_RARt:
2846:      if (product->A->boundtocpu) {
2847:         MatProductSetFromOptions_SeqAIJ_SeqDense(mat);
2848:       } else {
2849:         mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
2850:       }
2851:       break;
2852:     case MATPRODUCT_ABC:
2853:       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2854:       break;
2855:     default:
2856:       break;
2857:     }
2858:   } else if (Biscusp && Ciscusp) {
2859:     switch (product->type) {
2860:     case MATPRODUCT_AB:
2861:     case MATPRODUCT_AtB:
2862:     case MATPRODUCT_ABt:
2863:       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2864:       break;
2865:     case MATPRODUCT_PtAP:
2866:     case MATPRODUCT_RARt:
2867:     case MATPRODUCT_ABC:
2868:       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2869:       break;
2870:     default:
2871:       break;
2872:     }
2873:   } else { /* fallback for AIJ */
2874:     MatProductSetFromOptions_SeqAIJ(mat);
2875:   }
2876:   return(0);
2877: }

2879: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2880: {

2884:   MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);
2885:   return(0);
2886: }

2888: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
2889: {

2893:   MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);
2894:   return(0);
2895: }

2897: static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2898: {

2902:   MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);
2903:   return(0);
2904: }

2906: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
2907: {

2911:   MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);
2912:   return(0);
2913: }

2915: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2916: {

2920:   MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);
2921:   return(0);
2922: }

2924: __global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y)
2925: {
2926:   int i = blockIdx.x*blockDim.x + threadIdx.x;
2927:   if (i < n) y[idx[i]] += x[i];
2928: }

2930: /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
2931: static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
2932: {
2933:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
2934:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
2935:   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
2936:   PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
2937:   PetscErrorCode               ierr;
2938:   cusparseStatus_t             stat;
2939:   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2940:   PetscBool                    compressed;
2941: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2942:   PetscInt                     nx,ny;
2943: #endif

2946:   if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported");
2947:   if (!a->nonzerorowcnt) {
2948:     if (!yy) {VecSet_SeqCUDA(zz,0);}
2949:     else {VecCopy_SeqCUDA(yy,zz);}
2950:     return(0);
2951:   }
2952:   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
2953:   MatSeqAIJCUSPARSECopyToGPU(A);
2954:   if (!trans) {
2955:     matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2956:     if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
2957:   } else {
2958:     if (herm || !A->form_explicit_transpose) {
2959:       opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
2960:       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2961:     } else {
2962:       if (!cusparsestruct->matTranspose) {MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);}
2963:       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
2964:     }
2965:   }
2966:   /* Does the matrix use compressed rows (i.e., drop zero rows)? */
2967:   compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

2969:   try {
2970:     VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);
2971:     if (yy == zz) {VecCUDAGetArray(zz,&zarray);} /* read & write zz, so we need an up-to-date zarray on the GPU */
2972:     else {VecCUDAGetArrayWrite(zz,&zarray);} /* write-only zz, so no need to initialize zarray on the GPU */

2974:     PetscLogGpuTimeBegin();
2975:     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2976:       /* z = A x + beta y.
2977:          If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
2978:          When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
2979:       */
2980:       xptr = xarray;
2981:       dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
2982:       beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
2983:      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2984:       /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
2985:           allocated to accommodate different uses. So we get the length info directly from mat.
2986:        */
2987:       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2988:         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2989:         nx = mat->num_cols;
2990:         ny = mat->num_rows;
2991:       }
2992:      #endif
2993:     } else {
2994:       /* z = A^T x + beta y
2995:          If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
2996:          Note A^Tx is of full length, so we set beta to 1.0 if y exists.
2997:        */
2998:       xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
2999:       dptr = zarray;
3000:       beta = yy ? matstruct->beta_one : matstruct->beta_zero;
3001:       if (compressed) { /* Scatter x to work vector */
3002:         thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
3003:         thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
3004:                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
3005:                          VecCUDAEqualsReverse());
3006:       }
3007:      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3008:       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3009:         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
3010:         nx = mat->num_rows;
3011:         ny = mat->num_cols;
3012:       }
3013:      #endif
3014:     }

3016:     /* csr_spmv does y = alpha op(A) x + beta y */
3017:     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3018:      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3019:       if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
3020:       if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
3021:         cudaError_t cerr;
3022:         stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
3023:         stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
3024:         stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
3025:                                 matstruct->matDescr,
3026:                                 matstruct->cuSpMV[opA].vecXDescr, beta,
3027:                                 matstruct->cuSpMV[opA].vecYDescr,
3028:                                 cusparse_scalartype,
3029:                                 cusparsestruct->spmvAlg,
3030:                                 &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
3031:         cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);

3033:         matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
3034:       } else {
3035:         /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
3036:         stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
3037:         stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
3038:       }

3040:       stat = cusparseSpMV(cusparsestruct->handle, opA,
3041:                                matstruct->alpha_one,
3042:                                matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTransposeForMult() */
3043:                                matstruct->cuSpMV[opA].vecXDescr,
3044:                                beta,
3045:                                matstruct->cuSpMV[opA].vecYDescr,
3046:                                cusparse_scalartype,
3047:                                cusparsestruct->spmvAlg,
3048:                                matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
3049:      #else
3050:       CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
3051:       stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
3052:                                mat->num_rows, mat->num_cols,
3053:                                mat->num_entries, matstruct->alpha_one, matstruct->descr,
3054:                                mat->values->data().get(), mat->row_offsets->data().get(),
3055:                                mat->column_indices->data().get(), xptr, beta,
3056:                                dptr);CHKERRCUSPARSE(stat);
3057:      #endif
3058:     } else {
3059:       if (cusparsestruct->nrows) {
3060:        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3061:         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
3062:        #else
3063:         cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
3064:         stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
3065:                                  matstruct->alpha_one, matstruct->descr, hybMat,
3066:                                  xptr, beta,
3067:                                  dptr);CHKERRCUSPARSE(stat);
3068:        #endif
3069:       }
3070:     }
3071:     PetscLogGpuTimeEnd();

3073:     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3074:       if (yy) { /* MatMultAdd: zz = A*xx + yy */
3075:         if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
3076:           VecCopy_SeqCUDA(yy,zz); /* zz = yy */
3077:         } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
3078:           VecAXPY_SeqCUDA(zz,1.0,yy); /* zz += yy */
3079:         }
3080:       } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
3081:         VecSet_SeqCUDA(zz,0);
3082:       }

3084:       /* ScatterAdd the result from work vector into the full vector when A is compressed */
3085:       if (compressed) {
3086:         PetscLogGpuTimeBegin();
3087:         /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered),
3088:            and when that event is destroyed at the end of the scope it calls cudaStreamSynchronize() on this stream. One would have to store all events to
3089:            prevent that. So I just use a custom ScatterAdd kernel instead.
3090:          */
3091:        #if 0
3092:         thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
3093:         thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
3094:                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
3095:                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
3096:                          VecCUDAPlusEquals());
3097:        #else
3098:         PetscInt n = matstruct->cprowIndices->size();
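        /* launch one thread per compressed row: 256-thread blocks, grid size rounded up */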
3099:         ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
3100:        #endif
3101:         PetscLogGpuTimeEnd();
3102:       }
3103:     } else {
3104:       if (yy && yy != zz) {
3105:         VecAXPY_SeqCUDA(zz,1.0,yy); /* zz += yy */
3106:       }
3107:     }
3108:     VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);
3109:     if (yy == zz) {VecCUDARestoreArray(zz,&zarray);}
3110:     else {VecCUDARestoreArrayWrite(zz,&zarray);}
3111:   } catch(char *ex) {
3112:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
3113:   }
3114:   if (yy) {
3115:     PetscLogGpuFlops(2.0*a->nz);
3116:   } else {
3117:     PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);
3118:   }
3119:   return(0);
3120: }

3122: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
3123: {

3127:   MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);
3128:   return(0);
3129: }

3131: static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
3132: {
3133:   PetscErrorCode     ierr;
3134:   PetscObjectState   onnz = A->nonzerostate;
3135:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

3138:   MatAssemblyEnd_SeqAIJ(A,mode);
3139:   if (onnz != A->nonzerostate && cusp->deviceMat) {
3140:     cudaError_t cerr;

3142:     PetscInfo(A,"Destroy device mat since nonzerostate changed\n");
3143:     cerr = cudaFree(cusp->deviceMat);CHKERRCUDA(cerr);
3144:     cusp->deviceMat = NULL;
3145:   }
3146:   return(0);
3147: }

3149: /* --------------------------------------------------------------------------------*/
3150: /*@
3151:    MatCreateSeqAIJCUSPARSE - Creates a sparse sequential matrix in AIJ (compressed row)
3152:    format. This matrix will ultimately be pushed down
3153:    to NVidia GPUs and will use the CUSPARSE library for calculations. For good matrix
3154:    assembly performance the user should preallocate the matrix storage by setting
3155:    the parameter nz (or the array nnz).  By setting these parameters accurately,
3156:    performance during matrix assembly can be increased by more than a factor of 50.

3158:    Collective

3160:    Input Parameters:
3161: +  comm - MPI communicator, set to PETSC_COMM_SELF
3162: .  m - number of rows
3163: .  n - number of columns
3164: .  nz - number of nonzeros per row (same for all rows)
3165: -  nnz - array containing the number of nonzeros in the various rows
3166:          (possibly different for each row) or NULL

3168:    Output Parameter:
3169: .  A - the matrix

3171:    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
3172:    MatXXXXSetPreallocation() paradigm instead of calling this routine directly, as sketched below.
3173:    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
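
   A minimal sketch of the recommended usage (m, n, and nz stand for user-chosen sizes):
.vb
   Mat A;
   MatCreate(PETSC_COMM_SELF,&A);
   MatSetSizes(A,m,n,m,n);
   MatSetType(A,MATSEQAIJCUSPARSE);
   MatSeqAIJSetPreallocation(A,nz,NULL);
.ve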

3175:    Notes:
3176:    If nnz is given then nz is ignored

3178:    The AIJ format (also called the Yale sparse matrix format or
3179:    compressed row storage) is fully compatible with standard Fortran 77
3180:    storage.  That is, the stored row and column indices can begin at
3181:    either one (as in Fortran) or zero.  See the users' manual for details.

3183:    Specify the preallocated storage with either nz or nnz (not both).
3184:    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
3185:    allocation.  For large problems you MUST preallocate memory or you
3186:    will get TERRIBLE performance, see the users' manual chapter on matrices.

3188:    By default, this format uses inodes (identical nodes) when possible, to
3189:    improve numerical efficiency of matrix-vector products and solves. We
3190:    search for consecutive rows with the same nonzero structure, thereby
3191:    reusing matrix information to achieve increased efficiency.

3193:    Level: intermediate

3195: .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
3196: @*/
3197: PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
3198: {

3202:   MatCreate(comm,A);
3203:   MatSetSizes(*A,m,n,m,n);
3204:   MatSetType(*A,MATSEQAIJCUSPARSE);
3205:   MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);
3206:   return(0);
3207: }
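/* A minimal usage sketch (illustrative only, not part of this file): the recommended
   MatCreate()/MatSetType()/MatSeqAIJSetPreallocation() paradigm described above,
   assuming a square problem of size n and a uniform estimate of 5 nonzeros per row.

     Mat A;
     MatCreate(PETSC_COMM_SELF,&A);
     MatSetSizes(A,n,n,n,n);
     MatSetType(A,MATSEQAIJCUSPARSE);
     MatSeqAIJSetPreallocation(A,5,NULL);
     // ... fill with MatSetValues(), then MatAssemblyBegin/End(A,MAT_FINAL_ASSEMBLY) ...
     MatDestroy(&A);
*/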

3209: static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
3210: {

3214:   if (A->factortype == MAT_FACTOR_NONE) {
3215:     MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);
3216:   } else {
3217:     MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);
3218:   }
3219:   PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);
3220:   PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);
3221:   PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);
3222:   PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);
3223:   PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);
3224:   PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);
3225:   PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);
3226:   PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);
3227:   PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaijcusparse_hypre_C",NULL);
3228:   MatDestroy_SeqAIJ(A);
3229:   return(0);
3230: }

3232: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
3233: static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
3234: static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
3235: {

3239:   MatDuplicate_SeqAIJ(A,cpvalues,B);
3240:   MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);
3241:   return(0);
3242: }

3244: static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
3245: {
3246:   PetscErrorCode     ierr;
3247:   Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
3248:   Mat_SeqAIJCUSPARSE *cy;
3249:   Mat_SeqAIJCUSPARSE *cx;
3250:   PetscScalar        *ay;
3251:   const PetscScalar  *ax;
3252:   CsrMatrix          *csry,*csrx;

3255:   cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
3256:   cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
3257:   if (X->ops->axpy != Y->ops->axpy) {
3258:     MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);
3259:     MatAXPY_SeqAIJ(Y,a,X,str);
3260:     return(0);
3261:   }
3262:   /* if we are here, it means both matrices are bound to GPU */
3263:   MatSeqAIJCUSPARSECopyToGPU(Y);
3264:   MatSeqAIJCUSPARSECopyToGPU(X);
3265:   if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
3266:   if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
3267:   csry = (CsrMatrix*)cy->mat->mat;
3268:   csrx = (CsrMatrix*)cx->mat->mat;
3269:   /* see if we can turn this into a cublas axpy */
3270:   if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
3271:     bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
3272:     if (eq) {
3273:       eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
3274:     }
3275:     if (eq) str = SAME_NONZERO_PATTERN;
3276:   }
3277:   /* spgeam is buggy with one column */
3278:   if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

3280:   if (str == SUBSET_NONZERO_PATTERN) {
3281:     cusparseStatus_t stat;
3282:     PetscScalar      b = 1.0;
3283: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3284:     size_t           bufferSize;
3285:     void             *buffer;
3286:     cudaError_t      cerr;
3287: #endif

3289:     MatSeqAIJCUSPARSEGetArrayRead(X,&ax);
3290:     MatSeqAIJCUSPARSEGetArray(Y,&ay);
3291:     stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
3292: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3293:     stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
3294:                                           &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
3295:                                           &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
3296:                                              cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat);
3297:     cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr);
3298:     PetscLogGpuTimeBegin();
3299:     stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
3300:                                &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
3301:                                &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
3302:                                   cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat);
3303:     PetscLogGpuFlops(x->nz + y->nz);
3304:     PetscLogGpuTimeEnd();
3305:     cerr = cudaFree(buffer);CHKERRCUDA(cerr);
3306: #else
3307:     PetscLogGpuTimeBegin();
3308:     stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
3309:                                &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
3310:                                &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
3311:                                   cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat);
3312:     PetscLogGpuFlops(x->nz + y->nz);
3313:     PetscLogGpuTimeEnd();
3314: #endif
3315:     stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
3316:     MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);
3317:     MatSeqAIJCUSPARSERestoreArray(Y,&ay);
3318:     MatSeqAIJInvalidateDiagonal(Y);
3319:   } else if (str == SAME_NONZERO_PATTERN) {
3320:     cublasHandle_t cublasv2handle;
3321:     cublasStatus_t berr;
3322:     PetscBLASInt   one = 1, bnz = 1;

3324:     MatSeqAIJCUSPARSEGetArrayRead(X,&ax);
3325:     MatSeqAIJCUSPARSEGetArray(Y,&ay);
3326:     PetscCUBLASGetHandle(&cublasv2handle);
3327:     PetscBLASIntCast(x->nz,&bnz);
3328:     PetscLogGpuTimeBegin();
3329:     berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr);
3330:     PetscLogGpuFlops(2.0*bnz);
3331:     PetscLogGpuTimeEnd();
3332:     MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);
3333:     MatSeqAIJCUSPARSERestoreArray(Y,&ay);
3334:     MatSeqAIJInvalidateDiagonal(Y);
3335:   } else {
3336:     MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);
3337:     MatAXPY_SeqAIJ(Y,a,X,str);
3338:   }
3339:   return(0);
3340: }
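/* Usage note (sketch): when Y and X are both MATSEQAIJCUSPARSE with identical
   nonzero patterns, the call below reduces to a single cublasXaxpy on the value
   arrays and stays entirely on the GPU, as implemented above.

     MatAXPY(Y,2.0,X,SAME_NONZERO_PATTERN); // Y += 2*X
*/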

3342: static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
3343: {
3345:   Mat_SeqAIJ     *y = (Mat_SeqAIJ*)Y->data;
3346:   PetscScalar    *ay;
3347:   cublasHandle_t cublasv2handle;
3348:   cublasStatus_t berr;
3349:   PetscBLASInt   one = 1, bnz = 1;

3352:   MatSeqAIJCUSPARSEGetArray(Y,&ay);
3353:   PetscCUBLASGetHandle(&cublasv2handle);
3354:   PetscBLASIntCast(y->nz,&bnz);
3355:   PetscLogGpuTimeBegin();
3356:   berr = cublasXscal(cublasv2handle,bnz,&a,ay,one);CHKERRCUBLAS(berr);
3357:   PetscLogGpuFlops(bnz);
3358:   PetscLogGpuTimeEnd();
3359:   MatSeqAIJCUSPARSERestoreArray(Y,&ay);
3360:   MatSeqAIJInvalidateDiagonal(Y);
3361:   return(0);
3362: }

3364: static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
3365: {
3367:   PetscBool      both = PETSC_FALSE;
3368:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;

3371:   if (A->factortype == MAT_FACTOR_NONE) {
3372:     Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
3373:     if (spptr->mat) {
3374:       CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat;
3375:       if (matrix->values) {
3376:         both = PETSC_TRUE;
3377:         thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
3378:       }
3379:     }
3380:     if (spptr->matTranspose) {
3381:       CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat;
3382:       if (matrix->values) {
3383:         thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
3384:       }
3385:     }
3386:   }
3387:   //MatZeroEntries_SeqAIJ(A);
3388:   PetscArrayzero(a->a,a->i[A->rmap->n]);
3389:   MatSeqAIJInvalidateDiagonal(A);
3390:   if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
3391:   else A->offloadmask = PETSC_OFFLOAD_CPU;
3392:   return(0);
3393: }

3395: static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
3396: {
3397:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;

3401:   if (A->factortype != MAT_FACTOR_NONE) return(0);
3402:   if (flg) {
3403:     MatSeqAIJCUSPARSECopyFromGPU(A);

3405:     A->ops->scale                     = MatScale_SeqAIJ;
3406:     A->ops->axpy                      = MatAXPY_SeqAIJ;
3407:     A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
3408:     A->ops->mult                      = MatMult_SeqAIJ;
3409:     A->ops->multadd                   = MatMultAdd_SeqAIJ;
3410:     A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
3411:     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
3412:     A->ops->multhermitiantranspose    = NULL;
3413:     A->ops->multhermitiantransposeadd = NULL;
3414:     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
3415:     PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);
3416:     PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);
3417:     PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);
3418:     PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);
3419:     PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);
3420:     PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);
3421:     PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);
3422:   } else {
3423:     A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
3424:     A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
3425:     A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
3426:     A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
3427:     A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
3428:     A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
3429:     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
3430:     A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
3431:     A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
3432:     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
3433:     PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);
3434:     PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);
3435:     PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);
3436:     PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);
3437:     PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);
3438:     PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);
3439:     PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);
3440:   }
3441:   A->boundtocpu = flg;
3442:   a->inode.use = flg;
3443:   return(0);
3444: }
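/* Usage sketch (illustrative): temporarily force host execution for A, then
   hand it back to the GPU; each call swaps the ops table as shown above.

     MatBindToCPU(A,PETSC_TRUE);  // subsequent MatMult() etc. use the SeqAIJ host kernels
     MatBindToCPU(A,PETSC_FALSE); // back to the CUSPARSE kernels
*/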

3446: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
3447: {
3448:   PetscErrorCode   ierr;
3449:   cusparseStatus_t stat;
3450:   Mat              B;

3453:   PetscCUDAInitializeCheck(); /* first use of CUSPARSE may be via MatConvert */
3454:   if (reuse == MAT_INITIAL_MATRIX) {
3455:     MatDuplicate(A,MAT_COPY_VALUES,newmat);
3456:   } else if (reuse == MAT_REUSE_MATRIX) {
3457:     MatCopy(A,*newmat,SAME_NONZERO_PATTERN);
3458:   }
3459:   B = *newmat;

3461:   PetscFree(B->defaultvectype);
3462:   PetscStrallocpy(VECCUDA,&B->defaultvectype);

3464:   if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
3465:     if (B->factortype == MAT_FACTOR_NONE) {
3466:       Mat_SeqAIJCUSPARSE *spptr;
3467:       PetscNew(&spptr);
3468:       stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
3469:       stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
3470:       spptr->format     = MAT_CUSPARSE_CSR;
3471:      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3472:      #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
3473:       spptr->spmvAlg    = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
3474:      #else
3475:       spptr->spmvAlg    = CUSPARSE_CSRMV_ALG1;    /* default, since we only support csr */
3476:      #endif
3477:       spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
3478:       spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
3479:      #endif
3480:       B->spptr = spptr;
3481:     } else {
3482:       Mat_SeqAIJCUSPARSETriFactors *spptr;

3484:       PetscNew(&spptr);
3485:       stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
3486:       stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
3487:       B->spptr = spptr;
3488:     }
3489:     B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
3490:   }
3491:   B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
3492:   B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
3493:   B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
3494:   B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
3495:   B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
3496:   B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

3498:   MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);
3499:   PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);
3500:   PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);
3501: #if defined(PETSC_HAVE_HYPRE)
3502:   PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijcusparse_hypre_C",MatConvert_AIJ_HYPRE);
3503: #endif
3504:   return(0);
3505: }
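/* Conversion sketch (illustrative): promote an existing SeqAIJ matrix to this type.
   MAT_INPLACE_MATRIX converts A itself, which is how MatCreate_SeqAIJCUSPARSE() below uses it.

     Mat B;
     MatConvert(A,MATSEQAIJCUSPARSE,MAT_INITIAL_MATRIX,&B); // A is left unchanged
*/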

3507: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
3508: {

3512:   MatCreate_SeqAIJ(B);
3513:   MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);
3514:   return(0);
3515: }

3517: /*MC
3518:    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

3520:    A matrix type whose data resides on NVidia GPUs. These matrices can be in
3521:    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
3522:    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

3524:    Options Database Keys:
3525: +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
3526: .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3527: -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

3529:   Level: beginner

3531: .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
3532: M*/
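/* Example command-line usage (sketch; "app" is a placeholder for an executable
   that calls MatSetFromOptions() on its matrices):

     ./app -mat_type aijcusparse -mat_cusparse_storage_format csr
*/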

3534: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*);

3536: PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
3537: {

3541:   MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band);
3542:   MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);
3543:   MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);
3544:   MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);
3545:   MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);

3547:   return(0);
3548: }
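/* Usage sketch (illustrative; rowperm, colperm, info, b and x are assumed to be
   set up by the caller): obtain an LU factorization from the solver type
   registered above (MATSOLVERCUSPARSEBAND provides the banded LU variant).

     Mat F;
     MatGetFactor(A,MATSOLVERCUSPARSE,MAT_FACTOR_LU,&F);
     MatLUFactorSymbolic(F,A,rowperm,colperm,&info);
     MatLUFactorNumeric(F,A,&info);
     MatSolve(F,b,x);

   From the command line, the equivalent selection inside a PC is
   -pc_factor_mat_solver_type cusparse */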

3550: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
3551: {
3552:   PetscErrorCode   ierr;
3553:   cusparseStatus_t stat;

3556:   if (*cusparsestruct) {
3557:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);
3558:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);
3559:     delete (*cusparsestruct)->workVector;
3560:     delete (*cusparsestruct)->rowoffsets_gpu;
3561:     delete (*cusparsestruct)->cooPerm;
3562:     delete (*cusparsestruct)->cooPerm_a;
3563:     delete (*cusparsestruct)->csr2csc_i;
3564:     if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);}
3565:     PetscFree(*cusparsestruct);
3566:   }
3567:   return(0);
3568: }

3570: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
3571: {
3573:   if (*mat) {
3574:     delete (*mat)->values;
3575:     delete (*mat)->column_indices;
3576:     delete (*mat)->row_offsets;
3577:     delete *mat;
3578:     *mat = 0;
3579:   }
3580:   return(0);
3581: }

3583: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
3584: {
3585:   cusparseStatus_t stat;
3586:   PetscErrorCode   ierr;

3589:   if (*trifactor) {
3590:     if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
3591:     if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
3592:     CsrMatrix_Destroy(&(*trifactor)->csrMat);
3593:     if ((*trifactor)->solveBuffer)   {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
3594:     if ((*trifactor)->AA_h)   {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);}
3595:    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3596:     if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
3597:    #endif
3598:     PetscFree(*trifactor);
3599:   }
3600:   return(0);
3601: }

3603: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
3604: {
3605:   CsrMatrix        *mat;
3606:   cusparseStatus_t stat;
3607:   cudaError_t      err;

3610:   if (*matstruct) {
3611:     if ((*matstruct)->mat) {
3612:       if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
3613:        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3614:         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
3615:        #else
3616:         cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
3617:         stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
3618:        #endif
3619:       } else {
3620:         mat = (CsrMatrix*)(*matstruct)->mat;
3621:         CsrMatrix_Destroy(&mat);
3622:       }
3623:     }
3624:     if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
3625:     delete (*matstruct)->cprowIndices;
3626:     if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
3627:     if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
3628:     if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

3630:    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3631:     Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
3632:     if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
3633:     for (int i=0; i<3; i++) {
3634:       if (mdata->cuSpMV[i].initialized) {
3635:         err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
3636:         stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
3637:         stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
3638:       }
3639:     }
3640:    #endif
3641:     delete *matstruct;
3642:     *matstruct = NULL;
3643:   }
3644:   return(0);
3645: }

3647: PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors)
3648: {

3652:   if (*trifactors) {
3653:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);
3654:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);
3655:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);
3656:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);
3657:     delete (*trifactors)->rpermIndices;
3658:     delete (*trifactors)->cpermIndices;
3659:     delete (*trifactors)->workVector;
3660:     (*trifactors)->rpermIndices = NULL;
3661:     (*trifactors)->cpermIndices = NULL;
3662:     (*trifactors)->workVector = NULL;
3663:     if ((*trifactors)->a_band_d)   {cudaError_t cerr = cudaFree((*trifactors)->a_band_d);CHKERRCUDA(cerr);}
3664:     if ((*trifactors)->i_band_d)   {cudaError_t cerr = cudaFree((*trifactors)->i_band_d);CHKERRCUDA(cerr);}
3665:     (*trifactors)->init_dev_prop = PETSC_FALSE;
3666:   }
3667:   return(0);
3668: }

3670: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
3671: {
3672:   PetscErrorCode   ierr;
3673:   cusparseHandle_t handle;
3674:   cusparseStatus_t stat;

3677:   if (*trifactors) {
3678:     MatSeqAIJCUSPARSETriFactors_Reset(trifactors);
3679:     if ((handle = (*trifactors)->handle)) {
3680:       stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
3681:     }
3682:     PetscFree(*trifactors);
3683:   }
3684:   return(0);
3685: }

3687: struct IJCompare
3688: {
3689:   __host__ __device__
3690:   inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
3691:   {
3692:     if (t1.get<0>() < t2.get<0>()) return true;
3693:     if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
3694:     return false;
3695:   }
3696: };

3698: struct IJEqual
3699: {
3700:   __host__ __device__
3701:   inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
3702:   {
3703:     if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false;
3704:     return true;
3705:   }
3706: };

3708: struct IJDiff
3709: {
3710:   __host__ __device__
3711:   inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
3712:   {
3713:     return t1 == t2 ? 0 : 1;
3714:   }
3715: };

3717: struct IJSum
3718: {
3719:   __host__ __device__
3720:   inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
3721:   {
3722:     return t1||t2;
3723:   }
3724: };

3726: #include <thrust/iterator/discard_iterator.h>
3727: PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
3728: {
3729:   Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3730:   Mat_SeqAIJ                            *a = (Mat_SeqAIJ*)A->data;
3731:   THRUSTARRAY                           *cooPerm_v = NULL;
3732:   thrust::device_ptr<const PetscScalar> d_v;
3733:   CsrMatrix                             *matrix;
3734:   PetscErrorCode                        ierr;
3735:   PetscInt                              n;

3738:   if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
3739:   if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
3740:   if (!cusp->cooPerm) {
3741:     MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
3742:     MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
3743:     return(0);
3744:   }
3745:   matrix = (CsrMatrix*)cusp->mat->mat;
3746:   if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
3747:   if (!v) {
3748:     if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
3749:     goto finalize;
3750:   }
3751:   n = cusp->cooPerm->size();
3752:   if (isCudaMem(v)) {
3753:     d_v = thrust::device_pointer_cast(v);
3754:   } else {
3755:     cooPerm_v = new THRUSTARRAY(n);
3756:     cooPerm_v->assign(v,v+n);
3757:     d_v = cooPerm_v->data();
3758:     PetscLogCpuToGpu(n*sizeof(PetscScalar));
3759:   }
3760:   PetscLogGpuTimeBegin();
3761:   if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
3762:     if (cusp->cooPerm_a) { /* there are repeated entries in d_v[], and we need to sum them before adding */
3763:       THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
3764:       auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
3765:       /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
3766:         cooPerm_a = [0,0,1,2,3,4]. The length is n, the number of nonzeros in d_v[].
3767:         cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
3768:       */
3769:       thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
3770:       thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
3771:       delete cooPerm_w;
3772:     } else {
3773:       /* all nonzeros in d_v[] are unique entries */
3774:       auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
3775:                                                                 matrix->values->begin()));
3776:       auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
3777:                                                                 matrix->values->end()));
3778:       thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]]  */
3779:     }
3780:   } else {
3781:     if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
3782:       auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
3783:       thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
3784:     } else {
3785:       auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
3786:                                                                 matrix->values->begin()));
3787:       auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
3788:                                                                 matrix->values->end()));
3789:       thrust::for_each(zibit,zieit,VecCUDAEquals());
3790:     }
3791:   }
3792:   PetscLogGpuTimeEnd();
3793: finalize:
3794:   delete cooPerm_v;
3795:   A->offloadmask = PETSC_OFFLOAD_GPU;
3796:   PetscObjectStateIncrease((PetscObject)A);
3797:   /* shorter version of MatAssemblyEnd_SeqAIJ */
3798:   PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);
3799:   PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");
3800:   PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);
3801:   a->reallocs         = 0;
3802:   A->info.mallocs    += 0;
3803:   A->info.nz_unneeded = 0;
3804:   A->assembled = A->was_assembled = PETSC_TRUE;
3805:   A->num_ass++;
3806:   return(0);
3807: }
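/* Worked example of the reductions above (sketch): with preallocated COO entries
   [(0,0),(0,0),(1,2)] and v = [1,2,5], repeated entries are summed first, so
   MatSetValuesCOO(A,v,INSERT_VALUES) stores a(0,0)=3 and a(1,2)=5, while
   MatSetValuesCOO(A,v,ADD_VALUES) adds 3 and 5 to the current values. */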

3809: PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
3810: {
3811:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3812:   PetscErrorCode     ierr;

3816:   if (!cusp) return(0);
3817:   if (destroy) {
3818:     MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);
3819:     delete cusp->csr2csc_i;
3820:     cusp->csr2csc_i = NULL;
3821:   }
3822:   A->transupdated = PETSC_FALSE;
3823:   return(0);
3824: }

3826: #include <thrust/binary_search.h>
3827: PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
3828: {
3829:   PetscErrorCode     ierr;
3830:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3831:   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
3832:   PetscInt           cooPerm_n, nzr = 0;
3833:   cudaError_t        cerr;

3836:   PetscLayoutSetUp(A->rmap);
3837:   PetscLayoutSetUp(A->cmap);
3838:   cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
3839:   if (n != cooPerm_n) {
3840:     delete cusp->cooPerm;
3841:     delete cusp->cooPerm_a;
3842:     cusp->cooPerm = NULL;
3843:     cusp->cooPerm_a = NULL;
3844:   }
3845:   if (n) {
3846:     THRUSTINTARRAY d_i(n);
3847:     THRUSTINTARRAY d_j(n);
3848:     THRUSTINTARRAY ii(A->rmap->n);

3850:     if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
3851:     if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

3853:     PetscLogCpuToGpu(2.*n*sizeof(PetscInt));
3854:     d_i.assign(coo_i,coo_i+n);
3855:     d_j.assign(coo_j,coo_j+n);

3857:     /* Ex.
3858:       n = 6
3859:       coo_i = [3,3,1,4,1,4]
3860:       coo_j = [3,2,2,5,2,6]
3861:     */
3862:     auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
3863:     auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

3865:     PetscLogGpuTimeBegin();
3866:     thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
3867:     thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
3868:     *cusp->cooPerm_a = d_i; /* copy the sorted array */
3869:     THRUSTINTARRAY w = d_j;

3871:     /*
3872:       d_i     = [1,1,3,3,4,4]
3873:       d_j     = [2,2,2,3,5,6]
3874:       cooPerm = [2,4,1,0,3,5]
3875:     */
3876:     auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */

3878:     /*
3879:       d_i     = [1,3,3,4,4,x]
3880:                             ^ekey
3881:       d_j     = [2,2,3,5,6,x]
3882:                            ^nekey
3883:     */
3884:     if (nekey == ekey) { /* all entries are unique */
3885:       delete cusp->cooPerm_a;
3886:       cusp->cooPerm_a = NULL;
3887:     } else { /* Stefano: I couldn't come up with a more elegant algorithm */
3888:       /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
3889:       adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
3890:       adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());                                              /* w:         [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
3891:       (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
3892:       w[0] = 0;
3893:       thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); /* cooPerm_a =          [0,0,1,1,1,1]*/
3894:       thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/
3895:     }
3896:     thrust::counting_iterator<PetscInt> search_begin(0);
3897:     thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
3898:                         search_begin, search_begin + A->rmap->n,  /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
3899:                         ii.begin()); /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
3900:     PetscLogGpuTimeEnd();

3902:     MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);
3903:     a->singlemalloc = PETSC_FALSE;
3904:     a->free_a       = PETSC_TRUE;
3905:     a->free_ij      = PETSC_TRUE;
3906:     PetscMalloc1(A->rmap->n+1,&a->i);
3907:     a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
3908:     cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
3909:     a->nz = a->maxnz = a->i[A->rmap->n];
3910:     a->rmax = 0;
3911:     PetscMalloc1(a->nz,&a->a);
3912:     PetscMalloc1(a->nz,&a->j);
3913:     cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
3914:     if (!a->ilen) { PetscMalloc1(A->rmap->n,&a->ilen); }
3915:     if (!a->imax) { PetscMalloc1(A->rmap->n,&a->imax); }
3916:     for (PetscInt i = 0; i < A->rmap->n; i++) {
3917:       const PetscInt nnzr = a->i[i+1] - a->i[i];
3918:       nzr += (PetscInt)!!(nnzr);
3919:       a->ilen[i] = a->imax[i] = nnzr;
3920:       a->rmax = PetscMax(a->rmax,nnzr);
3921:     }
3922:     a->nonzerorowcnt = nzr;
3923:     A->preallocated = PETSC_TRUE;
3924:     PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));
3925:     MatMarkDiagonal_SeqAIJ(A);
3926:   } else {
3927:     MatSeqAIJSetPreallocation(A,0,NULL);
3928:   }
3929:   MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);

3931:   /* We want to allocate the CUSPARSE struct for matvec now.
3932:      The code is so convoluted now that I prefer to copy zeros */
3933:   PetscArrayzero(a->a,a->nz);
3934:   MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);
3935:   A->offloadmask = PETSC_OFFLOAD_CPU;
3936:   A->nonzerostate++;
3937:   MatSeqAIJCUSPARSECopyToGPU(A);
3938:   MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);

3940:   A->assembled = PETSC_FALSE;
3941:   A->was_assembled = PETSC_FALSE;
3942:   return(0);
3943: }
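/* End-to-end COO assembly sketch (illustrative; reuses the example indices from
   the comments above, with arbitrary values and a matrix A with at least 5 rows
   and 7 columns):

     PetscInt    coo_i[] = {3,3,1,4,1,4};
     PetscInt    coo_j[] = {3,2,2,5,2,6};
     PetscScalar v[]     = {1.,1.,1.,1.,1.,1.};
     MatSetPreallocationCOO(A,6,coo_i,coo_j); // builds the CSR pattern on the GPU
     MatSetValuesCOO(A,v,INSERT_VALUES);      // the repeated (1,2) entries are summed
*/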

3945: /*@C
3946:     MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices.

3948:    Not collective

3950:     Input Parameters:
3951: +   A - the matrix
3952: -   compressed - PETSC_TRUE or PETSC_FALSE, indicating whether the matrix data structure should always be returned in compressed form

3954:     Output Parameters:
3955: +   ia - the CSR row pointers
3956: -   ja - the CSR column indices

3958:     Level: developer

3960:     Notes:
3961:       When compressed is true, the CSR structure does not contain empty rows

3963: .seealso: MatSeqAIJCUSPARSERestoreIJ(), MatSeqAIJCUSPARSEGetArrayRead()
3964: @*/
3965: PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j)
3966: {
3967:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3968:   CsrMatrix          *csr;
3969:   PetscErrorCode     ierr;
3970:   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;

3974:   if (!i || !j) return(0);
3976:   if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3977:   MatSeqAIJCUSPARSECopyToGPU(A);
3978:   if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
3979:   csr = (CsrMatrix*)cusp->mat->mat;
3980:   if (i) {
3981:     if (!compressed && a->compressedrow.use) { /* need full row offset */
3982:       if (!cusp->rowoffsets_gpu) {
3983:         cusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
3984:         cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
3985:         PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));
3986:       }
3987:       *i = cusp->rowoffsets_gpu->data().get();
3988:     } else *i = csr->row_offsets->data().get();
3989:   }
3990:   if (j) *j = csr->column_indices->data().get();
3991:   return(0);
3992: }

3994: /*@C
3995:     MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJCUSPARSEGetIJ()

3997:    Not collective

3999:     Input Parameters:
4000: +   A - the matrix
4001: -   compressed - PETSC_TRUE or PETSC_FALSE, indicating whether the matrix data structure should always be returned in compressed form

4003:     Output Parameters:
4004: +   ia - the CSR row pointers
4005: -   ja - the CSR column indices

4007:     Level: developer

4009: .seealso: MatSeqAIJCUSPARSEGetIJ()
4010: @*/
4011: PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, const int **j)
4012: {
4016:   if (i) *i = NULL;
4017:   if (j) *j = NULL;
4018:   return(0);
4019: }
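/* Usage sketch (illustrative): inspect the device CSR indices of A.

     const int *i,*j;
     MatSeqAIJCUSPARSEGetIJ(A,PETSC_FALSE,&i,&j); // i[] has A->rmap->n + 1 entries
     // ... launch a kernel that reads i[] and j[] on the device ...
     MatSeqAIJCUSPARSERestoreIJ(A,PETSC_FALSE,&i,&j);
*/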

4021: /*@C
4022:    MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

4024:    Not Collective

4026:    Input Parameter:
4027: .   A - a MATSEQAIJCUSPARSE matrix

4029:    Output Parameter:
4030: .   a - pointer to the device data

4032:    Level: developer

4034:    Notes: May trigger host-to-device copies if the up-to-date matrix data is on the host

4036: .seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArrayRead()
4037: @*/
4038: PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
4039: {
4040:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
4041:   CsrMatrix          *csr;
4042:   PetscErrorCode     ierr;

4048:   if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4049:   MatSeqAIJCUSPARSECopyToGPU(A);
4050:   if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4051:   csr = (CsrMatrix*)cusp->mat->mat;
4052:   if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
4053:   *a = csr->values->data().get();
4054:   return(0);
4055: }

4057: /*@C
4058:    MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead()

4060:    Not Collective

4062:    Input Parameter:
4063: .   A - a MATSEQAIJCUSPARSE matrix

4065:    Output Parameter:
4066: .   a - pointer to the device data

4068:    Level: developer

4070: .seealso: MatSeqAIJCUSPARSEGetArrayRead()
4071: @*/
4072: PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
4073: {
4078:   *a = NULL;
4079:   return(0);
4080: }

4082: /*@C
4083:    MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

4085:    Not Collective

4087:    Input Parameter:
4088: .   A - a MATSEQAIJCUSPARSE matrix

4090:    Output Parameter:
4091: .   a - pointer to the device data

4093:    Level: developer

4095:    Notes: May trigger host-to-device copies if the up-to-date matrix data is on the host

4097: .seealso: MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArray()
4098: @*/
4099: PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
4100: {
4101:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
4102:   CsrMatrix          *csr;
4103:   PetscErrorCode     ierr;

4109:   if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4110:   MatSeqAIJCUSPARSECopyToGPU(A);
4111:   if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4112:   csr = (CsrMatrix*)cusp->mat->mat;
4113:   if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
4114:   *a = csr->values->data().get();
4115:   A->offloadmask = PETSC_OFFLOAD_GPU;
4116:   MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);
4117:   return(0);
4118: }
4119: /*@C
4120:    MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from MatSeqAIJCUSPARSEGetArray()

4122:    Not Collective

4124:    Input Parameter:
4125: .   A - a MATSEQAIJCUSPARSE matrix

4127:    Output Parameter:
4128: .   a - pointer to the device data

4130:    Level: developer

4132: .seealso: MatSeqAIJCUSPARSEGetArray()
4133: @*/
4134: PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
4135: {

4142:   PetscObjectStateIncrease((PetscObject)A);
4143:   *a = NULL;
4144:   return(0);
4145: }

4147: /*@C
4148:    MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

4150:    Not Collective

4152:    Input Parameter:
4153: .   A - a MATSEQAIJCUSPARSE matrix

4155:    Output Parameter:
4156: .   a - pointer to the device data

4158:    Level: developer

4160:    Notes: Does not trigger host-to-device copies; it flags the data on the GPU as valid

4162: .seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSERestoreArrayWrite()
4163: @*/
4164: PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
4165: {
4166:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
4167:   CsrMatrix          *csr;
4168:   PetscErrorCode     ierr;

4174:   if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4175:   if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4176:   csr = (CsrMatrix*)cusp->mat->mat;
4177:   if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
4178:   *a = csr->values->data().get();
4179:   A->offloadmask = PETSC_OFFLOAD_GPU;
4180:   MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);
4181:   return(0);
4182: }

4184: /*@C
4185:    MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from MatSeqAIJCUSPARSEGetArrayWrite()

4187:    Not Collective

4189:    Input Parameter:
4190: .   A - a MATSEQAIJCUSPARSE matrix

4192:    Output Parameter:
4193: .   a - pointer to the device data

4195:    Level: developer

4197: .seealso: MatSeqAIJCUSPARSEGetArrayWrite()
4198: @*/
4199: PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
4200: {

4207:   PetscObjectStateIncrease((PetscObject)A);
4208:   *a = NULL;
4209:   return(0);
4210: }
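/* Usage sketch (illustrative; nz is assumed to hold the number of stored
   nonzeros): overwrite the device values of A with thrust through the
   read-write accessor, so that the object state is increased on restore.

     PetscScalar *aa;
     MatSeqAIJCUSPARSEGetArray(A,&aa);                       // device pointer, data now valid on the GPU
     thrust::fill(thrust::device,aa,aa+nz,(PetscScalar)1.0); // set every stored entry to 1
     MatSeqAIJCUSPARSERestoreArray(A,&aa);
*/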

4212: struct IJCompare4
4213: {
4214:   __host__ __device__
4215:   inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
4216:   {
4217:     if (t1.get<0>() < t2.get<0>()) return true;
4218:     if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
4219:     return false;
4220:   }
4221: };

4223: struct Shift
4224: {
4225:   int _shift;

4227:   Shift(int shift) : _shift(shift) {}
4228:   __host__ __device__
4229:   inline int operator() (const int &c)
4230:   {
4231:     return c + _shift;
4232:   }
4233: };
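/* Illustrative use of Shift (sketch): offset column indices on the fly while
   merging, e.g. Shift(3) maps {0,1,2} to {3,4,5}; this is how B's columns are
   moved past A's below.

     auto it = thrust::make_transform_iterator(cols->begin(),Shift(A->cmap->n));
*/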

4235: /* merges two SeqAIJCUSPARSE matrices A, B side by side into C = [A, B], i.e. the [A';B']' operation in MATLAB notation */
4236: PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
4237: {
4238:   PetscErrorCode               ierr;
4239:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
4240:   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
4241:   Mat_SeqAIJCUSPARSEMultStruct *Cmat;
4242:   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
4243:   PetscInt                     Annz,Bnnz;
4244:   cusparseStatus_t             stat;
4245:   PetscInt                     i,m,n,zero = 0;
4246:   cudaError_t                  cerr;

4254:   if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %D != %D",A->rmap->n,B->rmap->n);
4255:   if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
4256:   if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4257:   if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4258:   if (reuse == MAT_INITIAL_MATRIX) {
4259:     m     = A->rmap->n;
4260:     n     = A->cmap->n + B->cmap->n;
4261:     MatCreate(PETSC_COMM_SELF,C);
4262:     MatSetSizes(*C,m,n,m,n);
4263:     MatSetType(*C,MATSEQAIJCUSPARSE);
4264:     c     = (Mat_SeqAIJ*)(*C)->data;
4265:     Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
4266:     Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
4267:     Ccsr  = new CsrMatrix;
4268:     Cmat->cprowIndices      = NULL;
4269:     c->compressedrow.use    = PETSC_FALSE;
4270:     c->compressedrow.nrows  = 0;
4271:     c->compressedrow.i      = NULL;
4272:     c->compressedrow.rindex = NULL;
4273:     Ccusp->workVector       = NULL;
4274:     Ccusp->nrows    = m;
4275:     Ccusp->mat      = Cmat;
4276:     Ccusp->mat->mat = Ccsr;
4277:     Ccsr->num_rows  = m;
4278:     Ccsr->num_cols  = n;
4279:     stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
4280:     stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4281:     stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
4282:     cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
4283:     cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
4284:     cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
4285:     cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4286:     cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4287:     cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4288:     MatSeqAIJCUSPARSECopyToGPU(A);
4289:     MatSeqAIJCUSPARSECopyToGPU(B);
4290:     MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);
4291:     MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);
4292:     if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4293:     if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");

4295:     Acsr = (CsrMatrix*)Acusp->mat->mat;
4296:     Bcsr = (CsrMatrix*)Bcusp->mat->mat;
4297:     Annz = (PetscInt)Acsr->column_indices->size();
4298:     Bnnz = (PetscInt)Bcsr->column_indices->size();
4299:     c->nz = Annz + Bnnz;
4300:     Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
4301:     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
4302:     Ccsr->values = new THRUSTARRAY(c->nz);
4303:     Ccsr->num_entries = c->nz;
4304:     Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
4305:     if (c->nz) {
4306:       auto Acoo = new THRUSTINTARRAY32(Annz);
4307:       auto Bcoo = new THRUSTINTARRAY32(Bnnz);
4308:       auto Ccoo = new THRUSTINTARRAY32(c->nz);
4309:       THRUSTINTARRAY32 *Aroff,*Broff;

4311:       if (a->compressedrow.use) { /* need full row offset */
4312:         if (!Acusp->rowoffsets_gpu) {
4313:           Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
4314:           Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
4315:           PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));
4316:         }
4317:         Aroff = Acusp->rowoffsets_gpu;
4318:       } else Aroff = Acsr->row_offsets;
4319:       if (b->compressedrow.use) { /* need full row offset */
4320:         if (!Bcusp->rowoffsets_gpu) {
4321:           Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
4322:           Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
4323:           PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));
4324:         }
4325:         Broff = Bcusp->rowoffsets_gpu;
4326:       } else Broff = Bcsr->row_offsets;
4327:       PetscLogGpuTimeBegin();
4328:       stat = cusparseXcsr2coo(Acusp->handle,
4329:                               Aroff->data().get(),
4330:                               Annz,
4331:                               m,
4332:                               Acoo->data().get(),
4333:                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4334:       stat = cusparseXcsr2coo(Bcusp->handle,
4335:                               Broff->data().get(),
4336:                               Bnnz,
4337:                               m,
4338:                               Bcoo->data().get(),
4339:                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4340:       /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
4341:       auto Aperm = thrust::make_constant_iterator(1);
4342:       auto Bperm = thrust::make_constant_iterator(0);
4343: #if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
4344:       auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
4345:       auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
4346: #else
4347:       /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
4348:       auto Bcib = Bcsr->column_indices->begin();
4349:       auto Bcie = Bcsr->column_indices->end();
4350:       thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
4351: #endif
4352:       auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
4353:       auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
4354:       auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
4355:       auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
4356:       auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
4357:       auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
4358:       auto p1 = Ccusp->cooPerm->begin();
4359:       auto p2 = Ccusp->cooPerm->begin();
4360:       thrust::advance(p2,Annz);
4361:       PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
4362: #if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
4363:       thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
4364: #endif
4365:       auto cci = thrust::make_counting_iterator(zero);
4366:       auto cce = thrust::make_counting_iterator(c->nz);
4367: #if 0 //Errors on SUMMIT cuda 11.1.0
4368:       PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
4369: #else
4370:       auto pred = thrust::identity<int>();
4371:       PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
4372:       PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
4373: #endif
4374:       stat = cusparseXcoo2csr(Ccusp->handle,
4375:                               Ccoo->data().get(),
4376:                               c->nz,
4377:                               m,
4378:                               Ccsr->row_offsets->data().get(),
4379:                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4380:       PetscLogGpuTimeEnd();
4381:       delete wPerm;
4382:       delete Acoo;
4383:       delete Bcoo;
4384:       delete Ccoo;
4385: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4386:       stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
4387:                                Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
4388:                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4389:                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4390: #endif
4391:       if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
4392:         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4393:         Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
4394:         CsrMatrix *CcsrT = new CsrMatrix;
4395:         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4396:         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;

4398:         (*C)->form_explicit_transpose = PETSC_TRUE;
4399:         (*C)->transupdated = PETSC_TRUE;
4400:         Ccusp->rowoffsets_gpu = NULL;
4401:         CmatT->cprowIndices = NULL;
4402:         CmatT->mat = CcsrT;
4403:         CcsrT->num_rows = n;
4404:         CcsrT->num_cols = m;
4405:         CcsrT->num_entries = c->nz;

4407:         CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
4408:         CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4409:         CcsrT->values = new THRUSTARRAY(c->nz);

4411:         PetscLogGpuTimeBegin();
4412:         auto rT = CcsrT->row_offsets->begin();
4413:         if (AT) {
4414:           rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
4415:           thrust::advance(rT,-1);
4416:         }
4417:         if (BT) {
4418:           auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
4419:           auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
4420:           thrust::copy(titb,tite,rT);
4421:         }
4422:         auto cT = CcsrT->column_indices->begin();
4423:         if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
4424:         if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
4425:         auto vT = CcsrT->values->begin();
4426:         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4427:         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
4428:         PetscLogGpuTimeEnd();

4430:         stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
4431:         stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4432:         stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
4433:         cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
4434:         cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
4435:         cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
4436:         cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4437:         cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4438:         cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4439: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4440:         stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
4441:                                  CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
4442:                                  CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4443:                                  CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4444: #endif
4445:         Ccusp->matTranspose = CmatT;
4446:       }
4447:     }

4449:     c->singlemalloc = PETSC_FALSE;
4450:     c->free_a       = PETSC_TRUE;
4451:     c->free_ij      = PETSC_TRUE;
4452:     PetscMalloc1(m+1,&c->i);
4453:     PetscMalloc1(c->nz,&c->j);
4454:     if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
4455:       THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4456:       THRUSTINTARRAY jj(Ccsr->column_indices->size());
4457:       ii   = *Ccsr->row_offsets;
4458:       jj   = *Ccsr->column_indices;
4459:       cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4460:       cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4461:     } else {
4462:       cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4463:       cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4464:     }
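    /* The assignments ii = *Ccsr->row_offsets and jj = *Ccsr->column_indices rely on
       thrust's converting copy-assignment between device_vectors of different element
       types; a minimal sketch (illustration only) of the widening idiom:

         thrust::device_vector<int>     narrow(3, 7);
         thrust::device_vector<int64_t> wide(3);
         wide = narrow;   // element-wise int -> int64_t conversion, performed on the GPU

       so only already-widened indices cross the device-to-host link. */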
4465:     PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));
4466:     PetscMalloc1(m,&c->ilen);
4467:     PetscMalloc1(m,&c->imax);
4468:     c->maxnz = c->nz;
4469:     c->nonzerorowcnt = 0;
4470:     c->rmax = 0;
4471:     for (i = 0; i < m; i++) {
4472:       const PetscInt nn = c->i[i+1] - c->i[i];
4473:       c->ilen[i] = c->imax[i] = nn;
4474:       c->nonzerorowcnt += (PetscInt)!!nn;
4475:       c->rmax = PetscMax(c->rmax,nn);
4476:     }
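    /* The loop above rebuilds the host-side per-row bookkeeping from the freshly copied
       row offsets: ilen/imax hold the per-row counts, nonzerorowcnt the number of
       nonempty rows, and rmax the longest row. */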
4477:     MatMarkDiagonal_SeqAIJ(*C);
4478:     PetscMalloc1(c->nz,&c->a);
4479:     (*C)->nonzerostate++;
4480:     PetscLayoutSetUp((*C)->rmap);
4481:     PetscLayoutSetUp((*C)->cmap);
4482:     Ccusp->nonzerostate = (*C)->nonzerostate;
4483:     (*C)->preallocated  = PETSC_TRUE;
4484:   } else {
4485:     if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %D != %D",(*C)->rmap->n,B->rmap->n);
4486:     c = (Mat_SeqAIJ*)(*C)->data;
4487:     if (c->nz) {
4488:       Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
4489:       if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
4490:       if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4491:       if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
4492:       MatSeqAIJCUSPARSECopyToGPU(A);
4493:       MatSeqAIJCUSPARSECopyToGPU(B);
4494:       if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4495:       if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4496:       Acsr = (CsrMatrix*)Acusp->mat->mat;
4497:       Bcsr = (CsrMatrix*)Bcusp->mat->mat;
4498:       Ccsr = (CsrMatrix*)Ccusp->mat->mat;
4499:       if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size());
4500:       if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size());
4501:       if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size());
4502:       if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
4503:       if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
4504:       auto pmid = Ccusp->cooPerm->begin();
4505:       thrust::advance(pmid,Acsr->num_entries);
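      /* cooPerm maps the k-th input value to its slot in C's CSR value array; the first
         Acsr->num_entries positions came from A and the rest from B, so pmid marks the split. */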
4506:       PetscLogGpuTimeBegin();
4507:       auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
4508:                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
4509:       auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
4510:                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4511:       thrust::for_each(zibait,zieait,VecCUDAEquals());
4512:       auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
4513:                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4514:       auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
4515:                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
4516:       thrust::for_each(zibbit,ziebit,VecCUDAEquals());
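      /* A minimal sketch (illustration only) of the scatter idiom used above, assuming
         VecCUDAEquals is the functor defined earlier in this file that assigns tuple
         element 0 to element 1:

           struct VecCUDAEquals {
             template <typename Tuple>
             __host__ __device__ void operator()(Tuple t) { thrust::get<1>(t) = thrust::get<0>(t); }
           };

         Zipping the source values with a permutation view of the destination writes
         src[k] into dst[perm[k]] in a single fused kernel:

           thrust::device_vector<PetscScalar> src(3, 1.0), dst(6, 0.0);
           thrust::device_vector<int>         perm(3);  // destination slot of each source entry
           auto zb = thrust::make_zip_iterator(thrust::make_tuple(src.begin(),
                       thrust::make_permutation_iterator(dst.begin(), perm.begin())));
           auto ze = thrust::make_zip_iterator(thrust::make_tuple(src.end(),
                       thrust::make_permutation_iterator(dst.begin(), perm.end())));
           thrust::for_each(zb, ze, VecCUDAEquals());
      */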
4517:       MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);
4518:       if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
4519:         if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
4520:         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4521:         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4522:         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4523:         CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
4524:         auto vT = CcsrT->values->begin();
4525:         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4526:         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
4527:         (*C)->transupdated = PETSC_TRUE;
4528:       }
4529:       PetscLogGpuTimeEnd();
4530:     }
4531:   }
4532:   PetscObjectStateIncrease((PetscObject)*C);
4533:   (*C)->assembled     = PETSC_TRUE;
4534:   (*C)->was_assembled = PETSC_FALSE;
4535:   (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
4536:   return(0);
4537: }

4539: static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
4540: {
4541:   PetscErrorCode    ierr;
4542:   bool              dmem;
4543:   const PetscScalar *av;
4544:   cudaError_t       cerr;

4547:   dmem = isCudaMem(v);
4548:   MatSeqAIJCUSPARSEGetArrayRead(A,&av);
4549:   if (n && idx) {
4550:     THRUSTINTARRAY widx(n);
4551:     widx.assign(idx,idx+n);
4552:     PetscLogCpuToGpu(n*sizeof(PetscInt));

4554:     THRUSTARRAY *w = NULL;
4555:     thrust::device_ptr<PetscScalar> dv;
4556:     if (dmem) {
4557:       dv = thrust::device_pointer_cast(v);
4558:     } else {
4559:       w = new THRUSTARRAY(n);
4560:       dv = w->data();
4561:     }
4562:     thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

4564:     auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
4565:     auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
4566:     thrust::for_each(zibit,zieit,VecCUDAEquals());
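    /* Here the permutation view sits on the source side, so this is a gather:
       dv[k] = av[idx[k]] for k = 0..n-1, staged in w when v is host memory. */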
4567:     if (w) {
4568:       cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4569:     }
4570:     delete w;
4571:   } else {
4572:     cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4573:   }
4574:   if (!dmem) { PetscLogGpuToCpu(n*sizeof(PetscScalar)); } /* v is host memory, so the copies above moved data GPU -> CPU */
4575:   MatSeqAIJCUSPARSERestoreArrayRead(A,&av);
4576:   return(0);
4577: }