Actual source code: baij.c


  2: /*
  3:     Defines the basic matrix operations for the BAIJ (compressed row)
  4:   matrix storage format.
  5: */
  6: #include <../src/mat/impls/baij/seq/baij.h>
  7: #include <petscblaslapack.h>
  8: #include <petsc/private/kernels/blockinvert.h>
  9: #include <petsc/private/kernels/blockmatmult.h>

 11: #if defined(PETSC_HAVE_HYPRE)
 12: PETSC_INTERN PetscErrorCode MatConvert_AIJ_HYPRE(Mat,MatType,MatReuse,Mat*);
 13: #endif

 15: #if defined(PETSC_HAVE_MKL_SPARSE_OPTIMIZE)
 16: PETSC_INTERN PetscErrorCode MatConvert_SeqBAIJ_SeqBAIJMKL(Mat,MatType,MatReuse,Mat*);
 17: #endif
 18: PETSC_INTERN PetscErrorCode MatConvert_XAIJ_IS(Mat,MatType,MatReuse,Mat*);

 20: PetscErrorCode MatGetColumnReductions_SeqBAIJ(Mat A,PetscInt type,PetscReal *reductions)
 21: {
 23:   Mat_SeqBAIJ    *a_aij = (Mat_SeqBAIJ*) A->data;
 24:   PetscInt       m,n,i;
 25:   PetscInt       ib,jb,bs = A->rmap->bs;
 26:   MatScalar      *a_val = a_aij->a;

 29:   MatGetSize(A,&m,&n);
 30:   for (i=0; i<n; i++) reductions[i] = 0.0;
 31:   if (type == NORM_2) {
 32:     for (i=a_aij->i[0]; i<a_aij->i[A->rmap->n/bs]; i++) {
 33:       for (jb=0; jb<bs; jb++) {
 34:         for (ib=0; ib<bs; ib++) {
 35:           reductions[A->cmap->rstart + a_aij->j[i] * bs + jb] += PetscAbsScalar(*a_val * *a_val);
 36:           a_val++;
 37:         }
 38:       }
 39:     }
 40:   } else if (type == NORM_1) {
 41:     for (i=a_aij->i[0]; i<a_aij->i[A->rmap->n/bs]; i++) {
 42:       for (jb=0; jb<bs; jb++) {
 43:         for (ib=0; ib<bs; ib++) {
 44:           reductions[A->cmap->rstart + a_aij->j[i] * bs + jb] += PetscAbsScalar(*a_val);
 45:           a_val++;
 46:         }
 47:       }
 48:     }
 49:   } else if (type == NORM_INFINITY) {
 50:     for (i=a_aij->i[0]; i<a_aij->i[A->rmap->n/bs]; i++) {
 51:       for (jb=0; jb<bs; jb++) {
 52:         for (ib=0; ib<bs; ib++) {
 53:           int col = A->cmap->rstart + a_aij->j[i] * bs + jb;
 54:           reductions[col] = PetscMax(PetscAbsScalar(*a_val), reductions[col]);
 55:           a_val++;
 56:         }
 57:       }
 58:     }
 59:   } else if (type == REDUCTION_SUM_REALPART || type == REDUCTION_MEAN_REALPART) {
 60:     for (i=a_aij->i[0]; i<a_aij->i[A->rmap->n/bs]; i++) {
 61:       for (jb=0; jb<bs; jb++) {
 62:         for (ib=0; ib<bs; ib++) {
 63:           reductions[A->cmap->rstart + a_aij->j[i] * bs + jb] += PetscRealPart(*a_val);
 64:           a_val++;
 65:         }
 66:       }
 67:     }
 68:   } else if (type == REDUCTION_SUM_IMAGINARYPART || type == REDUCTION_MEAN_IMAGINARYPART) {
 69:     for (i=a_aij->i[0]; i<a_aij->i[A->rmap->n/bs]; i++) {
 70:       for (jb=0; jb<bs; jb++) {
 71:         for (ib=0; ib<bs; ib++) {
 72:           reductions[A->cmap->rstart + a_aij->j[i] * bs + jb] += PetscImaginaryPart(*a_val);
 73:           a_val++;
 74:         }
 75:       }
 76:     }
 77:   } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Unknown reduction type");
 78:   if (type == NORM_2) {
 79:     for (i=0; i<n; i++) reductions[i] = PetscSqrtReal(reductions[i]);
 80:   } else if (type == REDUCTION_MEAN_REALPART || type == REDUCTION_MEAN_IMAGINARYPART) {
 81:     for (i=0; i<n; i++) reductions[i] /= m;
 82:   }
 83:   return(0);
 84: }

 86: PetscErrorCode MatInvertBlockDiagonal_SeqBAIJ(Mat A,const PetscScalar **values)
 87: {
 88:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*) A->data;
 90:   PetscInt       *diag_offset,i,bs = A->rmap->bs,mbs = a->mbs,ipvt[5],bs2 = bs*bs,*v_pivots;
 91:   MatScalar      *v    = a->a,*odiag,*diag,work[25],*v_work;
 92:   PetscReal      shift = 0.0;
 93:   PetscBool      allowzeropivot,zeropivotdetected=PETSC_FALSE;

 96:   allowzeropivot = PetscNot(A->erroriffailure);

 98:   if (a->idiagvalid) {
 99:     if (values) *values = a->idiag;
100:     return(0);
101:   }
102:   MatMarkDiagonal_SeqBAIJ(A);
103:   diag_offset = a->diag;
104:   if (!a->idiag) {
105:     PetscMalloc1(bs2*mbs,&a->idiag);
106:     PetscLogObjectMemory((PetscObject)A,bs2*mbs*sizeof(PetscScalar));
107:   }
108:   diag  = a->idiag;
109:   if (values) *values = a->idiag;
110:   /* factor and invert each block */
111:   switch (bs) {
112:   case 1:
113:     for (i=0; i<mbs; i++) {
114:       odiag    = v + 1*diag_offset[i];
115:       diag[0]  = odiag[0];

117:       if (PetscAbsScalar(diag[0] + shift) < PETSC_MACHINE_EPSILON) {
118:         if (allowzeropivot) {
119:           A->factorerrortype             = MAT_FACTOR_NUMERIC_ZEROPIVOT;
120:           A->factorerror_zeropivot_value = PetscAbsScalar(diag[0]);
121:           A->factorerror_zeropivot_row   = i;
122:           PetscInfo1(A,"Zero pivot, row %D\n",i);
123:         } else SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot, row %D pivot value %g tolerance %g",i,(double)PetscAbsScalar(diag[0]),(double)PETSC_MACHINE_EPSILON);
124:       }

126:       diag[0]  = (PetscScalar)1.0 / (diag[0] + shift);
127:       diag    += 1;
128:     }
129:     break;
130:   case 2:
131:     for (i=0; i<mbs; i++) {
132:       odiag    = v + 4*diag_offset[i];
133:       diag[0]  = odiag[0]; diag[1] = odiag[1]; diag[2] = odiag[2]; diag[3] = odiag[3];
134:       PetscKernel_A_gets_inverse_A_2(diag,shift,allowzeropivot,&zeropivotdetected);
135:       if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
136:       diag    += 4;
137:     }
138:     break;
139:   case 3:
140:     for (i=0; i<mbs; i++) {
141:       odiag    = v + 9*diag_offset[i];
142:       diag[0]  = odiag[0]; diag[1] = odiag[1]; diag[2] = odiag[2]; diag[3] = odiag[3];
143:       diag[4]  = odiag[4]; diag[5] = odiag[5]; diag[6] = odiag[6]; diag[7] = odiag[7];
144:       diag[8]  = odiag[8];
145:       PetscKernel_A_gets_inverse_A_3(diag,shift,allowzeropivot,&zeropivotdetected);
146:       if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
147:       diag    += 9;
148:     }
149:     break;
150:   case 4:
151:     for (i=0; i<mbs; i++) {
152:       odiag  = v + 16*diag_offset[i];
153:       PetscArraycpy(diag,odiag,16);
154:       PetscKernel_A_gets_inverse_A_4(diag,shift,allowzeropivot,&zeropivotdetected);
155:       if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
156:       diag  += 16;
157:     }
158:     break;
159:   case 5:
160:     for (i=0; i<mbs; i++) {
161:       odiag  = v + 25*diag_offset[i];
162:       PetscArraycpy(diag,odiag,25);
163:       PetscKernel_A_gets_inverse_A_5(diag,ipvt,work,shift,allowzeropivot,&zeropivotdetected);
164:       if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
165:       diag  += 25;
166:     }
167:     break;
168:   case 6:
169:     for (i=0; i<mbs; i++) {
170:       odiag  = v + 36*diag_offset[i];
171:       PetscArraycpy(diag,odiag,36);
172:       PetscKernel_A_gets_inverse_A_6(diag,shift,allowzeropivot,&zeropivotdetected);
173:       if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
174:       diag  += 36;
175:     }
176:     break;
177:   case 7:
178:     for (i=0; i<mbs; i++) {
179:       odiag  = v + 49*diag_offset[i];
180:       PetscArraycpy(diag,odiag,49);
181:       PetscKernel_A_gets_inverse_A_7(diag,shift,allowzeropivot,&zeropivotdetected);
182:       if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
183:       diag  += 49;
184:     }
185:     break;
186:   default:
187:     PetscMalloc2(bs,&v_work,bs,&v_pivots);
188:     for (i=0; i<mbs; i++) {
189:       odiag  = v + bs2*diag_offset[i];
190:       PetscArraycpy(diag,odiag,bs2);
191:       PetscKernel_A_gets_inverse_A(bs,diag,v_pivots,v_work,allowzeropivot,&zeropivotdetected);
192:       if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
193:       diag  += bs2;
194:     }
195:     PetscFree2(v_work,v_pivots);
196:   }
197:   a->idiagvalid = PETSC_TRUE;
198:   return(0);
199: }

201: PetscErrorCode MatSOR_SeqBAIJ(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
202: {
203:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
204:   PetscScalar       *x,*work,*w,*workt,*t;
205:   const MatScalar   *v,*aa = a->a, *idiag;
206:   const PetscScalar *b,*xb;
207:   PetscScalar       s[7], xw[7]={0}; /* avoid some compilers thinking xw is uninitialized */
208:   PetscErrorCode    ierr;
209:   PetscInt          m = a->mbs,i,i2,nz,bs = A->rmap->bs,bs2 = bs*bs,k,j,idx,it;
210:   const PetscInt    *diag,*ai = a->i,*aj = a->j,*vi;

213:   its = its*lits;
214:   if (flag & SOR_EISENSTAT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support yet for Eisenstat");
215:   if (its <= 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Relaxation requires global its %D and local its %D both positive",its,lits);
216:   if (fshift) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for diagonal shift");
217:   if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for non-trivial relaxation factor");
218:   if ((flag & SOR_APPLY_UPPER) || (flag & SOR_APPLY_LOWER)) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for applying upper or lower triangular parts");

220:   if (!a->idiagvalid) {MatInvertBlockDiagonal(A,NULL);}

222:   if (!m) return(0);
223:   diag  = a->diag;
224:   idiag = a->idiag;
225:   k    = PetscMax(A->rmap->n,A->cmap->n);
226:   if (!a->mult_work) {
227:     PetscMalloc1(k+1,&a->mult_work);
228:   }
229:   if (!a->sor_workt) {
230:     PetscMalloc1(k,&a->sor_workt);
231:   }
232:   if (!a->sor_work) {
233:     PetscMalloc1(bs,&a->sor_work);
234:   }
235:   work = a->mult_work;
236:   t    = a->sor_workt;
237:   w    = a->sor_work;

239:   VecGetArray(xx,&x);
240:   VecGetArrayRead(bb,&b);

242:   if (flag & SOR_ZERO_INITIAL_GUESS) {
243:     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
244:       switch (bs) {
245:       case 1:
246:         PetscKernel_v_gets_A_times_w_1(x,idiag,b);
247:         t[0] = b[0];
248:         i2     = 1;
249:         idiag += 1;
250:         for (i=1; i<m; i++) {
251:           v  = aa + ai[i];
252:           vi = aj + ai[i];
253:           nz = diag[i] - ai[i];
254:           s[0] = b[i2];
255:           for (j=0; j<nz; j++) {
256:             xw[0] = x[vi[j]];
257:             PetscKernel_v_gets_v_minus_A_times_w_1(s,(v+j),xw);
258:           }
259:           t[i2] = s[0];
260:           PetscKernel_v_gets_A_times_w_1(xw,idiag,s);
261:           x[i2]  = xw[0];
262:           idiag += 1;
263:           i2    += 1;
264:         }
265:         break;
266:       case 2:
267:         PetscKernel_v_gets_A_times_w_2(x,idiag,b);
268:         t[0] = b[0]; t[1] = b[1];
269:         i2     = 2;
270:         idiag += 4;
271:         for (i=1; i<m; i++) {
272:           v  = aa + 4*ai[i];
273:           vi = aj + ai[i];
274:           nz = diag[i] - ai[i];
275:           s[0] = b[i2]; s[1] = b[i2+1];
276:           for (j=0; j<nz; j++) {
277:             idx = 2*vi[j];
278:             it  = 4*j;
279:             xw[0] = x[idx]; xw[1] = x[1+idx];
280:             PetscKernel_v_gets_v_minus_A_times_w_2(s,(v+it),xw);
281:           }
282:           t[i2] = s[0]; t[i2+1] = s[1];
283:           PetscKernel_v_gets_A_times_w_2(xw,idiag,s);
284:           x[i2]   = xw[0]; x[i2+1] = xw[1];
285:           idiag  += 4;
286:           i2     += 2;
287:         }
288:         break;
289:       case 3:
290:         PetscKernel_v_gets_A_times_w_3(x,idiag,b);
291:         t[0] = b[0]; t[1] = b[1]; t[2] = b[2];
292:         i2     = 3;
293:         idiag += 9;
294:         for (i=1; i<m; i++) {
295:           v  = aa + 9*ai[i];
296:           vi = aj + ai[i];
297:           nz = diag[i] - ai[i];
298:           s[0] = b[i2]; s[1] = b[i2+1]; s[2] = b[i2+2];
299:           while (nz--) {
300:             idx = 3*(*vi++);
301:             xw[0] = x[idx]; xw[1] = x[1+idx]; xw[2] = x[2+idx];
302:             PetscKernel_v_gets_v_minus_A_times_w_3(s,v,xw);
303:             v  += 9;
304:           }
305:           t[i2] = s[0]; t[i2+1] = s[1]; t[i2+2] = s[2];
306:           PetscKernel_v_gets_A_times_w_3(xw,idiag,s);
307:           x[i2] = xw[0]; x[i2+1] = xw[1]; x[i2+2] = xw[2];
308:           idiag  += 9;
309:           i2     += 3;
310:         }
311:         break;
312:       case 4:
313:         PetscKernel_v_gets_A_times_w_4(x,idiag,b);
314:         t[0] = b[0]; t[1] = b[1]; t[2] = b[2]; t[3] = b[3];
315:         i2     = 4;
316:         idiag += 16;
317:         for (i=1; i<m; i++) {
318:           v  = aa + 16*ai[i];
319:           vi = aj + ai[i];
320:           nz = diag[i] - ai[i];
321:           s[0] = b[i2]; s[1] = b[i2+1]; s[2] = b[i2+2]; s[3] = b[i2+3];
322:           while (nz--) {
323:             idx = 4*(*vi++);
324:             xw[0]  = x[idx]; xw[1] = x[1+idx]; xw[2] = x[2+idx]; xw[3] = x[3+idx];
325:             PetscKernel_v_gets_v_minus_A_times_w_4(s,v,xw);
326:             v  += 16;
327:           }
328:           t[i2] = s[0]; t[i2+1] = s[1]; t[i2+2] = s[2]; t[i2 + 3] = s[3];
329:           PetscKernel_v_gets_A_times_w_4(xw,idiag,s);
330:           x[i2] = xw[0]; x[i2+1] = xw[1]; x[i2+2] = xw[2]; x[i2+3] = xw[3];
331:           idiag  += 16;
332:           i2     += 4;
333:         }
334:         break;
335:       case 5:
336:         PetscKernel_v_gets_A_times_w_5(x,idiag,b);
337:         t[0] = b[0]; t[1] = b[1]; t[2] = b[2]; t[3] = b[3]; t[4] = b[4];
338:         i2     = 5;
339:         idiag += 25;
340:         for (i=1; i<m; i++) {
341:           v  = aa + 25*ai[i];
342:           vi = aj + ai[i];
343:           nz = diag[i] - ai[i];
344:           s[0] = b[i2]; s[1] = b[i2+1]; s[2] = b[i2+2]; s[3] = b[i2+3]; s[4] = b[i2+4];
345:           while (nz--) {
346:             idx = 5*(*vi++);
347:             xw[0]  = x[idx]; xw[1] = x[1+idx]; xw[2] = x[2+idx]; xw[3] = x[3+idx]; xw[4] = x[4+idx];
348:             PetscKernel_v_gets_v_minus_A_times_w_5(s,v,xw);
349:             v  += 25;
350:           }
351:           t[i2] = s[0]; t[i2+1] = s[1]; t[i2+2] = s[2]; t[i2+3] = s[3]; t[i2+4] = s[4];
352:           PetscKernel_v_gets_A_times_w_5(xw,idiag,s);
353:           x[i2] = xw[0]; x[i2+1] = xw[1]; x[i2+2] = xw[2]; x[i2+3] = xw[3]; x[i2+4] = xw[4];
354:           idiag  += 25;
355:           i2     += 5;
356:         }
357:         break;
358:       case 6:
359:         PetscKernel_v_gets_A_times_w_6(x,idiag,b);
360:         t[0] = b[0]; t[1] = b[1]; t[2] = b[2]; t[3] = b[3]; t[4] = b[4]; t[5] = b[5];
361:         i2     = 6;
362:         idiag += 36;
363:         for (i=1; i<m; i++) {
364:           v  = aa + 36*ai[i];
365:           vi = aj + ai[i];
366:           nz = diag[i] - ai[i];
367:           s[0] = b[i2]; s[1] = b[i2+1]; s[2] = b[i2+2]; s[3] = b[i2+3]; s[4] = b[i2+4]; s[5] = b[i2+5];
368:           while (nz--) {
369:             idx = 6*(*vi++);
370:             xw[0] = x[idx];   xw[1] = x[1+idx]; xw[2] = x[2+idx];
371:             xw[3] = x[3+idx]; xw[4] = x[4+idx]; xw[5] = x[5+idx];
372:             PetscKernel_v_gets_v_minus_A_times_w_6(s,v,xw);
373:             v  += 36;
374:           }
375:           t[i2]   = s[0]; t[i2+1] = s[1]; t[i2+2] = s[2];
376:           t[i2+3] = s[3]; t[i2+4] = s[4]; t[i2+5] = s[5];
377:           PetscKernel_v_gets_A_times_w_6(xw,idiag,s);
378:           x[i2] = xw[0]; x[i2+1] = xw[1]; x[i2+2] = xw[2]; x[i2+3] = xw[3]; x[i2+4] = xw[4]; x[i2+5] = xw[5];
379:           idiag  += 36;
380:           i2     += 6;
381:         }
382:         break;
383:       case 7:
384:         PetscKernel_v_gets_A_times_w_7(x,idiag,b);
385:         t[0] = b[0]; t[1] = b[1]; t[2] = b[2];
386:         t[3] = b[3]; t[4] = b[4]; t[5] = b[5]; t[6] = b[6];
387:         i2     = 7;
388:         idiag += 49;
389:         for (i=1; i<m; i++) {
390:           v  = aa + 49*ai[i];
391:           vi = aj + ai[i];
392:           nz = diag[i] - ai[i];
393:           s[0] = b[i2];   s[1] = b[i2+1]; s[2] = b[i2+2];
394:           s[3] = b[i2+3]; s[4] = b[i2+4]; s[5] = b[i2+5]; s[6] = b[i2+6];
395:           while (nz--) {
396:             idx = 7*(*vi++);
397:             xw[0] = x[idx];   xw[1] = x[1+idx]; xw[2] = x[2+idx];
398:             xw[3] = x[3+idx]; xw[4] = x[4+idx]; xw[5] = x[5+idx]; xw[6] = x[6+idx];
399:             PetscKernel_v_gets_v_minus_A_times_w_7(s,v,xw);
400:             v  += 49;
401:           }
402:           t[i2]   = s[0]; t[i2+1] = s[1]; t[i2+2] = s[2];
403:           t[i2+3] = s[3]; t[i2+4] = s[4]; t[i2+5] = s[5]; t[i2+6] = s[6];
404:           PetscKernel_v_gets_A_times_w_7(xw,idiag,s);
405:           x[i2] =   xw[0]; x[i2+1] = xw[1]; x[i2+2] = xw[2];
406:           x[i2+3] = xw[3]; x[i2+4] = xw[4]; x[i2+5] = xw[5]; x[i2+6] = xw[6];
407:           idiag  += 49;
408:           i2     += 7;
409:         }
410:         break;
411:       default:
412:         PetscKernel_w_gets_Ar_times_v(bs,bs,b,idiag,x);
413:         PetscArraycpy(t,b,bs);
414:         i2     = bs;
415:         idiag += bs2;
416:         for (i=1; i<m; i++) {
417:           v  = aa + bs2*ai[i];
418:           vi = aj + ai[i];
419:           nz = diag[i] - ai[i];

421:           PetscArraycpy(w,b+i2,bs);
422:           /* copy all rows of x that are needed into contiguous space */
423:           workt = work;
424:           for (j=0; j<nz; j++) {
425:             PetscArraycpy(workt,x + bs*(*vi++),bs);
426:             workt += bs;
427:           }
428:           PetscKernel_w_gets_w_minus_Ar_times_v(bs,bs*nz,w,v,work);
429:           PetscArraycpy(t+i2,w,bs);
430:           PetscKernel_w_gets_Ar_times_v(bs,bs,w,idiag,x+i2);

432:           idiag += bs2;
433:           i2    += bs;
434:         }
435:         break;
436:       }
437:       /* for logging purposes assume number of nonzero in lower half is 1/2 of total */
438:       PetscLogFlops(1.0*bs2*a->nz);
439:       xb = t;
440:     }
441:     else xb = b;
442:     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
443:       idiag = a->idiag+bs2*(a->mbs-1);
444:       i2 = bs * (m-1);
445:       switch (bs) {
446:       case 1:
447:         s[0]  = xb[i2];
448:         PetscKernel_v_gets_A_times_w_1(xw,idiag,s);
449:         x[i2] = xw[0];
450:         i2   -= 1;
451:         for (i=m-2; i>=0; i--) {
452:           v  = aa + (diag[i]+1);
453:           vi = aj + diag[i] + 1;
454:           nz = ai[i+1] - diag[i] - 1;
455:           s[0] = xb[i2];
456:           for (j=0; j<nz; j++) {
457:             xw[0] = x[vi[j]];
458:             PetscKernel_v_gets_v_minus_A_times_w_1(s,(v+j),xw);
459:           }
460:           PetscKernel_v_gets_A_times_w_1(xw,idiag,s);
461:           x[i2]  = xw[0];
462:           idiag -= 1;
463:           i2    -= 1;
464:         }
465:         break;
466:       case 2:
467:         s[0]  = xb[i2]; s[1] = xb[i2+1];
468:         PetscKernel_v_gets_A_times_w_2(xw,idiag,s);
469:         x[i2] = xw[0]; x[i2+1] = xw[1];
470:         i2    -= 2;
471:         idiag -= 4;
472:         for (i=m-2; i>=0; i--) {
473:           v  = aa + 4*(diag[i] + 1);
474:           vi = aj + diag[i] + 1;
475:           nz = ai[i+1] - diag[i] - 1;
476:           s[0] = xb[i2]; s[1] = xb[i2+1];
477:           for (j=0; j<nz; j++) {
478:             idx = 2*vi[j];
479:             it  = 4*j;
480:             xw[0] = x[idx]; xw[1] = x[1+idx];
481:             PetscKernel_v_gets_v_minus_A_times_w_2(s,(v+it),xw);
482:           }
483:           PetscKernel_v_gets_A_times_w_2(xw,idiag,s);
484:           x[i2]   = xw[0]; x[i2+1] = xw[1];
485:           idiag  -= 4;
486:           i2     -= 2;
487:         }
488:         break;
489:       case 3:
490:         s[0]  = xb[i2]; s[1] = xb[i2+1]; s[2] = xb[i2+2];
491:         PetscKernel_v_gets_A_times_w_3(xw,idiag,s);
492:         x[i2] = xw[0]; x[i2+1] = xw[1]; x[i2+2] = xw[2];
493:         i2    -= 3;
494:         idiag -= 9;
495:         for (i=m-2; i>=0; i--) {
496:           v  = aa + 9*(diag[i]+1);
497:           vi = aj + diag[i] + 1;
498:           nz = ai[i+1] - diag[i] - 1;
499:           s[0] = xb[i2]; s[1] = xb[i2+1]; s[2] = xb[i2+2];
500:           while (nz--) {
501:             idx = 3*(*vi++);
502:             xw[0] = x[idx]; xw[1] = x[1+idx]; xw[2] = x[2+idx];
503:             PetscKernel_v_gets_v_minus_A_times_w_3(s,v,xw);
504:             v  += 9;
505:           }
506:           PetscKernel_v_gets_A_times_w_3(xw,idiag,s);
507:           x[i2] = xw[0]; x[i2+1] = xw[1]; x[i2+2] = xw[2];
508:           idiag  -= 9;
509:           i2     -= 3;
510:         }
511:         break;
512:       case 4:
513:         s[0]  = xb[i2]; s[1] = xb[i2+1]; s[2] = xb[i2+2]; s[3] = xb[i2+3];
514:         PetscKernel_v_gets_A_times_w_4(xw,idiag,s);
515:         x[i2] = xw[0]; x[i2+1] = xw[1]; x[i2+2] = xw[2]; x[i2+3] = xw[3];
516:         i2    -= 4;
517:         idiag -= 16;
518:         for (i=m-2; i>=0; i--) {
519:           v  = aa + 16*(diag[i]+1);
520:           vi = aj + diag[i] + 1;
521:           nz = ai[i+1] - diag[i] - 1;
522:           s[0] = xb[i2]; s[1] = xb[i2+1]; s[2] = xb[i2+2]; s[3] = xb[i2+3];
523:           while (nz--) {
524:             idx = 4*(*vi++);
525:             xw[0]  = x[idx]; xw[1] = x[1+idx]; xw[2] = x[2+idx]; xw[3] = x[3+idx];
526:             PetscKernel_v_gets_v_minus_A_times_w_4(s,v,xw);
527:             v  += 16;
528:           }
529:           PetscKernel_v_gets_A_times_w_4(xw,idiag,s);
530:           x[i2] = xw[0]; x[i2+1] = xw[1]; x[i2+2] = xw[2]; x[i2+3] = xw[3];
531:           idiag  -= 16;
532:           i2     -= 4;
533:         }
534:         break;
535:       case 5:
536:         s[0]  = xb[i2]; s[1] = xb[i2+1]; s[2] = xb[i2+2]; s[3] = xb[i2+3]; s[4] = xb[i2+4];
537:         PetscKernel_v_gets_A_times_w_5(xw,idiag,s);
538:         x[i2] = xw[0]; x[i2+1] = xw[1]; x[i2+2] = xw[2]; x[i2+3] = xw[3]; x[i2+4] = xw[4];
539:         i2    -= 5;
540:         idiag -= 25;
541:         for (i=m-2; i>=0; i--) {
542:           v  = aa + 25*(diag[i]+1);
543:           vi = aj + diag[i] + 1;
544:           nz = ai[i+1] - diag[i] - 1;
545:           s[0] = xb[i2]; s[1] = xb[i2+1]; s[2] = xb[i2+2]; s[3] = xb[i2+3]; s[4] = xb[i2+4];
546:           while (nz--) {
547:             idx = 5*(*vi++);
548:             xw[0]  = x[idx]; xw[1] = x[1+idx]; xw[2] = x[2+idx]; xw[3] = x[3+idx]; xw[4] = x[4+idx];
549:             PetscKernel_v_gets_v_minus_A_times_w_5(s,v,xw);
550:             v  += 25;
551:           }
552:           PetscKernel_v_gets_A_times_w_5(xw,idiag,s);
553:           x[i2] = xw[0]; x[i2+1] = xw[1]; x[i2+2] = xw[2]; x[i2+3] = xw[3]; x[i2+4] = xw[4];
554:           idiag  -= 25;
555:           i2     -= 5;
556:         }
557:         break;
558:       case 6:
559:         s[0]  = xb[i2]; s[1] = xb[i2+1]; s[2] = xb[i2+2]; s[3] = xb[i2+3]; s[4] = xb[i2+4]; s[5] = xb[i2+5];
560:         PetscKernel_v_gets_A_times_w_6(xw,idiag,s);
561:         x[i2] = xw[0]; x[i2+1] = xw[1]; x[i2+2] = xw[2]; x[i2+3] = xw[3]; x[i2+4] = xw[4]; x[i2+5] = xw[5];
562:         i2    -= 6;
563:         idiag -= 36;
564:         for (i=m-2; i>=0; i--) {
565:           v  = aa + 36*(diag[i]+1);
566:           vi = aj + diag[i] + 1;
567:           nz = ai[i+1] - diag[i] - 1;
568:           s[0] = xb[i2]; s[1] = xb[i2+1]; s[2] = xb[i2+2]; s[3] = xb[i2+3]; s[4] = xb[i2+4]; s[5] = xb[i2+5];
569:           while (nz--) {
570:             idx = 6*(*vi++);
571:             xw[0] = x[idx];   xw[1] = x[1+idx]; xw[2] = x[2+idx];
572:             xw[3] = x[3+idx]; xw[4] = x[4+idx]; xw[5] = x[5+idx];
573:             PetscKernel_v_gets_v_minus_A_times_w_6(s,v,xw);
574:             v  += 36;
575:           }
576:           PetscKernel_v_gets_A_times_w_6(xw,idiag,s);
577:           x[i2] = xw[0]; x[i2+1] = xw[1]; x[i2+2] = xw[2]; x[i2+3] = xw[3]; x[i2+4] = xw[4]; x[i2+5] = xw[5];
578:           idiag  -= 36;
579:           i2     -= 6;
580:         }
581:         break;
582:       case 7:
583:         s[0] = xb[i2];   s[1] = xb[i2+1]; s[2] = xb[i2+2];
584:         s[3] = xb[i2+3]; s[4] = xb[i2+4]; s[5] = xb[i2+5]; s[6] = xb[i2+6];
585:         PetscKernel_v_gets_A_times_w_7(x,idiag,b);
586:         x[i2]   = xw[0]; x[i2+1] = xw[1]; x[i2+2] = xw[2];
587:         x[i2+3] = xw[3]; x[i2+4] = xw[4]; x[i2+5] = xw[5]; x[i2+6] = xw[6];
588:         i2    -= 7;
589:         idiag -= 49;
590:         for (i=m-2; i>=0; i--) {
591:           v  = aa + 49*(diag[i]+1);
592:           vi = aj + diag[i] + 1;
593:           nz = ai[i+1] - diag[i] - 1;
594:           s[0] = xb[i2];   s[1] = xb[i2+1]; s[2] = xb[i2+2];
595:           s[3] = xb[i2+3]; s[4] = xb[i2+4]; s[5] = xb[i2+5]; s[6] = xb[i2+6];
596:           while (nz--) {
597:             idx = 7*(*vi++);
598:             xw[0] = x[idx];   xw[1] = x[1+idx]; xw[2] = x[2+idx];
599:             xw[3] = x[3+idx]; xw[4] = x[4+idx]; xw[5] = x[5+idx]; xw[6] = x[6+idx];
600:             PetscKernel_v_gets_v_minus_A_times_w_7(s,v,xw);
601:             v  += 49;
602:           }
603:           PetscKernel_v_gets_A_times_w_7(xw,idiag,s);
604:           x[i2] =   xw[0]; x[i2+1] = xw[1]; x[i2+2] = xw[2];
605:           x[i2+3] = xw[3]; x[i2+4] = xw[4]; x[i2+5] = xw[5]; x[i2+6] = xw[6];
606:           idiag  -= 49;
607:           i2     -= 7;
608:         }
609:         break;
610:       default:
611:         PetscArraycpy(w,xb+i2,bs);
612:         PetscKernel_w_gets_Ar_times_v(bs,bs,w,idiag,x+i2);
613:         i2    -= bs;
614:         idiag -= bs2;
615:         for (i=m-2; i>=0; i--) {
616:           v  = aa + bs2*(diag[i]+1);
617:           vi = aj + diag[i] + 1;
618:           nz = ai[i+1] - diag[i] - 1;

620:           PetscArraycpy(w,xb+i2,bs);
621:           /* copy all rows of x that are needed into contiguous space */
622:           workt = work;
623:           for (j=0; j<nz; j++) {
624:             PetscArraycpy(workt,x + bs*(*vi++),bs);
625:             workt += bs;
626:           }
627:           PetscKernel_w_gets_w_minus_Ar_times_v(bs,bs*nz,w,v,work);
628:           PetscKernel_w_gets_Ar_times_v(bs,bs,w,idiag,x+i2);

630:           idiag -= bs2;
631:           i2    -= bs;
632:         }
633:         break;
634:       }
635:       PetscLogFlops(1.0*bs2*(a->nz));
636:     }
637:     its--;
638:   }
639:   while (its--) {
640:     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
641:       idiag = a->idiag;
642:       i2 = 0;
643:       switch (bs) {
644:       case 1:
645:         for (i=0; i<m; i++) {
646:           v  = aa + ai[i];
647:           vi = aj + ai[i];
648:           nz = ai[i+1] - ai[i];
649:           s[0] = b[i2];
650:           for (j=0; j<nz; j++) {
651:             xw[0] = x[vi[j]];
652:             PetscKernel_v_gets_v_minus_A_times_w_1(s,(v+j),xw);
653:           }
654:           PetscKernel_v_gets_A_times_w_1(xw,idiag,s);
655:           x[i2] += xw[0];
656:           idiag += 1;
657:           i2    += 1;
658:         }
659:         break;
660:       case 2:
661:         for (i=0; i<m; i++) {
662:           v  = aa + 4*ai[i];
663:           vi = aj + ai[i];
664:           nz = ai[i+1] - ai[i];
665:           s[0] = b[i2]; s[1] = b[i2+1];
666:           for (j=0; j<nz; j++) {
667:             idx = 2*vi[j];
668:             it  = 4*j;
669:             xw[0] = x[idx]; xw[1] = x[1+idx];
670:             PetscKernel_v_gets_v_minus_A_times_w_2(s,(v+it),xw);
671:           }
672:           PetscKernel_v_gets_A_times_w_2(xw,idiag,s);
673:           x[i2]  += xw[0]; x[i2+1] += xw[1];
674:           idiag  += 4;
675:           i2     += 2;
676:         }
677:         break;
678:       case 3:
679:         for (i=0; i<m; i++) {
680:           v  = aa + 9*ai[i];
681:           vi = aj + ai[i];
682:           nz = ai[i+1] - ai[i];
683:           s[0] = b[i2]; s[1] = b[i2+1]; s[2] = b[i2+2];
684:           while (nz--) {
685:             idx = 3*(*vi++);
686:             xw[0] = x[idx]; xw[1] = x[1+idx]; xw[2] = x[2+idx];
687:             PetscKernel_v_gets_v_minus_A_times_w_3(s,v,xw);
688:             v  += 9;
689:           }
690:           PetscKernel_v_gets_A_times_w_3(xw,idiag,s);
691:           x[i2] += xw[0]; x[i2+1] += xw[1]; x[i2+2] += xw[2];
692:           idiag  += 9;
693:           i2     += 3;
694:         }
695:         break;
696:       case 4:
697:         for (i=0; i<m; i++) {
698:           v  = aa + 16*ai[i];
699:           vi = aj + ai[i];
700:           nz = ai[i+1] - ai[i];
701:           s[0] = b[i2]; s[1] = b[i2+1]; s[2] = b[i2+2]; s[3] = b[i2+3];
702:           while (nz--) {
703:             idx = 4*(*vi++);
704:             xw[0]  = x[idx]; xw[1] = x[1+idx]; xw[2] = x[2+idx]; xw[3] = x[3+idx];
705:             PetscKernel_v_gets_v_minus_A_times_w_4(s,v,xw);
706:             v  += 16;
707:           }
708:           PetscKernel_v_gets_A_times_w_4(xw,idiag,s);
709:           x[i2] += xw[0]; x[i2+1] += xw[1]; x[i2+2] += xw[2]; x[i2+3] += xw[3];
710:           idiag  += 16;
711:           i2     += 4;
712:         }
713:         break;
714:       case 5:
715:         for (i=0; i<m; i++) {
716:           v  = aa + 25*ai[i];
717:           vi = aj + ai[i];
718:           nz = ai[i+1] - ai[i];
719:           s[0] = b[i2]; s[1] = b[i2+1]; s[2] = b[i2+2]; s[3] = b[i2+3]; s[4] = b[i2+4];
720:           while (nz--) {
721:             idx = 5*(*vi++);
722:             xw[0]  = x[idx]; xw[1] = x[1+idx]; xw[2] = x[2+idx]; xw[3] = x[3+idx]; xw[4] = x[4+idx];
723:             PetscKernel_v_gets_v_minus_A_times_w_5(s,v,xw);
724:             v  += 25;
725:           }
726:           PetscKernel_v_gets_A_times_w_5(xw,idiag,s);
727:           x[i2] += xw[0]; x[i2+1] += xw[1]; x[i2+2] += xw[2]; x[i2+3] += xw[3]; x[i2+4] += xw[4];
728:           idiag  += 25;
729:           i2     += 5;
730:         }
731:         break;
732:       case 6:
733:         for (i=0; i<m; i++) {
734:           v  = aa + 36*ai[i];
735:           vi = aj + ai[i];
736:           nz = ai[i+1] - ai[i];
737:           s[0] = b[i2]; s[1] = b[i2+1]; s[2] = b[i2+2]; s[3] = b[i2+3]; s[4] = b[i2+4]; s[5] = b[i2+5];
738:           while (nz--) {
739:             idx = 6*(*vi++);
740:             xw[0] = x[idx];   xw[1] = x[1+idx]; xw[2] = x[2+idx];
741:             xw[3] = x[3+idx]; xw[4] = x[4+idx]; xw[5] = x[5+idx];
742:             PetscKernel_v_gets_v_minus_A_times_w_6(s,v,xw);
743:             v  += 36;
744:           }
745:           PetscKernel_v_gets_A_times_w_6(xw,idiag,s);
746:           x[i2] += xw[0]; x[i2+1] += xw[1]; x[i2+2] += xw[2];
747:           x[i2+3] += xw[3]; x[i2+4] += xw[4]; x[i2+5] += xw[5];
748:           idiag  += 36;
749:           i2     += 6;
750:         }
751:         break;
752:       case 7:
753:         for (i=0; i<m; i++) {
754:           v  = aa + 49*ai[i];
755:           vi = aj + ai[i];
756:           nz = ai[i+1] - ai[i];
757:           s[0] = b[i2];   s[1] = b[i2+1]; s[2] = b[i2+2];
758:           s[3] = b[i2+3]; s[4] = b[i2+4]; s[5] = b[i2+5]; s[6] = b[i2+6];
759:           while (nz--) {
760:             idx = 7*(*vi++);
761:             xw[0] = x[idx];   xw[1] = x[1+idx]; xw[2] = x[2+idx];
762:             xw[3] = x[3+idx]; xw[4] = x[4+idx]; xw[5] = x[5+idx]; xw[6] = x[6+idx];
763:             PetscKernel_v_gets_v_minus_A_times_w_7(s,v,xw);
764:             v  += 49;
765:           }
766:           PetscKernel_v_gets_A_times_w_7(xw,idiag,s);
767:           x[i2]   += xw[0]; x[i2+1] += xw[1]; x[i2+2] += xw[2];
768:           x[i2+3] += xw[3]; x[i2+4] += xw[4]; x[i2+5] += xw[5]; x[i2+6] += xw[6];
769:           idiag  += 49;
770:           i2     += 7;
771:         }
772:         break;
773:       default:
774:         for (i=0; i<m; i++) {
775:           v  = aa + bs2*ai[i];
776:           vi = aj + ai[i];
777:           nz = ai[i+1] - ai[i];

779:           PetscArraycpy(w,b+i2,bs);
780:           /* copy all rows of x that are needed into contiguous space */
781:           workt = work;
782:           for (j=0; j<nz; j++) {
783:             PetscArraycpy(workt,x + bs*(*vi++),bs);
784:             workt += bs;
785:           }
786:           PetscKernel_w_gets_w_minus_Ar_times_v(bs,bs*nz,w,v,work);
787:           PetscKernel_w_gets_w_plus_Ar_times_v(bs,bs,w,idiag,x+i2);

789:           idiag += bs2;
790:           i2    += bs;
791:         }
792:         break;
793:       }
794:       PetscLogFlops(2.0*bs2*a->nz);
795:     }
796:     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
797:       idiag = a->idiag+bs2*(a->mbs-1);
798:       i2 = bs * (m-1);
799:       switch (bs) {
800:       case 1:
801:         for (i=m-1; i>=0; i--) {
802:           v  = aa + ai[i];
803:           vi = aj + ai[i];
804:           nz = ai[i+1] - ai[i];
805:           s[0] = b[i2];
806:           for (j=0; j<nz; j++) {
807:             xw[0] = x[vi[j]];
808:             PetscKernel_v_gets_v_minus_A_times_w_1(s,(v+j),xw);
809:           }
810:           PetscKernel_v_gets_A_times_w_1(xw,idiag,s);
811:           x[i2] += xw[0];
812:           idiag -= 1;
813:           i2    -= 1;
814:         }
815:         break;
816:       case 2:
817:         for (i=m-1; i>=0; i--) {
818:           v  = aa + 4*ai[i];
819:           vi = aj + ai[i];
820:           nz = ai[i+1] - ai[i];
821:           s[0] = b[i2]; s[1] = b[i2+1];
822:           for (j=0; j<nz; j++) {
823:             idx = 2*vi[j];
824:             it  = 4*j;
825:             xw[0] = x[idx]; xw[1] = x[1+idx];
826:             PetscKernel_v_gets_v_minus_A_times_w_2(s,(v+it),xw);
827:           }
828:           PetscKernel_v_gets_A_times_w_2(xw,idiag,s);
829:           x[i2]  += xw[0]; x[i2+1] += xw[1];
830:           idiag  -= 4;
831:           i2     -= 2;
832:         }
833:         break;
834:       case 3:
835:         for (i=m-1; i>=0; i--) {
836:           v  = aa + 9*ai[i];
837:           vi = aj + ai[i];
838:           nz = ai[i+1] - ai[i];
839:           s[0] = b[i2]; s[1] = b[i2+1]; s[2] = b[i2+2];
840:           while (nz--) {
841:             idx = 3*(*vi++);
842:             xw[0] = x[idx]; xw[1] = x[1+idx]; xw[2] = x[2+idx];
843:             PetscKernel_v_gets_v_minus_A_times_w_3(s,v,xw);
844:             v  += 9;
845:           }
846:           PetscKernel_v_gets_A_times_w_3(xw,idiag,s);
847:           x[i2] += xw[0]; x[i2+1] += xw[1]; x[i2+2] += xw[2];
848:           idiag  -= 9;
849:           i2     -= 3;
850:         }
851:         break;
852:       case 4:
853:         for (i=m-1; i>=0; i--) {
854:           v  = aa + 16*ai[i];
855:           vi = aj + ai[i];
856:           nz = ai[i+1] - ai[i];
857:           s[0] = b[i2]; s[1] = b[i2+1]; s[2] = b[i2+2]; s[3] = b[i2+3];
858:           while (nz--) {
859:             idx = 4*(*vi++);
860:             xw[0]  = x[idx]; xw[1] = x[1+idx]; xw[2] = x[2+idx]; xw[3] = x[3+idx];
861:             PetscKernel_v_gets_v_minus_A_times_w_4(s,v,xw);
862:             v  += 16;
863:           }
864:           PetscKernel_v_gets_A_times_w_4(xw,idiag,s);
865:           x[i2] += xw[0]; x[i2+1] += xw[1]; x[i2+2] += xw[2]; x[i2+3] += xw[3];
866:           idiag  -= 16;
867:           i2     -= 4;
868:         }
869:         break;
870:       case 5:
871:         for (i=m-1; i>=0; i--) {
872:           v  = aa + 25*ai[i];
873:           vi = aj + ai[i];
874:           nz = ai[i+1] - ai[i];
875:           s[0] = b[i2]; s[1] = b[i2+1]; s[2] = b[i2+2]; s[3] = b[i2+3]; s[4] = b[i2+4];
876:           while (nz--) {
877:             idx = 5*(*vi++);
878:             xw[0]  = x[idx]; xw[1] = x[1+idx]; xw[2] = x[2+idx]; xw[3] = x[3+idx]; xw[4] = x[4+idx];
879:             PetscKernel_v_gets_v_minus_A_times_w_5(s,v,xw);
880:             v  += 25;
881:           }
882:           PetscKernel_v_gets_A_times_w_5(xw,idiag,s);
883:           x[i2] += xw[0]; x[i2+1] += xw[1]; x[i2+2] += xw[2]; x[i2+3] += xw[3]; x[i2+4] += xw[4];
884:           idiag  -= 25;
885:           i2     -= 5;
886:         }
887:         break;
888:       case 6:
889:         for (i=m-1; i>=0; i--) {
890:           v  = aa + 36*ai[i];
891:           vi = aj + ai[i];
892:           nz = ai[i+1] - ai[i];
893:           s[0] = b[i2]; s[1] = b[i2+1]; s[2] = b[i2+2]; s[3] = b[i2+3]; s[4] = b[i2+4]; s[5] = b[i2+5];
894:           while (nz--) {
895:             idx = 6*(*vi++);
896:             xw[0] = x[idx];   xw[1] = x[1+idx]; xw[2] = x[2+idx];
897:             xw[3] = x[3+idx]; xw[4] = x[4+idx]; xw[5] = x[5+idx];
898:             PetscKernel_v_gets_v_minus_A_times_w_6(s,v,xw);
899:             v  += 36;
900:           }
901:           PetscKernel_v_gets_A_times_w_6(xw,idiag,s);
902:           x[i2] += xw[0]; x[i2+1] += xw[1]; x[i2+2] += xw[2];
903:           x[i2+3] += xw[3]; x[i2+4] += xw[4]; x[i2+5] += xw[5];
904:           idiag  -= 36;
905:           i2     -= 6;
906:         }
907:         break;
908:       case 7:
909:         for (i=m-1; i>=0; i--) {
910:           v  = aa + 49*ai[i];
911:           vi = aj + ai[i];
912:           nz = ai[i+1] - ai[i];
913:           s[0] = b[i2];   s[1] = b[i2+1]; s[2] = b[i2+2];
914:           s[3] = b[i2+3]; s[4] = b[i2+4]; s[5] = b[i2+5]; s[6] = b[i2+6];
915:           while (nz--) {
916:             idx = 7*(*vi++);
917:             xw[0] = x[idx];   xw[1] = x[1+idx]; xw[2] = x[2+idx];
918:             xw[3] = x[3+idx]; xw[4] = x[4+idx]; xw[5] = x[5+idx]; xw[6] = x[6+idx];
919:             PetscKernel_v_gets_v_minus_A_times_w_7(s,v,xw);
920:             v  += 49;
921:           }
922:           PetscKernel_v_gets_A_times_w_7(xw,idiag,s);
923:           x[i2] +=   xw[0]; x[i2+1] += xw[1]; x[i2+2] += xw[2];
924:           x[i2+3] += xw[3]; x[i2+4] += xw[4]; x[i2+5] += xw[5]; x[i2+6] += xw[6];
925:           idiag  -= 49;
926:           i2     -= 7;
927:         }
928:         break;
929:       default:
930:         for (i=m-1; i>=0; i--) {
931:           v  = aa + bs2*ai[i];
932:           vi = aj + ai[i];
933:           nz = ai[i+1] - ai[i];

935:           PetscArraycpy(w,b+i2,bs);
936:           /* copy all rows of x that are needed into contiguous space */
937:           workt = work;
938:           for (j=0; j<nz; j++) {
939:             PetscArraycpy(workt,x + bs*(*vi++),bs);
940:             workt += bs;
941:           }
942:           PetscKernel_w_gets_w_minus_Ar_times_v(bs,bs*nz,w,v,work);
943:           PetscKernel_w_gets_w_plus_Ar_times_v(bs,bs,w,idiag,x+i2);

945:           idiag -= bs2;
946:           i2    -= bs;
947:         }
948:         break;
949:       }
950:       PetscLogFlops(2.0*bs2*(a->nz));
951:     }
952:   }
953:   VecRestoreArray(xx,&x);
954:   VecRestoreArrayRead(bb,&b);
955:   return(0);
956: }

958: /*
959:     Special version for direct calls from Fortran (Used in PETSc-fun3d)
960: */
961: #if defined(PETSC_HAVE_FORTRAN_CAPS)
962: #define matsetvaluesblocked4_ MATSETVALUESBLOCKED4
963: #elif !defined(PETSC_HAVE_FORTRAN_UNDERSCORE)
964: #define matsetvaluesblocked4_ matsetvaluesblocked4
965: #endif

967: PETSC_EXTERN void matsetvaluesblocked4_(Mat *AA,PetscInt *mm,const PetscInt im[],PetscInt *nn,const PetscInt in[],const PetscScalar v[])
968: {
969:   Mat               A  = *AA;
970:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
971:   PetscInt          *rp,k,low,high,t,ii,jj,row,nrow,i,col,l,N,m = *mm,n = *nn;
972:   PetscInt          *ai    =a->i,*ailen=a->ilen;
973:   PetscInt          *aj    =a->j,stepval,lastcol = -1;
974:   const PetscScalar *value = v;
975:   MatScalar         *ap,*aa = a->a,*bap;
976:   PetscErrorCode    ierr;

979:   if (A->rmap->bs != 4) SETERRABORT(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Can only be called with a block size of 4");
980:   stepval = (n-1)*4;
981:   for (k=0; k<m; k++) { /* loop over added rows */
982:     row  = im[k];
983:     rp   = aj + ai[row];
984:     ap   = aa + 16*ai[row];
985:     nrow = ailen[row];
986:     low  = 0;
987:     high = nrow;
988:     for (l=0; l<n; l++) { /* loop over added columns */
989:       col = in[l];
990:       if (col <= lastcol)  low = 0;
991:       else                high = nrow;
992:       lastcol = col;
993:       value   = v + k*(stepval+4 + l)*4;
994:       while (high-low > 7) {
995:         t = (low+high)/2;
996:         if (rp[t] > col) high = t;
997:         else             low  = t;
998:       }
999:       for (i=low; i<high; i++) {
1000:         if (rp[i] > col) break;
1001:         if (rp[i] == col) {
1002:           bap = ap +  16*i;
1003:           for (ii=0; ii<4; ii++,value+=stepval) {
1004:             for (jj=ii; jj<16; jj+=4) {
1005:               bap[jj] += *value++;
1006:             }
1007:           }
1008:           goto noinsert2;
1009:         }
1010:       }
1011:       N = nrow++ - 1;
1012:       high++; /* added new column index thus must search to one higher than before */
1013:       /* shift up all the later entries in this row */
1014:       for (ii=N; ii>=i; ii--) {
1015:         rp[ii+1] = rp[ii];
1016:         PetscArraycpy(ap+16*(ii+1),ap+16*(ii),16);CHKERRV(ierr);
1017:       }
1018:       if (N >= i) {
1019:         PetscArrayzero(ap+16*i,16);CHKERRV(ierr);
1020:       }
1021:       rp[i] = col;
1022:       bap   = ap +  16*i;
1023:       for (ii=0; ii<4; ii++,value+=stepval) {
1024:         for (jj=ii; jj<16; jj+=4) {
1025:           bap[jj] = *value++;
1026:         }
1027:       }
1028:       noinsert2:;
1029:       low = i;
1030:     }
1031:     ailen[row] = nrow;
1032:   }
1033:   PetscFunctionReturnVoid();
1034: }

1036: #if defined(PETSC_HAVE_FORTRAN_CAPS)
1037: #define matsetvalues4_ MATSETVALUES4
1038: #elif !defined(PETSC_HAVE_FORTRAN_UNDERSCORE)
1039: #define matsetvalues4_ matsetvalues4
1040: #endif

1042: PETSC_EXTERN void matsetvalues4_(Mat *AA,PetscInt *mm,PetscInt *im,PetscInt *nn,PetscInt *in,PetscScalar *v)
1043: {
1044:   Mat         A  = *AA;
1045:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1046:   PetscInt    *rp,k,low,high,t,row,nrow,i,col,l,N,n = *nn,m = *mm;
1047:   PetscInt    *ai=a->i,*ailen=a->ilen;
1048:   PetscInt    *aj=a->j,brow,bcol;
1049:   PetscInt    ridx,cidx,lastcol = -1;
1050:   MatScalar   *ap,value,*aa=a->a,*bap;

1054:   for (k=0; k<m; k++) { /* loop over added rows */
1055:     row  = im[k]; brow = row/4;
1056:     rp   = aj + ai[brow];
1057:     ap   = aa + 16*ai[brow];
1058:     nrow = ailen[brow];
1059:     low  = 0;
1060:     high = nrow;
1061:     for (l=0; l<n; l++) { /* loop over added columns */
1062:       col   = in[l]; bcol = col/4;
1063:       ridx  = row % 4; cidx = col % 4;
1064:       value = v[l + k*n];
1065:       if (col <= lastcol)  low = 0;
1066:       else                high = nrow;
1067:       lastcol = col;
1068:       while (high-low > 7) {
1069:         t = (low+high)/2;
1070:         if (rp[t] > bcol) high = t;
1071:         else              low  = t;
1072:       }
1073:       for (i=low; i<high; i++) {
1074:         if (rp[i] > bcol) break;
1075:         if (rp[i] == bcol) {
1076:           bap   = ap +  16*i + 4*cidx + ridx;
1077:           *bap += value;
1078:           goto noinsert1;
1079:         }
1080:       }
1081:       N = nrow++ - 1;
1082:       high++; /* added new column thus must search to one higher than before */
1083:       /* shift up all the later entries in this row */
1084:       PetscArraymove(rp+i+1,rp+i,N-i+1);CHKERRV(ierr);
1085:       PetscArraymove(ap+16*i+16,ap+16*i,16*(N-i+1));CHKERRV(ierr);
1086:       PetscArrayzero(ap+16*i,16);CHKERRV(ierr);
1087:       rp[i]                    = bcol;
1088:       ap[16*i + 4*cidx + ridx] = value;
1089: noinsert1:;
1090:       low = i;
1091:     }
1092:     ailen[brow] = nrow;
1093:   }
1094:   PetscFunctionReturnVoid();
1095: }

1097: /*
1098:      Checks for missing diagonals
1099: */
1100: PetscErrorCode MatMissingDiagonal_SeqBAIJ(Mat A,PetscBool  *missing,PetscInt *d)
1101: {
1102:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1104:   PetscInt       *diag,*ii = a->i,i;

1107:   MatMarkDiagonal_SeqBAIJ(A);
1108:   *missing = PETSC_FALSE;
1109:   if (A->rmap->n > 0 && !ii) {
1110:     *missing = PETSC_TRUE;
1111:     if (d) *d = 0;
1112:     PetscInfo(A,"Matrix has no entries therefore is missing diagonal\n");
1113:   } else {
1114:     PetscInt n;
1115:     n = PetscMin(a->mbs, a->nbs);
1116:     diag = a->diag;
1117:     for (i=0; i<n; i++) {
1118:       if (diag[i] >= ii[i+1]) {
1119:         *missing = PETSC_TRUE;
1120:         if (d) *d = i;
1121:         PetscInfo1(A,"Matrix is missing block diagonal number %D\n",i);
1122:         break;
1123:       }
1124:     }
1125:   }
1126:   return(0);
1127: }

1129: PetscErrorCode MatMarkDiagonal_SeqBAIJ(Mat A)
1130: {
1131:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1133:   PetscInt       i,j,m = a->mbs;

1136:   if (!a->diag) {
1137:     PetscMalloc1(m,&a->diag);
1138:     PetscLogObjectMemory((PetscObject)A,m*sizeof(PetscInt));
1139:     a->free_diag = PETSC_TRUE;
1140:   }
1141:   for (i=0; i<m; i++) {
1142:     a->diag[i] = a->i[i+1];
1143:     for (j=a->i[i]; j<a->i[i+1]; j++) {
1144:       if (a->j[j] == i) {
1145:         a->diag[i] = j;
1146:         break;
1147:       }
1148:     }
1149:   }
1150:   return(0);
1151: }

1153: static PetscErrorCode MatGetRowIJ_SeqBAIJ(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *nn,const PetscInt *inia[],const PetscInt *inja[],PetscBool  *done)
1154: {
1155:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1157:   PetscInt       i,j,n = a->mbs,nz = a->i[n],*tia,*tja,bs = A->rmap->bs,k,l,cnt;
1158:   PetscInt       **ia = (PetscInt**)inia,**ja = (PetscInt**)inja;

1161:   *nn = n;
1162:   if (!ia) return(0);
1163:   if (symmetric) {
1164:     MatToSymmetricIJ_SeqAIJ(n,a->i,a->j,PETSC_TRUE,0,0,&tia,&tja);
1165:     nz   = tia[n];
1166:   } else {
1167:     tia = a->i; tja = a->j;
1168:   }

1170:   if (!blockcompressed && bs > 1) {
1171:     (*nn) *= bs;
1172:     /* malloc & create the natural set of indices */
1173:     PetscMalloc1((n+1)*bs,ia);
1174:     if (n) {
1175:       (*ia)[0] = oshift;
1176:       for (j=1; j<bs; j++) {
1177:         (*ia)[j] = (tia[1]-tia[0])*bs+(*ia)[j-1];
1178:       }
1179:     }

1181:     for (i=1; i<n; i++) {
1182:       (*ia)[i*bs] = (tia[i]-tia[i-1])*bs + (*ia)[i*bs-1];
1183:       for (j=1; j<bs; j++) {
1184:         (*ia)[i*bs+j] = (tia[i+1]-tia[i])*bs + (*ia)[i*bs+j-1];
1185:       }
1186:     }
1187:     if (n) {
1188:       (*ia)[n*bs] = (tia[n]-tia[n-1])*bs + (*ia)[n*bs-1];
1189:     }

1191:     if (inja) {
1192:       PetscMalloc1(nz*bs*bs,ja);
1193:       cnt = 0;
1194:       for (i=0; i<n; i++) {
1195:         for (j=0; j<bs; j++) {
1196:           for (k=tia[i]; k<tia[i+1]; k++) {
1197:             for (l=0; l<bs; l++) {
1198:               (*ja)[cnt++] = bs*tja[k] + l;
1199:             }
1200:           }
1201:         }
1202:       }
1203:     }

1205:     if (symmetric) { /* deallocate memory allocated in MatToSymmetricIJ_SeqAIJ() */
1206:       PetscFree(tia);
1207:       PetscFree(tja);
1208:     }
1209:   } else if (oshift == 1) {
1210:     if (symmetric) {
1211:       nz = tia[A->rmap->n/bs];
1212:       /*  add 1 to i and j indices */
1213:       for (i=0; i<A->rmap->n/bs+1; i++) tia[i] = tia[i] + 1;
1214:       *ia = tia;
1215:       if (ja) {
1216:         for (i=0; i<nz; i++) tja[i] = tja[i] + 1;
1217:         *ja = tja;
1218:       }
1219:     } else {
1220:       nz = a->i[A->rmap->n/bs];
1221:       /* malloc space and  add 1 to i and j indices */
1222:       PetscMalloc1(A->rmap->n/bs+1,ia);
1223:       for (i=0; i<A->rmap->n/bs+1; i++) (*ia)[i] = a->i[i] + 1;
1224:       if (ja) {
1225:         PetscMalloc1(nz,ja);
1226:         for (i=0; i<nz; i++) (*ja)[i] = a->j[i] + 1;
1227:       }
1228:     }
1229:   } else {
1230:     *ia = tia;
1231:     if (ja) *ja = tja;
1232:   }
1233:   return(0);
1234: }

1236: static PetscErrorCode MatRestoreRowIJ_SeqBAIJ(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *nn,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
1237: {

1241:   if (!ia) return(0);
1242:   if ((!blockcompressed && A->rmap->bs > 1) || (symmetric || oshift == 1)) {
1243:     PetscFree(*ia);
1244:     if (ja) {PetscFree(*ja);}
1245:   }
1246:   return(0);
1247: }

1249: PetscErrorCode MatDestroy_SeqBAIJ(Mat A)
1250: {
1251:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;

1255: #if defined(PETSC_USE_LOG)
1256:   PetscLogObjectState((PetscObject)A,"Rows=%D, Cols=%D, NZ=%D",A->rmap->N,A->cmap->n,a->nz);
1257: #endif
1258:   MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);
1259:   ISDestroy(&a->row);
1260:   ISDestroy(&a->col);
1261:   if (a->free_diag) {PetscFree(a->diag);}
1262:   PetscFree(a->idiag);
1263:   if (a->free_imax_ilen) {PetscFree2(a->imax,a->ilen);}
1264:   PetscFree(a->solve_work);
1265:   PetscFree(a->mult_work);
1266:   PetscFree(a->sor_workt);
1267:   PetscFree(a->sor_work);
1268:   ISDestroy(&a->icol);
1269:   PetscFree(a->saved_values);
1270:   PetscFree2(a->compressedrow.i,a->compressedrow.rindex);

1272:   MatDestroy(&a->sbaijMat);
1273:   MatDestroy(&a->parent);
1274:   PetscFree(A->data);

1276:   PetscObjectChangeTypeName((PetscObject)A,NULL);
1277:   PetscObjectComposeFunction((PetscObject)A,"MatSeqBAIJGetArray_C",NULL);
1278:   PetscObjectComposeFunction((PetscObject)A,"MatSeqBAIJRestoreArray_C",NULL);
1279:   PetscObjectComposeFunction((PetscObject)A,"MatStoreValues_C",NULL);
1280:   PetscObjectComposeFunction((PetscObject)A,"MatRetrieveValues_C",NULL);
1281:   PetscObjectComposeFunction((PetscObject)A,"MatSeqBAIJSetColumnIndices_C",NULL);
1282:   PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqbaij_seqaij_C",NULL);
1283:   PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqbaij_seqsbaij_C",NULL);
1284:   PetscObjectComposeFunction((PetscObject)A,"MatSeqBAIJSetPreallocation_C",NULL);
1285:   PetscObjectComposeFunction((PetscObject)A,"MatSeqBAIJSetPreallocationCSR_C",NULL);
1286:   PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqbaij_seqbstrm_C",NULL);
1287:   PetscObjectComposeFunction((PetscObject)A,"MatIsTranspose_C",NULL);
1288: #if defined(PETSC_HAVE_HYPRE)
1289:   PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqbaij_hypre_C",NULL);
1290: #endif
1291:   PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqbaij_is_C",NULL);
1292:   return(0);
1293: }

1295: PetscErrorCode MatSetOption_SeqBAIJ(Mat A,MatOption op,PetscBool flg)
1296: {
1297:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;

1301:   switch (op) {
1302:   case MAT_ROW_ORIENTED:
1303:     a->roworiented = flg;
1304:     break;
1305:   case MAT_KEEP_NONZERO_PATTERN:
1306:     a->keepnonzeropattern = flg;
1307:     break;
1308:   case MAT_NEW_NONZERO_LOCATIONS:
1309:     a->nonew = (flg ? 0 : 1);
1310:     break;
1311:   case MAT_NEW_NONZERO_LOCATION_ERR:
1312:     a->nonew = (flg ? -1 : 0);
1313:     break;
1314:   case MAT_NEW_NONZERO_ALLOCATION_ERR:
1315:     a->nonew = (flg ? -2 : 0);
1316:     break;
1317:   case MAT_UNUSED_NONZERO_LOCATION_ERR:
1318:     a->nounused = (flg ? -1 : 0);
1319:     break;
1320:   case MAT_FORCE_DIAGONAL_ENTRIES:
1321:   case MAT_IGNORE_OFF_PROC_ENTRIES:
1322:   case MAT_USE_HASH_TABLE:
1323:   case MAT_SORTED_FULL:
1324:     PetscInfo1(A,"Option %s ignored\n",MatOptions[op]);
1325:     break;
1326:   case MAT_SPD:
1327:   case MAT_SYMMETRIC:
1328:   case MAT_STRUCTURALLY_SYMMETRIC:
1329:   case MAT_HERMITIAN:
1330:   case MAT_SYMMETRY_ETERNAL:
1331:   case MAT_SUBMAT_SINGLEIS:
1332:   case MAT_STRUCTURE_ONLY:
1333:     /* These options are handled directly by MatSetOption() */
1334:     break;
1335:   default:
1336:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unknown option %d",op);
1337:   }
1338:   return(0);
1339: }

1341: /* used for both SeqBAIJ and SeqSBAIJ matrices */
1342: PetscErrorCode MatGetRow_SeqBAIJ_private(Mat A,PetscInt row,PetscInt *nz,PetscInt **idx,PetscScalar **v,PetscInt *ai,PetscInt *aj,PetscScalar *aa)
1343: {
1345:   PetscInt       itmp,i,j,k,M,bn,bp,*idx_i,bs,bs2;
1346:   MatScalar      *aa_i;
1347:   PetscScalar    *v_i;

1350:   bs  = A->rmap->bs;
1351:   bs2 = bs*bs;
1352:   if (row < 0 || row >= A->rmap->N) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row %D out of range", row);

1354:   bn  = row/bs;   /* Block number */
1355:   bp  = row % bs; /* Block Position */
1356:   M   = ai[bn+1] - ai[bn];
1357:   *nz = bs*M;

1359:   if (v) {
1360:     *v = NULL;
1361:     if (*nz) {
1362:       PetscMalloc1(*nz,v);
1363:       for (i=0; i<M; i++) { /* for each block in the block row */
1364:         v_i  = *v + i*bs;
1365:         aa_i = aa + bs2*(ai[bn] + i);
1366:         for (j=bp,k=0; j<bs2; j+=bs,k++) v_i[k] = aa_i[j];
1367:       }
1368:     }
1369:   }

1371:   if (idx) {
1372:     *idx = NULL;
1373:     if (*nz) {
1374:       PetscMalloc1(*nz,idx);
1375:       for (i=0; i<M; i++) { /* for each block in the block row */
1376:         idx_i = *idx + i*bs;
1377:         itmp  = bs*aj[ai[bn] + i];
1378:         for (j=0; j<bs; j++) idx_i[j] = itmp++;
1379:       }
1380:     }
1381:   }
1382:   return(0);
1383: }

1385: PetscErrorCode MatGetRow_SeqBAIJ(Mat A,PetscInt row,PetscInt *nz,PetscInt **idx,PetscScalar **v)
1386: {
1387:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;

1391:   MatGetRow_SeqBAIJ_private(A,row,nz,idx,v,a->i,a->j,a->a);
1392:   return(0);
1393: }

1395: PetscErrorCode MatRestoreRow_SeqBAIJ(Mat A,PetscInt row,PetscInt *nz,PetscInt **idx,PetscScalar **v)
1396: {

1400:   if (nz)  *nz = 0;
1401:   if (idx) {PetscFree(*idx);}
1402:   if (v)   {PetscFree(*v);}
1403:   return(0);
1404: }

1406: PetscErrorCode MatTranspose_SeqBAIJ(Mat A,MatReuse reuse,Mat *B)
1407: {
1408:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*at;
1409:   Mat            C;
1411:   PetscInt       i,j,k,*aj=a->j,*ai=a->i,bs=A->rmap->bs,mbs=a->mbs,nbs=a->nbs,*atfill;
1412:   PetscInt       bs2=a->bs2,*ati,*atj,anzj,kr;
1413:   MatScalar      *ata,*aa=a->a;

1416:   PetscCalloc1(1+nbs,&atfill);
1417:   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
1418:     for (i=0; i<ai[mbs]; i++) atfill[aj[i]] += 1; /* count num of non-zeros in row aj[i] */

1420:     MatCreate(PetscObjectComm((PetscObject)A),&C);
1421:     MatSetSizes(C,A->cmap->n,A->rmap->N,A->cmap->n,A->rmap->N);
1422:     MatSetType(C,((PetscObject)A)->type_name);
1423:     MatSeqBAIJSetPreallocation(C,bs,0,atfill);

1425:     at  = (Mat_SeqBAIJ*)C->data;
1426:     ati = at->i;
1427:     for (i=0; i<nbs; i++) at->ilen[i] = at->imax[i] = ati[i+1] - ati[i];
1428:   } else {
1429:     C = *B;
1430:     at = (Mat_SeqBAIJ*)C->data;
1431:     ati = at->i;
1432:   }

1434:   atj = at->j;
1435:   ata = at->a;

1437:   /* Copy ati into atfill so we have locations of the next free space in atj */
1438:   PetscArraycpy(atfill,ati,nbs);

1440:   /* Walk through A row-wise and mark nonzero entries of A^T. */
1441:   for (i=0; i<mbs; i++) {
1442:     anzj = ai[i+1] - ai[i];
1443:     for (j=0; j<anzj; j++) {
1444:       atj[atfill[*aj]] = i;
1445:       for (kr=0; kr<bs; kr++) {
1446:         for (k=0; k<bs; k++) {
1447:           ata[bs2*atfill[*aj]+k*bs+kr] = *aa++;
1448:         }
1449:       }
1450:       atfill[*aj++] += 1;
1451:     }
1452:   }
1453:   MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
1454:   MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);

1456:   /* Clean up temporary space and complete requests. */
1457:   PetscFree(atfill);

1459:   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_REUSE_MATRIX) {
1460:     MatSetBlockSizes(C,PetscAbs(A->cmap->bs),PetscAbs(A->rmap->bs));
1461:     *B = C;
1462:   } else {
1463:     MatHeaderMerge(A,&C);
1464:   }
1465:   return(0);
1466: }

1468: PetscErrorCode MatIsTranspose_SeqBAIJ(Mat A,Mat B,PetscReal tol,PetscBool  *f)
1469: {
1471:   Mat            Btrans;

1474:   *f   = PETSC_FALSE;
1475:   MatTranspose_SeqBAIJ(A,MAT_INITIAL_MATRIX,&Btrans);
1476:   MatEqual_SeqBAIJ(B,Btrans,f);
1477:   MatDestroy(&Btrans);
1478:   return(0);
1479: }

1481: /* Used for both SeqBAIJ and SeqSBAIJ matrices */
1482: PetscErrorCode MatView_SeqBAIJ_Binary(Mat mat,PetscViewer viewer)
1483: {
1484:   Mat_SeqBAIJ    *A = (Mat_SeqBAIJ*)mat->data;
1485:   PetscInt       header[4],M,N,m,bs,nz,cnt,i,j,k,l;
1486:   PetscInt       *rowlens,*colidxs;
1487:   PetscScalar    *matvals;

1491:   PetscViewerSetUp(viewer);

1493:   M  = mat->rmap->N;
1494:   N  = mat->cmap->N;
1495:   m  = mat->rmap->n;
1496:   bs = mat->rmap->bs;
1497:   nz = bs*bs*A->nz;

1499:   /* write matrix header */
1500:   header[0] = MAT_FILE_CLASSID;
1501:   header[1] = M; header[2] = N; header[3] = nz;
1502:   PetscViewerBinaryWrite(viewer,header,4,PETSC_INT);

1504:   /* store row lengths */
1505:   PetscMalloc1(m,&rowlens);
1506:   for (cnt=0, i=0; i<A->mbs; i++)
1507:     for (j=0; j<bs; j++)
1508:       rowlens[cnt++] = bs*(A->i[i+1] - A->i[i]);
1509:   PetscViewerBinaryWrite(viewer,rowlens,m,PETSC_INT);
1510:   PetscFree(rowlens);

1512:   /* store column indices  */
1513:   PetscMalloc1(nz,&colidxs);
1514:   for (cnt=0, i=0; i<A->mbs; i++)
1515:     for (k=0; k<bs; k++)
1516:       for (j=A->i[i]; j<A->i[i+1]; j++)
1517:         for (l=0; l<bs; l++)
1518:           colidxs[cnt++] = bs*A->j[j] + l;
1519:   if (cnt != nz) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_LIB,"Internal PETSc error: cnt = %D nz = %D",cnt,nz);
1520:   PetscViewerBinaryWrite(viewer,colidxs,nz,PETSC_INT);
1521:   PetscFree(colidxs);

1523:   /* store nonzero values */
1524:   PetscMalloc1(nz,&matvals);
1525:   for (cnt=0, i=0; i<A->mbs; i++)
1526:     for (k=0; k<bs; k++)
1527:       for (j=A->i[i]; j<A->i[i+1]; j++)
1528:         for (l=0; l<bs; l++)
1529:           matvals[cnt++] = A->a[bs*(bs*j + l) + k];
1530:   if (cnt != nz) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_LIB,"Internal PETSc error: cnt = %D nz = %D",cnt,nz);
1531:   PetscViewerBinaryWrite(viewer,matvals,nz,PETSC_SCALAR);
1532:   PetscFree(matvals);

1534:   /* write block size option to the viewer's .info file */
1535:   MatView_Binary_BlockSizes(mat,viewer);
1536:   return(0);
1537: }

1539: static PetscErrorCode MatView_SeqBAIJ_ASCII_structonly(Mat A,PetscViewer viewer)
1540: {
1542:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1543:   PetscInt       i,bs = A->rmap->bs,k;

1546:   PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);
1547:   for (i=0; i<a->mbs; i++) {
1548:     PetscViewerASCIIPrintf(viewer,"row %D-%D:",i*bs,i*bs+bs-1);
1549:     for (k=a->i[i]; k<a->i[i+1]; k++) {
1550:       PetscViewerASCIIPrintf(viewer," (%D-%D) ",bs*a->j[k],bs*a->j[k]+bs-1);
1551:     }
1552:     PetscViewerASCIIPrintf(viewer,"\n");
1553:   }
1554:   PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);
1555:   return(0);
1556: }

1558: static PetscErrorCode MatView_SeqBAIJ_ASCII(Mat A,PetscViewer viewer)
1559: {
1560:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1561:   PetscErrorCode    ierr;
1562:   PetscInt          i,j,bs = A->rmap->bs,k,l,bs2=a->bs2;
1563:   PetscViewerFormat format;

1566:   if (A->structure_only) {
1567:     MatView_SeqBAIJ_ASCII_structonly(A,viewer);
1568:     return(0);
1569:   }

1571:   PetscViewerGetFormat(viewer,&format);
1572:   if (format == PETSC_VIEWER_ASCII_INFO || format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
1573:     PetscViewerASCIIPrintf(viewer,"  block size is %D\n",bs);
1574:   } else if (format == PETSC_VIEWER_ASCII_MATLAB) {
1575:     const char *matname;
1576:     Mat        aij;
1577:     MatConvert(A,MATSEQAIJ,MAT_INITIAL_MATRIX,&aij);
1578:     PetscObjectGetName((PetscObject)A,&matname);
1579:     PetscObjectSetName((PetscObject)aij,matname);
1580:     MatView(aij,viewer);
1581:     MatDestroy(&aij);
1582:   } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO) {
1583:       return(0);
1584:   } else if (format == PETSC_VIEWER_ASCII_COMMON) {
1585:     PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);
1586:     for (i=0; i<a->mbs; i++) {
1587:       for (j=0; j<bs; j++) {
1588:         PetscViewerASCIIPrintf(viewer,"row %D:",i*bs+j);
1589:         for (k=a->i[i]; k<a->i[i+1]; k++) {
1590:           for (l=0; l<bs; l++) {
1591: #if defined(PETSC_USE_COMPLEX)
1592:             if (PetscImaginaryPart(a->a[bs2*k + l*bs + j]) > 0.0 && PetscRealPart(a->a[bs2*k + l*bs + j]) != 0.0) {
1593:               PetscViewerASCIIPrintf(viewer," (%D, %g + %gi) ",bs*a->j[k]+l,
1594:                                             (double)PetscRealPart(a->a[bs2*k + l*bs + j]),(double)PetscImaginaryPart(a->a[bs2*k + l*bs + j]));
1595:             } else if (PetscImaginaryPart(a->a[bs2*k + l*bs + j]) < 0.0 && PetscRealPart(a->a[bs2*k + l*bs + j]) != 0.0) {
1596:               PetscViewerASCIIPrintf(viewer," (%D, %g - %gi) ",bs*a->j[k]+l,
1597:                                             (double)PetscRealPart(a->a[bs2*k + l*bs + j]),-(double)PetscImaginaryPart(a->a[bs2*k + l*bs + j]));
1598:             } else if (PetscRealPart(a->a[bs2*k + l*bs + j]) != 0.0) {
1599:               PetscViewerASCIIPrintf(viewer," (%D, %g) ",bs*a->j[k]+l,(double)PetscRealPart(a->a[bs2*k + l*bs + j]));
1600:             }
1601: #else
1602:             if (a->a[bs2*k + l*bs + j] != 0.0) {
1603:               PetscViewerASCIIPrintf(viewer," (%D, %g) ",bs*a->j[k]+l,(double)a->a[bs2*k + l*bs + j]);
1604:             }
1605: #endif
1606:           }
1607:         }
1608:         PetscViewerASCIIPrintf(viewer,"\n");
1609:       }
1610:     }
1611:     PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);
1612:   } else {
1613:     PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);
1614:     for (i=0; i<a->mbs; i++) {
1615:       for (j=0; j<bs; j++) {
1616:         PetscViewerASCIIPrintf(viewer,"row %D:",i*bs+j);
1617:         for (k=a->i[i]; k<a->i[i+1]; k++) {
1618:           for (l=0; l<bs; l++) {
1619: #if defined(PETSC_USE_COMPLEX)
1620:             if (PetscImaginaryPart(a->a[bs2*k + l*bs + j]) > 0.0) {
1621:               PetscViewerASCIIPrintf(viewer," (%D, %g + %g i) ",bs*a->j[k]+l,
1622:                                             (double)PetscRealPart(a->a[bs2*k + l*bs + j]),(double)PetscImaginaryPart(a->a[bs2*k + l*bs + j]));
1623:             } else if (PetscImaginaryPart(a->a[bs2*k + l*bs + j]) < 0.0) {
1624:               PetscViewerASCIIPrintf(viewer," (%D, %g - %g i) ",bs*a->j[k]+l,
1625:                                             (double)PetscRealPart(a->a[bs2*k + l*bs + j]),-(double)PetscImaginaryPart(a->a[bs2*k + l*bs + j]));
1626:             } else {
1627:               PetscViewerASCIIPrintf(viewer," (%D, %g) ",bs*a->j[k]+l,(double)PetscRealPart(a->a[bs2*k + l*bs + j]));
1628:             }
1629: #else
1630:             PetscViewerASCIIPrintf(viewer," (%D, %g) ",bs*a->j[k]+l,(double)a->a[bs2*k + l*bs + j]);
1631: #endif
1632:           }
1633:         }
1634:         PetscViewerASCIIPrintf(viewer,"\n");
1635:       }
1636:     }
1637:     PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);
1638:   }
1639:   PetscViewerFlush(viewer);
1640:   return(0);
1641: }

1643: #include <petscdraw.h>
1644: static PetscErrorCode MatView_SeqBAIJ_Draw_Zoom(PetscDraw draw,void *Aa)
1645: {
1646:   Mat               A = (Mat) Aa;
1647:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ*)A->data;
1648:   PetscErrorCode    ierr;
1649:   PetscInt          row,i,j,k,l,mbs=a->mbs,color,bs=A->rmap->bs,bs2=a->bs2;
1650:   PetscReal         xl,yl,xr,yr,x_l,x_r,y_l,y_r;
1651:   MatScalar         *aa;
1652:   PetscViewer       viewer;
1653:   PetscViewerFormat format;

1656:   PetscObjectQuery((PetscObject)A,"Zoomviewer",(PetscObject*)&viewer);
1657:   PetscViewerGetFormat(viewer,&format);
1658:   PetscDrawGetCoordinates(draw,&xl,&yl,&xr,&yr);

1660:   /* loop over matrix elements drawing boxes */

1662:   if (format != PETSC_VIEWER_DRAW_CONTOUR) {
1663:     PetscDrawCollectiveBegin(draw);
1664:     /* Blue for negative, Cyan for zero and  Red for positive */
1665:     color = PETSC_DRAW_BLUE;
1666:     for (i=0,row=0; i<mbs; i++,row+=bs) {
1667:       for (j=a->i[i]; j<a->i[i+1]; j++) {
1668:         y_l = A->rmap->N - row - 1.0; y_r = y_l + 1.0;
1669:         x_l = a->j[j]*bs; x_r = x_l + 1.0;
1670:         aa  = a->a + j*bs2;
1671:         for (k=0; k<bs; k++) {
1672:           for (l=0; l<bs; l++) {
1673:             if (PetscRealPart(*aa++) >=  0.) continue;
1674:             PetscDrawRectangle(draw,x_l+k,y_l-l,x_r+k,y_r-l,color,color,color,color);
1675:           }
1676:         }
1677:       }
1678:     }
1679:     color = PETSC_DRAW_CYAN;
1680:     for (i=0,row=0; i<mbs; i++,row+=bs) {
1681:       for (j=a->i[i]; j<a->i[i+1]; j++) {
1682:         y_l = A->rmap->N - row - 1.0; y_r = y_l + 1.0;
1683:         x_l = a->j[j]*bs; x_r = x_l + 1.0;
1684:         aa  = a->a + j*bs2;
1685:         for (k=0; k<bs; k++) {
1686:           for (l=0; l<bs; l++) {
1687:             if (PetscRealPart(*aa++) != 0.) continue;
1688:             PetscDrawRectangle(draw,x_l+k,y_l-l,x_r+k,y_r-l,color,color,color,color);
1689:           }
1690:         }
1691:       }
1692:     }
1693:     color = PETSC_DRAW_RED;
1694:     for (i=0,row=0; i<mbs; i++,row+=bs) {
1695:       for (j=a->i[i]; j<a->i[i+1]; j++) {
1696:         y_l = A->rmap->N - row - 1.0; y_r = y_l + 1.0;
1697:         x_l = a->j[j]*bs; x_r = x_l + 1.0;
1698:         aa  = a->a + j*bs2;
1699:         for (k=0; k<bs; k++) {
1700:           for (l=0; l<bs; l++) {
1701:             if (PetscRealPart(*aa++) <= 0.) continue;
1702:             PetscDrawRectangle(draw,x_l+k,y_l-l,x_r+k,y_r-l,color,color,color,color);
1703:           }
1704:         }
1705:       }
1706:     }
1707:     PetscDrawCollectiveEnd(draw);
1708:   } else {
1709:     /* use contour shading to indicate magnitude of values */
1710:     /* first determine max of all nonzero values */
1711:     PetscReal minv = 0.0, maxv = 0.0;
1712:     PetscDraw popup;

1714:     for (i=0; i<a->nz*a->bs2; i++) {
1715:       if (PetscAbsScalar(a->a[i]) > maxv) maxv = PetscAbsScalar(a->a[i]);
1716:     }
1717:     if (minv >= maxv) maxv = minv + PETSC_SMALL;
1718:     PetscDrawGetPopup(draw,&popup);
1719:     PetscDrawScalePopup(popup,0.0,maxv);

1721:     PetscDrawCollectiveBegin(draw);
1722:     for (i=0,row=0; i<mbs; i++,row+=bs) {
1723:       for (j=a->i[i]; j<a->i[i+1]; j++) {
1724:         y_l = A->rmap->N - row - 1.0; y_r = y_l + 1.0;
1725:         x_l = a->j[j]*bs; x_r = x_l + 1.0;
1726:         aa  = a->a + j*bs2;
1727:         for (k=0; k<bs; k++) {
1728:           for (l=0; l<bs; l++) {
1729:             MatScalar v = *aa++;
1730:             color = PetscDrawRealToColor(PetscAbsScalar(v),minv,maxv);
1731:             PetscDrawRectangle(draw,x_l+k,y_l-l,x_r+k,y_r-l,color,color,color,color);
1732:           }
1733:         }
1734:       }
1735:     }
1736:     PetscDrawCollectiveEnd(draw);
1737:   }
1738:   return(0);
1739: }

1741: static PetscErrorCode MatView_SeqBAIJ_Draw(Mat A,PetscViewer viewer)
1742: {
1744:   PetscReal      xl,yl,xr,yr,w,h;
1745:   PetscDraw      draw;
1746:   PetscBool      isnull;

1749:   PetscViewerDrawGetDraw(viewer,0,&draw);
1750:   PetscDrawIsNull(draw,&isnull);
1751:   if (isnull) return(0);

1753:   xr   = A->cmap->n; yr = A->rmap->N; h = yr/10.0; w = xr/10.0;
1754:   xr  += w;          yr += h;        xl = -w;     yl = -h;
1755:   PetscDrawSetCoordinates(draw,xl,yl,xr,yr);
1756:   PetscObjectCompose((PetscObject)A,"Zoomviewer",(PetscObject)viewer);
1757:   PetscDrawZoom(draw,MatView_SeqBAIJ_Draw_Zoom,A);
1758:   PetscObjectCompose((PetscObject)A,"Zoomviewer",NULL);
1759:   PetscDrawSave(draw);
1760:   return(0);
1761: }

1763: PetscErrorCode MatView_SeqBAIJ(Mat A,PetscViewer viewer)
1764: {
1766:   PetscBool      iascii,isbinary,isdraw;

1769:   PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);
1770:   PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERBINARY,&isbinary);
1771:   PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERDRAW,&isdraw);
1772:   if (iascii) {
1773:     MatView_SeqBAIJ_ASCII(A,viewer);
1774:   } else if (isbinary) {
1775:     MatView_SeqBAIJ_Binary(A,viewer);
1776:   } else if (isdraw) {
1777:     MatView_SeqBAIJ_Draw(A,viewer);
1778:   } else {
1779:     Mat B;
1780:     MatConvert(A,MATSEQAIJ,MAT_INITIAL_MATRIX,&B);
1781:     MatView(B,viewer);
1782:     MatDestroy(&B);
1783:   }
1784:   return(0);
1785: }

1787: PetscErrorCode MatGetValues_SeqBAIJ(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],PetscScalar v[])
1788: {
1789:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1790:   PetscInt    *rp,k,low,high,t,row,nrow,i,col,l,*aj = a->j;
1791:   PetscInt    *ai = a->i,*ailen = a->ilen;
1792:   PetscInt    brow,bcol,ridx,cidx,bs=A->rmap->bs,bs2=a->bs2;
1793:   MatScalar   *ap,*aa = a->a;

1796:   for (k=0; k<m; k++) { /* loop over rows */
1797:     row = im[k]; brow = row/bs;
1798:     if (row < 0) {v += n; continue;} /* SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative row"); */
1799:     if (row >= A->rmap->N) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row %D too large", row);
1800:     rp   = aj + ai[brow]; ap = aa + bs2*ai[brow];
1801:     nrow = ailen[brow];
1802:     for (l=0; l<n; l++) { /* loop over columns */
1803:       if (in[l] < 0) {v++; continue;} /* SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative column"); */
1804:       if (in[l] >= A->cmap->n) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Column %D too large", in[l]);
1805:       col  = in[l];
1806:       bcol = col/bs;
1807:       cidx = col%bs;
1808:       ridx = row%bs;
1809:       high = nrow;
1810:       low  = 0; /* assume unsorted */
1811:       while (high-low > 5) {
1812:         t = (low+high)/2;
1813:         if (rp[t] > bcol) high = t;
1814:         else             low  = t;
1815:       }
1816:       for (i=low; i<high; i++) {
1817:         if (rp[i] > bcol) break;
1818:         if (rp[i] == bcol) {
1819:           *v++ = ap[bs2*i+bs*cidx+ridx];
1820:           goto finished;
1821:         }
1822:       }
1823:       *v++ = 0.0;
1824: finished:;
1825:     }
1826:   }
1827:   return(0);
1828: }

1830: PetscErrorCode MatSetValuesBlocked_SeqBAIJ(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],const PetscScalar v[],InsertMode is)
1831: {
1832:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1833:   PetscInt          *rp,k,low,high,t,ii,jj,row,nrow,i,col,l,rmax,N,lastcol = -1;
1834:   PetscInt          *imax=a->imax,*ai=a->i,*ailen=a->ilen;
1835:   PetscErrorCode    ierr;
1836:   PetscInt          *aj        =a->j,nonew=a->nonew,bs2=a->bs2,bs=A->rmap->bs,stepval;
1837:   PetscBool         roworiented=a->roworiented;
1838:   const PetscScalar *value     = v;
1839:   MatScalar         *ap=NULL,*aa = a->a,*bap;

1842:   if (roworiented) {
1843:     stepval = (n-1)*bs;
1844:   } else {
1845:     stepval = (m-1)*bs;
1846:   }
1847:   for (k=0; k<m; k++) { /* loop over added rows */
1848:     row = im[k];
1849:     if (row < 0) continue;
1850:     if (PetscUnlikelyDebug(row >= a->mbs)) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Block row index too large %D max %D",row,a->mbs-1);
1851:     rp   = aj + ai[row];
1852:     if (!A->structure_only) ap = aa + bs2*ai[row];
1853:     rmax = imax[row];
1854:     nrow = ailen[row];
1855:     low  = 0;
1856:     high = nrow;
1857:     for (l=0; l<n; l++) { /* loop over added columns */
1858:       if (in[l] < 0) continue;
1859:       if (PetscUnlikelyDebug(in[l] >= a->nbs)) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Block column index too large %D max %D",in[l],a->nbs-1);
1860:       col = in[l];
1861:       if (!A->structure_only) {
1862:         if (roworiented) {
1863:           value = v + (k*(stepval+bs) + l)*bs;
1864:         } else {
1865:           value = v + (l*(stepval+bs) + k)*bs;
1866:         }
1867:       }
1868:       if (col <= lastcol) low = 0;
1869:       else high = nrow;
1870:       lastcol = col;
1871:       while (high-low > 7) {
1872:         t = (low+high)/2;
1873:         if (rp[t] > col) high = t;
1874:         else             low  = t;
1875:       }
1876:       for (i=low; i<high; i++) {
1877:         if (rp[i] > col) break;
1878:         if (rp[i] == col) {
1879:           if (A->structure_only) goto noinsert2;
1880:           bap = ap +  bs2*i;
1881:           if (roworiented) {
1882:             if (is == ADD_VALUES) {
1883:               for (ii=0; ii<bs; ii++,value+=stepval) {
1884:                 for (jj=ii; jj<bs2; jj+=bs) {
1885:                   bap[jj] += *value++;
1886:                 }
1887:               }
1888:             } else {
1889:               for (ii=0; ii<bs; ii++,value+=stepval) {
1890:                 for (jj=ii; jj<bs2; jj+=bs) {
1891:                   bap[jj] = *value++;
1892:                 }
1893:               }
1894:             }
1895:           } else {
1896:             if (is == ADD_VALUES) {
1897:               for (ii=0; ii<bs; ii++,value+=bs+stepval) {
1898:                 for (jj=0; jj<bs; jj++) {
1899:                   bap[jj] += value[jj];
1900:                 }
1901:                 bap += bs;
1902:               }
1903:             } else {
1904:               for (ii=0; ii<bs; ii++,value+=bs+stepval) {
1905:                 for (jj=0; jj<bs; jj++) {
1906:                   bap[jj]  = value[jj];
1907:                 }
1908:                 bap += bs;
1909:               }
1910:             }
1911:           }
1912:           goto noinsert2;
1913:         }
1914:       }
1915:       if (nonew == 1) goto noinsert2;
1916:       if (nonew == -1) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Inserting a new blocked index new nonzero block (%D, %D) in the matrix", row, col);
1917:       if (A->structure_only) {
1918:         MatSeqXAIJReallocateAIJ_structure_only(A,a->mbs,bs2,nrow,row,col,rmax,ai,aj,rp,imax,nonew,MatScalar);
1919:       } else {
1920:         MatSeqXAIJReallocateAIJ(A,a->mbs,bs2,nrow,row,col,rmax,aa,ai,aj,rp,ap,imax,nonew,MatScalar);
1921:       }
1922:       N = nrow++ - 1; high++;
1923:       /* shift up all the later entries in this row */
1924:       PetscArraymove(rp+i+1,rp+i,N-i+1);
1925:       rp[i] = col;
1926:       if (!A->structure_only) {
1927:         PetscArraymove(ap+bs2*(i+1),ap+bs2*i,bs2*(N-i+1));
1928:         bap   = ap +  bs2*i;
1929:         if (roworiented) {
1930:           for (ii=0; ii<bs; ii++,value+=stepval) {
1931:             for (jj=ii; jj<bs2; jj+=bs) {
1932:               bap[jj] = *value++;
1933:             }
1934:           }
1935:         } else {
1936:           for (ii=0; ii<bs; ii++,value+=stepval) {
1937:             for (jj=0; jj<bs; jj++) {
1938:               *bap++ = *value++;
1939:             }
1940:           }
1941:         }
1942:       }
1943: noinsert2:;
1944:       low = i;
1945:     }
1946:     ailen[row] = nrow;
1947:   }
1948:   return(0);
1949: }

1951: PetscErrorCode MatAssemblyEnd_SeqBAIJ(Mat A,MatAssemblyType mode)
1952: {
1953:   Mat_SeqBAIJ    *a     = (Mat_SeqBAIJ*)A->data;
1954:   PetscInt       fshift = 0,i,*ai = a->i,*aj = a->j,*imax = a->imax;
1955:   PetscInt       m      = A->rmap->N,*ip,N,*ailen = a->ilen;
1957:   PetscInt       mbs  = a->mbs,bs2 = a->bs2,rmax = 0;
1958:   MatScalar      *aa  = a->a,*ap;
1959:   PetscReal      ratio=0.6;

1962:   if (mode == MAT_FLUSH_ASSEMBLY) return(0);

1964:   if (m) rmax = ailen[0];
1965:   for (i=1; i<mbs; i++) {
1966:     /* move each row back by the amount of empty slots (fshift) before it*/
1967:     fshift += imax[i-1] - ailen[i-1];
1968:     rmax    = PetscMax(rmax,ailen[i]);
1969:     if (fshift) {
1970:       ip = aj + ai[i];
1971:       ap = aa + bs2*ai[i];
1972:       N  = ailen[i];
1973:       PetscArraymove(ip-fshift,ip,N);
1974:       if (!A->structure_only) {
1975:         PetscArraymove(ap-bs2*fshift,ap,bs2*N);
1976:       }
1977:     }
1978:     ai[i] = ai[i-1] + ailen[i-1];
1979:   }
1980:   if (mbs) {
1981:     fshift += imax[mbs-1] - ailen[mbs-1];
1982:     ai[mbs] = ai[mbs-1] + ailen[mbs-1];
1983:   }

1985:   /* reset ilen and imax for each row */
1986:   a->nonzerorowcnt = 0;
1987:   if (A->structure_only) {
1988:     PetscFree2(a->imax,a->ilen);
1989:   } else { /* !A->structure_only */
1990:     for (i=0; i<mbs; i++) {
1991:       ailen[i] = imax[i] = ai[i+1] - ai[i];
1992:       a->nonzerorowcnt += ((ai[i+1] - ai[i]) > 0);
1993:     }
1994:   }
1995:   a->nz = ai[mbs];

1997:   /* diagonals may have moved, so kill the diagonal pointers */
1998:   a->idiagvalid = PETSC_FALSE;
1999:   if (fshift && a->diag) {
2000:     PetscFree(a->diag);
2001:     PetscLogObjectMemory((PetscObject)A,-(mbs+1)*sizeof(PetscInt));
2002:     a->diag = NULL;
2003:   }
2004:   if (fshift && a->nounused == -1) SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_PLIB, "Unused space detected in matrix: %D X %D block size %D, %D unneeded", m, A->cmap->n, A->rmap->bs, fshift*bs2);
2005:   PetscInfo5(A,"Matrix size: %D X %D, block size %D; storage space: %D unneeded, %D used\n",m,A->cmap->n,A->rmap->bs,fshift*bs2,a->nz*bs2);
2006:   PetscInfo1(A,"Number of mallocs during MatSetValues is %D\n",a->reallocs);
2007:   PetscInfo1(A,"Most nonzeros blocks in any row is %D\n",rmax);

2009:   A->info.mallocs    += a->reallocs;
2010:   a->reallocs         = 0;
2011:   A->info.nz_unneeded = (PetscReal)fshift*bs2;
2012:   a->rmax             = rmax;

2014:   if (!A->structure_only) {
2015:     MatCheckCompressedRow(A,a->nonzerorowcnt,&a->compressedrow,a->i,mbs,ratio);
2016:   }
2017:   return(0);
2018: }

2020: /*
2021:    This function returns an array of flags which indicate the locations of contiguous
2022:    blocks that should be zeroed. for eg: if bs = 3  and is = [0,1,2,3,5,6,7,8,9]
2023:    then the resulting sizes = [3,1,1,3,1] corresponding to sets [(0,1,2),(3),(5),(6,7,8),(9)]
2024:    Assume: sizes should be long enough to hold all the values.
2025: */
2026: static PetscErrorCode MatZeroRows_SeqBAIJ_Check_Blocks(PetscInt idx[],PetscInt n,PetscInt bs,PetscInt sizes[], PetscInt *bs_max)
2027: {
2028:   PetscInt  i,j,k,row;
2029:   PetscBool flg;

2032:   for (i=0,j=0; i<n; j++) {
2033:     row = idx[i];
2034:     if (row%bs!=0) { /* Not the beginning of a block */
2035:       sizes[j] = 1;
2036:       i++;
2037:     } else if (i+bs > n) { /* complete block doesn't exist (at idx end) */
2038:       sizes[j] = 1;         /* Also makes sure atleast 'bs' values exist for next else */
2039:       i++;
2040:     } else { /* Begining of the block, so check if the complete block exists */
2041:       flg = PETSC_TRUE;
2042:       for (k=1; k<bs; k++) {
2043:         if (row+k != idx[i+k]) { /* break in the block */
2044:           flg = PETSC_FALSE;
2045:           break;
2046:         }
2047:       }
2048:       if (flg) { /* No break in the bs */
2049:         sizes[j] = bs;
2050:         i       += bs;
2051:       } else {
2052:         sizes[j] = 1;
2053:         i++;
2054:       }
2055:     }
2056:   }
2057:   *bs_max = j;
2058:   return(0);
2059: }

2061: PetscErrorCode MatZeroRows_SeqBAIJ(Mat A,PetscInt is_n,const PetscInt is_idx[],PetscScalar diag,Vec x, Vec b)
2062: {
2063:   Mat_SeqBAIJ       *baij=(Mat_SeqBAIJ*)A->data;
2064:   PetscErrorCode    ierr;
2065:   PetscInt          i,j,k,count,*rows;
2066:   PetscInt          bs=A->rmap->bs,bs2=baij->bs2,*sizes,row,bs_max;
2067:   PetscScalar       zero = 0.0;
2068:   MatScalar         *aa;
2069:   const PetscScalar *xx;
2070:   PetscScalar       *bb;

2073:   /* fix right hand side if needed */
2074:   if (x && b) {
2075:     VecGetArrayRead(x,&xx);
2076:     VecGetArray(b,&bb);
2077:     for (i=0; i<is_n; i++) {
2078:       bb[is_idx[i]] = diag*xx[is_idx[i]];
2079:     }
2080:     VecRestoreArrayRead(x,&xx);
2081:     VecRestoreArray(b,&bb);
2082:   }

2084:   /* Make a copy of the IS and  sort it */
2085:   /* allocate memory for rows,sizes */
2086:   PetscMalloc2(is_n,&rows,2*is_n,&sizes);

2088:   /* copy IS values to rows, and sort them */
2089:   for (i=0; i<is_n; i++) rows[i] = is_idx[i];
2090:   PetscSortInt(is_n,rows);

2092:   if (baij->keepnonzeropattern) {
2093:     for (i=0; i<is_n; i++) sizes[i] = 1;
2094:     bs_max          = is_n;
2095:   } else {
2096:     MatZeroRows_SeqBAIJ_Check_Blocks(rows,is_n,bs,sizes,&bs_max);
2097:     A->nonzerostate++;
2098:   }

2100:   for (i=0,j=0; i<bs_max; j+=sizes[i],i++) {
2101:     row = rows[j];
2102:     if (row < 0 || row > A->rmap->N) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"row %D out of range",row);
2103:     count = (baij->i[row/bs +1] - baij->i[row/bs])*bs;
2104:     aa    = ((MatScalar*)(baij->a)) + baij->i[row/bs]*bs2 + (row%bs);
2105:     if (sizes[i] == bs && !baij->keepnonzeropattern) {
2106:       if (diag != (PetscScalar)0.0) {
2107:         if (baij->ilen[row/bs] > 0) {
2108:           baij->ilen[row/bs]       = 1;
2109:           baij->j[baij->i[row/bs]] = row/bs;

2111:           PetscArrayzero(aa,count*bs);
2112:         }
2113:         /* Now insert all the diagonal values for this bs */
2114:         for (k=0; k<bs; k++) {
2115:           (*A->ops->setvalues)(A,1,rows+j+k,1,rows+j+k,&diag,INSERT_VALUES);
2116:         }
2117:       } else { /* (diag == 0.0) */
2118:         baij->ilen[row/bs] = 0;
2119:       } /* end (diag == 0.0) */
2120:     } else { /* (sizes[i] != bs) */
2121:       if (PetscUnlikelyDebug(sizes[i] != 1)) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal Error. Value should be 1");
2122:       for (k=0; k<count; k++) {
2123:         aa[0] =  zero;
2124:         aa   += bs;
2125:       }
2126:       if (diag != (PetscScalar)0.0) {
2127:         (*A->ops->setvalues)(A,1,rows+j,1,rows+j,&diag,INSERT_VALUES);
2128:       }
2129:     }
2130:   }

2132:   PetscFree2(rows,sizes);
2133:   MatAssemblyEnd_SeqBAIJ(A,MAT_FINAL_ASSEMBLY);
2134:   return(0);
2135: }

2137: PetscErrorCode MatZeroRowsColumns_SeqBAIJ(Mat A,PetscInt is_n,const PetscInt is_idx[],PetscScalar diag,Vec x, Vec b)
2138: {
2139:   Mat_SeqBAIJ       *baij=(Mat_SeqBAIJ*)A->data;
2140:   PetscErrorCode    ierr;
2141:   PetscInt          i,j,k,count;
2142:   PetscInt          bs   =A->rmap->bs,bs2=baij->bs2,row,col;
2143:   PetscScalar       zero = 0.0;
2144:   MatScalar         *aa;
2145:   const PetscScalar *xx;
2146:   PetscScalar       *bb;
2147:   PetscBool         *zeroed,vecs = PETSC_FALSE;

2150:   /* fix right hand side if needed */
2151:   if (x && b) {
2152:     VecGetArrayRead(x,&xx);
2153:     VecGetArray(b,&bb);
2154:     vecs = PETSC_TRUE;
2155:   }

2157:   /* zero the columns */
2158:   PetscCalloc1(A->rmap->n,&zeroed);
2159:   for (i=0; i<is_n; i++) {
2160:     if (is_idx[i] < 0 || is_idx[i] >= A->rmap->N) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"row %D out of range",is_idx[i]);
2161:     zeroed[is_idx[i]] = PETSC_TRUE;
2162:   }
2163:   for (i=0; i<A->rmap->N; i++) {
2164:     if (!zeroed[i]) {
2165:       row = i/bs;
2166:       for (j=baij->i[row]; j<baij->i[row+1]; j++) {
2167:         for (k=0; k<bs; k++) {
2168:           col = bs*baij->j[j] + k;
2169:           if (zeroed[col]) {
2170:             aa = ((MatScalar*)(baij->a)) + j*bs2 + (i%bs) + bs*k;
2171:             if (vecs) bb[i] -= aa[0]*xx[col];
2172:             aa[0] = 0.0;
2173:           }
2174:         }
2175:       }
2176:     } else if (vecs) bb[i] = diag*xx[i];
2177:   }
2178:   PetscFree(zeroed);
2179:   if (vecs) {
2180:     VecRestoreArrayRead(x,&xx);
2181:     VecRestoreArray(b,&bb);
2182:   }

2184:   /* zero the rows */
2185:   for (i=0; i<is_n; i++) {
2186:     row   = is_idx[i];
2187:     count = (baij->i[row/bs +1] - baij->i[row/bs])*bs;
2188:     aa    = ((MatScalar*)(baij->a)) + baij->i[row/bs]*bs2 + (row%bs);
2189:     for (k=0; k<count; k++) {
2190:       aa[0] =  zero;
2191:       aa   += bs;
2192:     }
2193:     if (diag != (PetscScalar)0.0) {
2194:       (*A->ops->setvalues)(A,1,&row,1,&row,&diag,INSERT_VALUES);
2195:     }
2196:   }
2197:   MatAssemblyEnd_SeqBAIJ(A,MAT_FINAL_ASSEMBLY);
2198:   return(0);
2199: }

2201: PetscErrorCode MatSetValues_SeqBAIJ(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],const PetscScalar v[],InsertMode is)
2202: {
2203:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2204:   PetscInt       *rp,k,low,high,t,ii,row,nrow,i,col,l,rmax,N,lastcol = -1;
2205:   PetscInt       *imax=a->imax,*ai=a->i,*ailen=a->ilen;
2206:   PetscInt       *aj  =a->j,nonew=a->nonew,bs=A->rmap->bs,brow,bcol;
2208:   PetscInt       ridx,cidx,bs2=a->bs2;
2209:   PetscBool      roworiented=a->roworiented;
2210:   MatScalar      *ap=NULL,value=0.0,*aa=a->a,*bap;

2213:   for (k=0; k<m; k++) { /* loop over added rows */
2214:     row  = im[k];
2215:     brow = row/bs;
2216:     if (row < 0) continue;
2217:     if (PetscUnlikelyDebug(row >= A->rmap->N)) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row too large: row %D max %D",row,A->rmap->N-1);
2218:     rp   = aj + ai[brow];
2219:     if (!A->structure_only) ap = aa + bs2*ai[brow];
2220:     rmax = imax[brow];
2221:     nrow = ailen[brow];
2222:     low  = 0;
2223:     high = nrow;
2224:     for (l=0; l<n; l++) { /* loop over added columns */
2225:       if (in[l] < 0) continue;
2226:       if (PetscUnlikelyDebug(in[l] >= A->cmap->n)) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Column too large: col %D max %D",in[l],A->cmap->n-1);
2227:       col  = in[l]; bcol = col/bs;
2228:       ridx = row % bs; cidx = col % bs;
2229:       if (!A->structure_only) {
2230:         if (roworiented) {
2231:           value = v[l + k*n];
2232:         } else {
2233:           value = v[k + l*m];
2234:         }
2235:       }
2236:       if (col <= lastcol) low = 0; else high = nrow;
2237:       lastcol = col;
2238:       while (high-low > 7) {
2239:         t = (low+high)/2;
2240:         if (rp[t] > bcol) high = t;
2241:         else              low  = t;
2242:       }
2243:       for (i=low; i<high; i++) {
2244:         if (rp[i] > bcol) break;
2245:         if (rp[i] == bcol) {
2246:           bap = ap +  bs2*i + bs*cidx + ridx;
2247:           if (!A->structure_only) {
2248:             if (is == ADD_VALUES) *bap += value;
2249:             else                  *bap  = value;
2250:           }
2251:           goto noinsert1;
2252:         }
2253:       }
2254:       if (nonew == 1) goto noinsert1;
2255:       if (nonew == -1) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Inserting a new nonzero (%D, %D) in the matrix", row, col);
2256:       if (A->structure_only) {
2257:         MatSeqXAIJReallocateAIJ_structure_only(A,a->mbs,bs2,nrow,brow,bcol,rmax,ai,aj,rp,imax,nonew,MatScalar);
2258:       } else {
2259:         MatSeqXAIJReallocateAIJ(A,a->mbs,bs2,nrow,brow,bcol,rmax,aa,ai,aj,rp,ap,imax,nonew,MatScalar);
2260:       }
2261:       N = nrow++ - 1; high++;
2262:       /* shift up all the later entries in this row */
2263:       PetscArraymove(rp+i+1,rp+i,N-i+1);
2264:       rp[i] = bcol;
2265:       if (!A->structure_only) {
2266:         PetscArraymove(ap+bs2*(i+1),ap+bs2*i,bs2*(N-i+1));
2267:         PetscArrayzero(ap+bs2*i,bs2);
2268:         ap[bs2*i + bs*cidx + ridx] = value;
2269:       }
2270:       a->nz++;
2271:       A->nonzerostate++;
2272: noinsert1:;
2273:       low = i;
2274:     }
2275:     ailen[brow] = nrow;
2276:   }
2277:   return(0);
2278: }

2280: PetscErrorCode MatILUFactor_SeqBAIJ(Mat inA,IS row,IS col,const MatFactorInfo *info)
2281: {
2282:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)inA->data;
2283:   Mat            outA;
2285:   PetscBool      row_identity,col_identity;

2288:   if (info->levels != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Only levels = 0 supported for in-place ILU");
2289:   ISIdentity(row,&row_identity);
2290:   ISIdentity(col,&col_identity);
2291:   if (!row_identity || !col_identity) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Row and column permutations must be identity for in-place ILU");

2293:   outA            = inA;
2294:   inA->factortype = MAT_FACTOR_LU;
2295:   PetscFree(inA->solvertype);
2296:   PetscStrallocpy(MATSOLVERPETSC,&inA->solvertype);

2298:   MatMarkDiagonal_SeqBAIJ(inA);

2300:   PetscObjectReference((PetscObject)row);
2301:   ISDestroy(&a->row);
2302:   a->row = row;
2303:   PetscObjectReference((PetscObject)col);
2304:   ISDestroy(&a->col);
2305:   a->col = col;

2307:   /* Create the invert permutation so that it can be used in MatLUFactorNumeric() */
2308:   ISDestroy(&a->icol);
2309:   ISInvertPermutation(col,PETSC_DECIDE,&a->icol);
2310:   PetscLogObjectParent((PetscObject)inA,(PetscObject)a->icol);

2312:   MatSeqBAIJSetNumericFactorization_inplace(inA,(PetscBool)(row_identity && col_identity));
2313:   if (!a->solve_work) {
2314:     PetscMalloc1(inA->rmap->N+inA->rmap->bs,&a->solve_work);
2315:     PetscLogObjectMemory((PetscObject)inA,(inA->rmap->N+inA->rmap->bs)*sizeof(PetscScalar));
2316:   }
2317:   MatLUFactorNumeric(outA,inA,info);
2318:   return(0);
2319: }

2321: PetscErrorCode  MatSeqBAIJSetColumnIndices_SeqBAIJ(Mat mat,PetscInt *indices)
2322: {
2323:   Mat_SeqBAIJ *baij = (Mat_SeqBAIJ*)mat->data;
2324:   PetscInt    i,nz,mbs;

2327:   nz  = baij->maxnz;
2328:   mbs = baij->mbs;
2329:   for (i=0; i<nz; i++) {
2330:     baij->j[i] = indices[i];
2331:   }
2332:   baij->nz = nz;
2333:   for (i=0; i<mbs; i++) {
2334:     baij->ilen[i] = baij->imax[i];
2335:   }
2336:   return(0);
2337: }

2339: /*@
2340:     MatSeqBAIJSetColumnIndices - Set the column indices for all the rows
2341:        in the matrix.

2343:   Input Parameters:
2344: +  mat - the SeqBAIJ matrix
2345: -  indices - the column indices

2347:   Level: advanced

2349:   Notes:
2350:     This can be called if you have precomputed the nonzero structure of the
2351:   matrix and want to provide it to the matrix object to improve the performance
2352:   of the MatSetValues() operation.

2354:     You MUST have set the correct numbers of nonzeros per row in the call to
2355:   MatCreateSeqBAIJ(), and the columns indices MUST be sorted.

2357:     MUST be called before any calls to MatSetValues();

2359: @*/
2360: PetscErrorCode  MatSeqBAIJSetColumnIndices(Mat mat,PetscInt *indices)
2361: {

2367:   PetscUseMethod(mat,"MatSeqBAIJSetColumnIndices_C",(Mat,PetscInt*),(mat,indices));
2368:   return(0);
2369: }

2371: PetscErrorCode MatGetRowMaxAbs_SeqBAIJ(Mat A,Vec v,PetscInt idx[])
2372: {
2373:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2375:   PetscInt       i,j,n,row,bs,*ai,*aj,mbs;
2376:   PetscReal      atmp;
2377:   PetscScalar    *x,zero = 0.0;
2378:   MatScalar      *aa;
2379:   PetscInt       ncols,brow,krow,kcol;

2382:   if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
2383:   bs  = A->rmap->bs;
2384:   aa  = a->a;
2385:   ai  = a->i;
2386:   aj  = a->j;
2387:   mbs = a->mbs;

2389:   VecSet(v,zero);
2390:   VecGetArray(v,&x);
2391:   VecGetLocalSize(v,&n);
2392:   if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
2393:   for (i=0; i<mbs; i++) {
2394:     ncols = ai[1] - ai[0]; ai++;
2395:     brow  = bs*i;
2396:     for (j=0; j<ncols; j++) {
2397:       for (kcol=0; kcol<bs; kcol++) {
2398:         for (krow=0; krow<bs; krow++) {
2399:           atmp = PetscAbsScalar(*aa);aa++;
2400:           row  = brow + krow;   /* row index */
2401:           if (PetscAbsScalar(x[row]) < atmp) {x[row] = atmp; if (idx) idx[row] = bs*(*aj) + kcol;}
2402:         }
2403:       }
2404:       aj++;
2405:     }
2406:   }
2407:   VecRestoreArray(v,&x);
2408:   return(0);
2409: }

2411: PetscErrorCode MatCopy_SeqBAIJ(Mat A,Mat B,MatStructure str)
2412: {

2416:   /* If the two matrices have the same copy implementation, use fast copy. */
2417:   if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) {
2418:     Mat_SeqBAIJ *a  = (Mat_SeqBAIJ*)A->data;
2419:     Mat_SeqBAIJ *b  = (Mat_SeqBAIJ*)B->data;
2420:     PetscInt    ambs=a->mbs,bmbs=b->mbs,abs=A->rmap->bs,bbs=B->rmap->bs,bs2=abs*abs;

2422:     if (a->i[ambs] != b->i[bmbs]) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Number of nonzero blocks in matrices A %D and B %D are different",a->i[ambs],b->i[bmbs]);
2423:     if (abs != bbs) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Block size A %D and B %D are different",abs,bbs);
2424:     PetscArraycpy(b->a,a->a,bs2*a->i[ambs]);
2425:     PetscObjectStateIncrease((PetscObject)B);
2426:   } else {
2427:     MatCopy_Basic(A,B,str);
2428:   }
2429:   return(0);
2430: }

2432: PetscErrorCode MatSetUp_SeqBAIJ(Mat A)
2433: {

2437:   MatSeqBAIJSetPreallocation(A,A->rmap->bs,PETSC_DEFAULT,NULL);
2438:   return(0);
2439: }

2441: static PetscErrorCode MatSeqBAIJGetArray_SeqBAIJ(Mat A,PetscScalar *array[])
2442: {
2443:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;

2446:   *array = a->a;
2447:   return(0);
2448: }

2450: static PetscErrorCode MatSeqBAIJRestoreArray_SeqBAIJ(Mat A,PetscScalar *array[])
2451: {
2453:   *array = NULL;
2454:   return(0);
2455: }

2457: PetscErrorCode MatAXPYGetPreallocation_SeqBAIJ(Mat Y,Mat X,PetscInt *nnz)
2458: {
2459:   PetscInt       bs = Y->rmap->bs,mbs = Y->rmap->N/bs;
2460:   Mat_SeqBAIJ    *x = (Mat_SeqBAIJ*)X->data;
2461:   Mat_SeqBAIJ    *y = (Mat_SeqBAIJ*)Y->data;

2465:   /* Set the number of nonzeros in the new matrix */
2466:   MatAXPYGetPreallocation_SeqX_private(mbs,x->i,x->j,y->i,y->j,nnz);
2467:   return(0);
2468: }

2470: PetscErrorCode MatAXPY_SeqBAIJ(Mat Y,PetscScalar a,Mat X,MatStructure str)
2471: {
2472:   Mat_SeqBAIJ    *x = (Mat_SeqBAIJ*)X->data,*y = (Mat_SeqBAIJ*)Y->data;
2474:   PetscInt       bs=Y->rmap->bs,bs2=bs*bs;
2475:   PetscBLASInt   one=1;

2478:   if (str == SAME_NONZERO_PATTERN) {
2479:     PetscScalar  alpha = a;
2480:     PetscBLASInt bnz;
2481:     PetscBLASIntCast(x->nz*bs2,&bnz);
2482:     PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&bnz,&alpha,x->a,&one,y->a,&one));
2483:     PetscObjectStateIncrease((PetscObject)Y);
2484:   } else if (str == SUBSET_NONZERO_PATTERN) { /* nonzeros of X is a subset of Y's */
2485:     MatAXPY_Basic(Y,a,X,str);
2486:   } else {
2487:     Mat      B;
2488:     PetscInt *nnz;
2489:     if (bs != X->rmap->bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Matrices must have same block size");
2490:     PetscMalloc1(Y->rmap->N,&nnz);
2491:     MatCreate(PetscObjectComm((PetscObject)Y),&B);
2492:     PetscObjectSetName((PetscObject)B,((PetscObject)Y)->name);
2493:     MatSetSizes(B,Y->rmap->n,Y->cmap->n,Y->rmap->N,Y->cmap->N);
2494:     MatSetBlockSizesFromMats(B,Y,Y);
2495:     MatSetType(B,(MatType) ((PetscObject)Y)->type_name);
2496:     MatAXPYGetPreallocation_SeqBAIJ(Y,X,nnz);
2497:     MatSeqBAIJSetPreallocation(B,bs,0,nnz);
2498:     MatAXPY_BasicWithPreallocation(B,Y,a,X,str);
2499:     MatHeaderMerge(Y,&B);
2500:     PetscFree(nnz);
2501:   }
2502:   return(0);
2503: }

2505: PETSC_INTERN PetscErrorCode MatConjugate_SeqBAIJ(Mat A)
2506: {
2507: #if defined(PETSC_USE_COMPLEX)
2508:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2509:   PetscInt    i,nz = a->bs2*a->i[a->mbs];
2510:   MatScalar   *aa = a->a;

2513:   for (i=0; i<nz; i++) aa[i] = PetscConj(aa[i]);
2514: #else
2516: #endif
2517:   return(0);
2518: }

2520: PetscErrorCode MatRealPart_SeqBAIJ(Mat A)
2521: {
2522:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2523:   PetscInt    i,nz = a->bs2*a->i[a->mbs];
2524:   MatScalar   *aa = a->a;

2527:   for (i=0; i<nz; i++) aa[i] = PetscRealPart(aa[i]);
2528:   return(0);
2529: }

2531: PetscErrorCode MatImaginaryPart_SeqBAIJ(Mat A)
2532: {
2533:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2534:   PetscInt    i,nz = a->bs2*a->i[a->mbs];
2535:   MatScalar   *aa = a->a;

2538:   for (i=0; i<nz; i++) aa[i] = PetscImaginaryPart(aa[i]);
2539:   return(0);
2540: }

2542: /*
2543:     Code almost identical to MatGetColumnIJ_SeqAIJ() should share common code
2544: */
2545: PetscErrorCode MatGetColumnIJ_SeqBAIJ(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool inodecompressed,PetscInt *nn,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
2546: {
2547:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2549:   PetscInt       bs = A->rmap->bs,i,*collengths,*cia,*cja,n = A->cmap->n/bs,m = A->rmap->n/bs;
2550:   PetscInt       nz = a->i[m],row,*jj,mr,col;

2553:   *nn = n;
2554:   if (!ia) return(0);
2555:   if (symmetric) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not for BAIJ matrices");
2556:   else {
2557:     PetscCalloc1(n,&collengths);
2558:     PetscMalloc1(n+1,&cia);
2559:     PetscMalloc1(nz,&cja);
2560:     jj   = a->j;
2561:     for (i=0; i<nz; i++) {
2562:       collengths[jj[i]]++;
2563:     }
2564:     cia[0] = oshift;
2565:     for (i=0; i<n; i++) {
2566:       cia[i+1] = cia[i] + collengths[i];
2567:     }
2568:     PetscArrayzero(collengths,n);
2569:     jj   = a->j;
2570:     for (row=0; row<m; row++) {
2571:       mr = a->i[row+1] - a->i[row];
2572:       for (i=0; i<mr; i++) {
2573:         col = *jj++;

2575:         cja[cia[col] + collengths[col]++ - oshift] = row + oshift;
2576:       }
2577:     }
2578:     PetscFree(collengths);
2579:     *ia  = cia; *ja = cja;
2580:   }
2581:   return(0);
2582: }

2584: PetscErrorCode MatRestoreColumnIJ_SeqBAIJ(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool inodecompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
2585: {

2589:   if (!ia) return(0);
2590:   PetscFree(*ia);
2591:   PetscFree(*ja);
2592:   return(0);
2593: }

2595: /*
2596:  MatGetColumnIJ_SeqBAIJ_Color() and MatRestoreColumnIJ_SeqBAIJ_Color() are customized from
2597:  MatGetColumnIJ_SeqBAIJ() and MatRestoreColumnIJ_SeqBAIJ() by adding an output
2598:  spidx[], index of a->a, to be used in MatTransposeColoringCreate() and MatFDColoringCreate()
2599:  */
2600: PetscErrorCode MatGetColumnIJ_SeqBAIJ_Color(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool inodecompressed,PetscInt *nn,const PetscInt *ia[],const PetscInt *ja[],PetscInt *spidx[],PetscBool  *done)
2601: {
2602:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2604:   PetscInt       i,*collengths,*cia,*cja,n=a->nbs,m=a->mbs;
2605:   PetscInt       nz = a->i[m],row,*jj,mr,col;
2606:   PetscInt       *cspidx;

2609:   *nn = n;
2610:   if (!ia) return(0);

2612:   PetscCalloc1(n,&collengths);
2613:   PetscMalloc1(n+1,&cia);
2614:   PetscMalloc1(nz,&cja);
2615:   PetscMalloc1(nz,&cspidx);
2616:   jj   = a->j;
2617:   for (i=0; i<nz; i++) {
2618:     collengths[jj[i]]++;
2619:   }
2620:   cia[0] = oshift;
2621:   for (i=0; i<n; i++) {
2622:     cia[i+1] = cia[i] + collengths[i];
2623:   }
2624:   PetscArrayzero(collengths,n);
2625:   jj   = a->j;
2626:   for (row=0; row<m; row++) {
2627:     mr = a->i[row+1] - a->i[row];
2628:     for (i=0; i<mr; i++) {
2629:       col = *jj++;
2630:       cspidx[cia[col] + collengths[col] - oshift] = a->i[row] + i; /* index of a->j */
2631:       cja[cia[col] + collengths[col]++ - oshift]  = row + oshift;
2632:     }
2633:   }
2634:   PetscFree(collengths);
2635:   *ia    = cia;
2636:   *ja    = cja;
2637:   *spidx = cspidx;
2638:   return(0);
2639: }

2641: PetscErrorCode MatRestoreColumnIJ_SeqBAIJ_Color(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool inodecompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscInt *spidx[],PetscBool  *done)
2642: {

2646:   MatRestoreColumnIJ_SeqBAIJ(A,oshift,symmetric,inodecompressed,n,ia,ja,done);
2647:   PetscFree(*spidx);
2648:   return(0);
2649: }

2651: PetscErrorCode MatShift_SeqBAIJ(Mat Y,PetscScalar a)
2652: {
2654:   Mat_SeqBAIJ     *aij = (Mat_SeqBAIJ*)Y->data;

2657:   if (!Y->preallocated || !aij->nz) {
2658:     MatSeqBAIJSetPreallocation(Y,Y->rmap->bs,1,NULL);
2659:   }
2660:   MatShift_Basic(Y,a);
2661:   return(0);
2662: }

2664: /* -------------------------------------------------------------------*/
2665: static struct _MatOps MatOps_Values = {MatSetValues_SeqBAIJ,
2666:                                        MatGetRow_SeqBAIJ,
2667:                                        MatRestoreRow_SeqBAIJ,
2668:                                        MatMult_SeqBAIJ_N,
2669:                                /* 4*/  MatMultAdd_SeqBAIJ_N,
2670:                                        MatMultTranspose_SeqBAIJ,
2671:                                        MatMultTransposeAdd_SeqBAIJ,
2672:                                        NULL,
2673:                                        NULL,
2674:                                        NULL,
2675:                                /* 10*/ NULL,
2676:                                        MatLUFactor_SeqBAIJ,
2677:                                        NULL,
2678:                                        NULL,
2679:                                        MatTranspose_SeqBAIJ,
2680:                                /* 15*/ MatGetInfo_SeqBAIJ,
2681:                                        MatEqual_SeqBAIJ,
2682:                                        MatGetDiagonal_SeqBAIJ,
2683:                                        MatDiagonalScale_SeqBAIJ,
2684:                                        MatNorm_SeqBAIJ,
2685:                                /* 20*/ NULL,
2686:                                        MatAssemblyEnd_SeqBAIJ,
2687:                                        MatSetOption_SeqBAIJ,
2688:                                        MatZeroEntries_SeqBAIJ,
2689:                                /* 24*/ MatZeroRows_SeqBAIJ,
2690:                                        NULL,
2691:                                        NULL,
2692:                                        NULL,
2693:                                        NULL,
2694:                                /* 29*/ MatSetUp_SeqBAIJ,
2695:                                        NULL,
2696:                                        NULL,
2697:                                        NULL,
2698:                                        NULL,
2699:                                /* 34*/ MatDuplicate_SeqBAIJ,
2700:                                        NULL,
2701:                                        NULL,
2702:                                        MatILUFactor_SeqBAIJ,
2703:                                        NULL,
2704:                                /* 39*/ MatAXPY_SeqBAIJ,
2705:                                        MatCreateSubMatrices_SeqBAIJ,
2706:                                        MatIncreaseOverlap_SeqBAIJ,
2707:                                        MatGetValues_SeqBAIJ,
2708:                                        MatCopy_SeqBAIJ,
2709:                                /* 44*/ NULL,
2710:                                        MatScale_SeqBAIJ,
2711:                                        MatShift_SeqBAIJ,
2712:                                        NULL,
2713:                                        MatZeroRowsColumns_SeqBAIJ,
2714:                                /* 49*/ NULL,
2715:                                        MatGetRowIJ_SeqBAIJ,
2716:                                        MatRestoreRowIJ_SeqBAIJ,
2717:                                        MatGetColumnIJ_SeqBAIJ,
2718:                                        MatRestoreColumnIJ_SeqBAIJ,
2719:                                /* 54*/ MatFDColoringCreate_SeqXAIJ,
2720:                                        NULL,
2721:                                        NULL,
2722:                                        NULL,
2723:                                        MatSetValuesBlocked_SeqBAIJ,
2724:                                /* 59*/ MatCreateSubMatrix_SeqBAIJ,
2725:                                        MatDestroy_SeqBAIJ,
2726:                                        MatView_SeqBAIJ,
2727:                                        NULL,
2728:                                        NULL,
2729:                                /* 64*/ NULL,
2730:                                        NULL,
2731:                                        NULL,
2732:                                        NULL,
2733:                                        NULL,
2734:                                /* 69*/ MatGetRowMaxAbs_SeqBAIJ,
2735:                                        NULL,
2736:                                        MatConvert_Basic,
2737:                                        NULL,
2738:                                        NULL,
2739:                                /* 74*/ NULL,
2740:                                        MatFDColoringApply_BAIJ,
2741:                                        NULL,
2742:                                        NULL,
2743:                                        NULL,
2744:                                /* 79*/ NULL,
2745:                                        NULL,
2746:                                        NULL,
2747:                                        NULL,
2748:                                        MatLoad_SeqBAIJ,
2749:                                /* 84*/ NULL,
2750:                                        NULL,
2751:                                        NULL,
2752:                                        NULL,
2753:                                        NULL,
2754:                                /* 89*/ NULL,
2755:                                        NULL,
2756:                                        NULL,
2757:                                        NULL,
2758:                                        NULL,
2759:                                /* 94*/ NULL,
2760:                                        NULL,
2761:                                        NULL,
2762:                                        NULL,
2763:                                        NULL,
2764:                                /* 99*/ NULL,
2765:                                        NULL,
2766:                                        NULL,
2767:                                        MatConjugate_SeqBAIJ,
2768:                                        NULL,
2769:                                /*104*/ NULL,
2770:                                        MatRealPart_SeqBAIJ,
2771:                                        MatImaginaryPart_SeqBAIJ,
2772:                                        NULL,
2773:                                        NULL,
2774:                                /*109*/ NULL,
2775:                                        NULL,
2776:                                        NULL,
2777:                                        NULL,
2778:                                        MatMissingDiagonal_SeqBAIJ,
2779:                                /*114*/ NULL,
2780:                                        NULL,
2781:                                        NULL,
2782:                                        NULL,
2783:                                        NULL,
2784:                                /*119*/ NULL,
2785:                                        NULL,
2786:                                        MatMultHermitianTranspose_SeqBAIJ,
2787:                                        MatMultHermitianTransposeAdd_SeqBAIJ,
2788:                                        NULL,
2789:                                /*124*/ NULL,
2790:                                        MatGetColumnReductions_SeqBAIJ,
2791:                                        MatInvertBlockDiagonal_SeqBAIJ,
2792:                                        NULL,
2793:                                        NULL,
2794:                                /*129*/ NULL,
2795:                                        NULL,
2796:                                        NULL,
2797:                                        NULL,
2798:                                        NULL,
2799:                                /*134*/ NULL,
2800:                                        NULL,
2801:                                        NULL,
2802:                                        NULL,
2803:                                        NULL,
2804:                                /*139*/ MatSetBlockSizes_Default,
2805:                                        NULL,
2806:                                        NULL,
2807:                                        MatFDColoringSetUp_SeqXAIJ,
2808:                                        NULL,
2809:                                 /*144*/MatCreateMPIMatConcatenateSeqMat_SeqBAIJ,
2810:                                        MatDestroySubMatrices_SeqBAIJ
2811: };

2813: PetscErrorCode  MatStoreValues_SeqBAIJ(Mat mat)
2814: {
2815:   Mat_SeqBAIJ    *aij = (Mat_SeqBAIJ*)mat->data;
2816:   PetscInt       nz   = aij->i[aij->mbs]*aij->bs2;

2820:   if (aij->nonew != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");

2822:   /* allocate space for values if not already there */
2823:   if (!aij->saved_values) {
2824:     PetscMalloc1(nz+1,&aij->saved_values);
2825:     PetscLogObjectMemory((PetscObject)mat,(nz+1)*sizeof(PetscScalar));
2826:   }

2828:   /* copy values over */
2829:   PetscArraycpy(aij->saved_values,aij->a,nz);
2830:   return(0);
2831: }

2833: PetscErrorCode  MatRetrieveValues_SeqBAIJ(Mat mat)
2834: {
2835:   Mat_SeqBAIJ    *aij = (Mat_SeqBAIJ*)mat->data;
2837:   PetscInt       nz = aij->i[aij->mbs]*aij->bs2;

2840:   if (aij->nonew != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
2841:   if (!aij->saved_values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatStoreValues(A);first");

2843:   /* copy values over */
2844:   PetscArraycpy(aij->a,aij->saved_values,nz);
2845:   return(0);
2846: }

2848: PETSC_INTERN PetscErrorCode MatConvert_SeqBAIJ_SeqAIJ(Mat, MatType,MatReuse,Mat*);
2849: PETSC_INTERN PetscErrorCode MatConvert_SeqBAIJ_SeqSBAIJ(Mat, MatType,MatReuse,Mat*);

2851: PetscErrorCode  MatSeqBAIJSetPreallocation_SeqBAIJ(Mat B,PetscInt bs,PetscInt nz,PetscInt *nnz)
2852: {
2853:   Mat_SeqBAIJ    *b;
2855:   PetscInt       i,mbs,nbs,bs2;
2856:   PetscBool      flg = PETSC_FALSE,skipallocation = PETSC_FALSE,realalloc = PETSC_FALSE;

2859:   if (nz >= 0 || nnz) realalloc = PETSC_TRUE;
2860:   if (nz == MAT_SKIP_ALLOCATION) {
2861:     skipallocation = PETSC_TRUE;
2862:     nz             = 0;
2863:   }

2865:   MatSetBlockSize(B,PetscAbs(bs));
2866:   PetscLayoutSetUp(B->rmap);
2867:   PetscLayoutSetUp(B->cmap);
2868:   PetscLayoutGetBlockSize(B->rmap,&bs);

2870:   B->preallocated = PETSC_TRUE;

2872:   mbs = B->rmap->n/bs;
2873:   nbs = B->cmap->n/bs;
2874:   bs2 = bs*bs;

2876:   if (mbs*bs!=B->rmap->n || nbs*bs!=B->cmap->n) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows %D, cols %D must be divisible by blocksize %D",B->rmap->N,B->cmap->n,bs);

2878:   if (nz == PETSC_DEFAULT || nz == PETSC_DECIDE) nz = 5;
2879:   if (nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"nz cannot be less than 0: value %D",nz);
2880:   if (nnz) {
2881:     for (i=0; i<mbs; i++) {
2882:       if (nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"nnz cannot be less than 0: local row %D value %D",i,nnz[i]);
2883:       if (nnz[i] > nbs) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"nnz cannot be greater than block row length: local row %D value %D rowlength %D",i,nnz[i],nbs);
2884:     }
2885:   }

2887:   b    = (Mat_SeqBAIJ*)B->data;
2888:   PetscOptionsBegin(PetscObjectComm((PetscObject)B),NULL,"Optimize options for SEQBAIJ matrix 2 ","Mat");
2889:   PetscOptionsBool("-mat_no_unroll","Do not optimize for block size (slow)",NULL,flg,&flg,NULL);
2890:   PetscOptionsEnd();

2892:   if (!flg) {
2893:     switch (bs) {
2894:     case 1:
2895:       B->ops->mult    = MatMult_SeqBAIJ_1;
2896:       B->ops->multadd = MatMultAdd_SeqBAIJ_1;
2897:       break;
2898:     case 2:
2899:       B->ops->mult    = MatMult_SeqBAIJ_2;
2900:       B->ops->multadd = MatMultAdd_SeqBAIJ_2;
2901:       break;
2902:     case 3:
2903:       B->ops->mult    = MatMult_SeqBAIJ_3;
2904:       B->ops->multadd = MatMultAdd_SeqBAIJ_3;
2905:       break;
2906:     case 4:
2907:       B->ops->mult    = MatMult_SeqBAIJ_4;
2908:       B->ops->multadd = MatMultAdd_SeqBAIJ_4;
2909:       break;
2910:     case 5:
2911:       B->ops->mult    = MatMult_SeqBAIJ_5;
2912:       B->ops->multadd = MatMultAdd_SeqBAIJ_5;
2913:       break;
2914:     case 6:
2915:       B->ops->mult    = MatMult_SeqBAIJ_6;
2916:       B->ops->multadd = MatMultAdd_SeqBAIJ_6;
2917:       break;
2918:     case 7:
2919:       B->ops->mult    = MatMult_SeqBAIJ_7;
2920:       B->ops->multadd = MatMultAdd_SeqBAIJ_7;
2921:       break;
2922:     case 9:
2923:     {
2924:       PetscInt version = 1;
2925:       PetscOptionsGetInt(NULL,((PetscObject)B)->prefix,"-mat_baij_mult_version",&version,NULL);
2926:       switch (version) {
2927: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
2928:       case 1:
2929:         B->ops->mult    = MatMult_SeqBAIJ_9_AVX2;
2930:         B->ops->multadd = MatMultAdd_SeqBAIJ_9_AVX2;
2931:         PetscInfo1((PetscObject)B,"Using AVX2 for MatMult for BAIJ for blocksize %D\n",bs);
2932:         break;
2933: #endif
2934:       default:
2935:         B->ops->mult    = MatMult_SeqBAIJ_N;
2936:         B->ops->multadd = MatMultAdd_SeqBAIJ_N;
2937:         PetscInfo1((PetscObject)B,"Using BLAS for MatMult for BAIJ for blocksize %D\n",bs);
2938:         break;
2939:       }
2940:       break;
2941:     }
2942:     case 11:
2943:       B->ops->mult    = MatMult_SeqBAIJ_11;
2944:       B->ops->multadd = MatMultAdd_SeqBAIJ_11;
2945:       break;
2946:     case 12:
2947:     {
2948:       PetscInt version = 1;
2949:       PetscOptionsGetInt(NULL,((PetscObject)B)->prefix,"-mat_baij_mult_version",&version,NULL);
2950:       switch (version) {
2951:       case 1:
2952:         B->ops->mult    = MatMult_SeqBAIJ_12_ver1;
2953:         B->ops->multadd = MatMultAdd_SeqBAIJ_12_ver1;
2954:         PetscInfo2((PetscObject)B,"Using version %D of MatMult for BAIJ for blocksize %D\n",version,bs);
2955:         break;
2956:       case 2:
2957:         B->ops->mult    = MatMult_SeqBAIJ_12_ver2;
2958:         B->ops->multadd = MatMultAdd_SeqBAIJ_12_ver2;
2959:         PetscInfo2((PetscObject)B,"Using version %D of MatMult for BAIJ for blocksize %D\n",version,bs);
2960:         break;
2961: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
2962:       case 3:
2963:         B->ops->mult    = MatMult_SeqBAIJ_12_AVX2;
2964:         B->ops->multadd = MatMultAdd_SeqBAIJ_12_ver1;
2965:         PetscInfo1((PetscObject)B,"Using AVX2 for MatMult for BAIJ for blocksize %D\n",bs);
2966:         break;
2967: #endif
2968:       default:
2969:         B->ops->mult    = MatMult_SeqBAIJ_N;
2970:         B->ops->multadd = MatMultAdd_SeqBAIJ_N;
2971:         PetscInfo1((PetscObject)B,"Using BLAS for MatMult for BAIJ for blocksize %D\n",bs);
2972:         break;
2973:       }
2974:       break;
2975:     }
2976:     case 15:
2977:     {
2978:       PetscInt version = 1;
2979:       PetscOptionsGetInt(NULL,((PetscObject)B)->prefix,"-mat_baij_mult_version",&version,NULL);
2980:       switch (version) {
2981:       case 1:
2982:         B->ops->mult    = MatMult_SeqBAIJ_15_ver1;
2983:         PetscInfo2((PetscObject)B,"Using version %D of MatMult for BAIJ for blocksize %D\n",version,bs);
2984:         break;
2985:       case 2:
2986:         B->ops->mult    = MatMult_SeqBAIJ_15_ver2;
2987:         PetscInfo2((PetscObject)B,"Using version %D of MatMult for BAIJ for blocksize %D\n",version,bs);
2988:         break;
2989:       case 3:
2990:         B->ops->mult    = MatMult_SeqBAIJ_15_ver3;
2991:         PetscInfo2((PetscObject)B,"Using version %D of MatMult for BAIJ for blocksize %D\n",version,bs);
2992:         break;
2993:       case 4:
2994:         B->ops->mult    = MatMult_SeqBAIJ_15_ver4;
2995:         PetscInfo2((PetscObject)B,"Using version %D of MatMult for BAIJ for blocksize %D\n",version,bs);
2996:         break;
2997:       default:
2998:         B->ops->mult    = MatMult_SeqBAIJ_N;
2999:         PetscInfo1((PetscObject)B,"Using BLAS for MatMult for BAIJ for blocksize %D\n",bs);
3000:         break;
3001:       }
3002:       B->ops->multadd = MatMultAdd_SeqBAIJ_N;
3003:       break;
3004:     }
3005:     default:
3006:       B->ops->mult    = MatMult_SeqBAIJ_N;
3007:       B->ops->multadd = MatMultAdd_SeqBAIJ_N;
3008:       PetscInfo1((PetscObject)B,"Using BLAS for MatMult for BAIJ for blocksize %D\n",bs);
3009:       break;
3010:     }
3011:   }
3012:   B->ops->sor = MatSOR_SeqBAIJ;
3013:   b->mbs = mbs;
3014:   b->nbs = nbs;
3015:   if (!skipallocation) {
3016:     if (!b->imax) {
3017:       PetscMalloc2(mbs,&b->imax,mbs,&b->ilen);
3018:       PetscLogObjectMemory((PetscObject)B,2*mbs*sizeof(PetscInt));

3020:       b->free_imax_ilen = PETSC_TRUE;
3021:     }
3022:     /* b->ilen will count nonzeros in each block row so far. */
3023:     for (i=0; i<mbs; i++) b->ilen[i] = 0;
3024:     if (!nnz) {
3025:       if (nz == PETSC_DEFAULT || nz == PETSC_DECIDE) nz = 5;
3026:       else if (nz < 0) nz = 1;
3027:       nz = PetscMin(nz,nbs);
3028:       for (i=0; i<mbs; i++) b->imax[i] = nz;
3029:       PetscIntMultError(nz,mbs,&nz);
3030:     } else {
3031:       PetscInt64 nz64 = 0;
3032:       for (i=0; i<mbs; i++) {b->imax[i] = nnz[i]; nz64 += nnz[i];}
3033:       PetscIntCast(nz64,&nz);
3034:     }

3036:     /* allocate the matrix space */
3037:     MatSeqXAIJFreeAIJ(B,&b->a,&b->j,&b->i);
3038:     if (B->structure_only) {
3039:       PetscMalloc1(nz,&b->j);
3040:       PetscMalloc1(B->rmap->N+1,&b->i);
3041:       PetscLogObjectMemory((PetscObject)B,(B->rmap->N+1)*sizeof(PetscInt)+nz*sizeof(PetscInt));
3042:     } else {
3043:       PetscInt nzbs2 = 0;
3044:       PetscIntMultError(nz,bs2,&nzbs2);
3045:       PetscMalloc3(nzbs2,&b->a,nz,&b->j,B->rmap->N+1,&b->i);
3046:       PetscLogObjectMemory((PetscObject)B,(B->rmap->N+1)*sizeof(PetscInt)+nz*(bs2*sizeof(PetscScalar)+sizeof(PetscInt)));
3047:       PetscArrayzero(b->a,nz*bs2);
3048:     }
3049:     PetscArrayzero(b->j,nz);

3051:     if (B->structure_only) {
3052:       b->singlemalloc = PETSC_FALSE;
3053:       b->free_a       = PETSC_FALSE;
3054:     } else {
3055:       b->singlemalloc = PETSC_TRUE;
3056:       b->free_a       = PETSC_TRUE;
3057:     }
3058:     b->free_ij = PETSC_TRUE;

3060:     b->i[0] = 0;
3061:     for (i=1; i<mbs+1; i++) {
3062:       b->i[i] = b->i[i-1] + b->imax[i-1];
3063:     }

3065:   } else {
3066:     b->free_a  = PETSC_FALSE;
3067:     b->free_ij = PETSC_FALSE;
3068:   }

3070:   b->bs2              = bs2;
3071:   b->mbs              = mbs;
3072:   b->nz               = 0;
3073:   b->maxnz            = nz;
3074:   B->info.nz_unneeded = (PetscReal)b->maxnz*bs2;
3075:   B->was_assembled    = PETSC_FALSE;
3076:   B->assembled        = PETSC_FALSE;
3077:   if (realalloc) {MatSetOption(B,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);}
3078:   return(0);
3079: }

3081: PetscErrorCode MatSeqBAIJSetPreallocationCSR_SeqBAIJ(Mat B,PetscInt bs,const PetscInt ii[],const PetscInt jj[],const PetscScalar V[])
3082: {
3083:   PetscInt       i,m,nz,nz_max=0,*nnz;
3084:   PetscScalar    *values=NULL;
3085:   PetscBool      roworiented = ((Mat_SeqBAIJ*)B->data)->roworiented;

3089:   if (bs < 1) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Invalid block size specified, must be positive but it is %D",bs);
3090:   PetscLayoutSetBlockSize(B->rmap,bs);
3091:   PetscLayoutSetBlockSize(B->cmap,bs);
3092:   PetscLayoutSetUp(B->rmap);
3093:   PetscLayoutSetUp(B->cmap);
3094:   PetscLayoutGetBlockSize(B->rmap,&bs);
3095:   m    = B->rmap->n/bs;

3097:   if (ii[0] != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE, "ii[0] must be 0 but it is %D",ii[0]);
3098:   PetscMalloc1(m+1, &nnz);
3099:   for (i=0; i<m; i++) {
3100:     nz = ii[i+1]- ii[i];
3101:     if (nz < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE, "Local row %D has a negative number of columns %D",i,nz);
3102:     nz_max = PetscMax(nz_max, nz);
3103:     nnz[i] = nz;
3104:   }
3105:   MatSeqBAIJSetPreallocation(B,bs,0,nnz);
3106:   PetscFree(nnz);

3108:   values = (PetscScalar*)V;
3109:   if (!values) {
3110:     PetscCalloc1(bs*bs*(nz_max+1),&values);
3111:   }
3112:   for (i=0; i<m; i++) {
3113:     PetscInt          ncols  = ii[i+1] - ii[i];
3114:     const PetscInt    *icols = jj + ii[i];
3115:     if (bs == 1 || !roworiented) {
3116:       const PetscScalar *svals = values + (V ? (bs*bs*ii[i]) : 0);
3117:       MatSetValuesBlocked_SeqBAIJ(B,1,&i,ncols,icols,svals,INSERT_VALUES);
3118:     } else {
3119:       PetscInt j;
3120:       for (j=0; j<ncols; j++) {
3121:         const PetscScalar *svals = values + (V ? (bs*bs*(ii[i]+j)) : 0);
3122:         MatSetValuesBlocked_SeqBAIJ(B,1,&i,1,&icols[j],svals,INSERT_VALUES);
3123:       }
3124:     }
3125:   }
3126:   if (!V) { PetscFree(values); }
3127:   MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);
3128:   MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);
3129:   MatSetOption(B,MAT_NEW_NONZERO_LOCATION_ERR,PETSC_TRUE);
3130:   return(0);
3131: }

3133: /*@C
3134:    MatSeqBAIJGetArray - gives access to the array where the data for a MATSEQBAIJ matrix is stored

3136:    Not Collective

3138:    Input Parameter:
3139: .  mat - a MATSEQBAIJ matrix

3141:    Output Parameter:
3142: .   array - pointer to the data

3144:    Level: intermediate

3146: .seealso: MatSeqBAIJRestoreArray(), MatSeqAIJGetArray(), MatSeqAIJRestoreArray()
3147: @*/
3148: PetscErrorCode MatSeqBAIJGetArray(Mat A,PetscScalar **array)
3149: {

3153:   PetscUseMethod(A,"MatSeqBAIJGetArray_C",(Mat,PetscScalar**),(A,array));
3154:   return(0);
3155: }

3157: /*@C
3158:    MatSeqBAIJRestoreArray - returns access to the array where the data for a MATSEQBAIJ matrix is stored obtained by MatSeqBAIJGetArray()

3160:    Not Collective

3162:    Input Parameters:
3163: +  mat - a MATSEQBAIJ matrix
3164: -  array - pointer to the data

3166:    Level: intermediate

3168: .seealso: MatSeqBAIJGetArray(), MatSeqAIJGetArray(), MatSeqAIJRestoreArray()
3169: @*/
3170: PetscErrorCode MatSeqBAIJRestoreArray(Mat A,PetscScalar **array)
3171: {

3175:   PetscUseMethod(A,"MatSeqBAIJRestoreArray_C",(Mat,PetscScalar**),(A,array));
3176:   return(0);
3177: }

3179: /*MC
3180:    MATSEQBAIJ - MATSEQBAIJ = "seqbaij" - A matrix type to be used for sequential block sparse matrices, based on
3181:    block sparse compressed row format.

3183:    Options Database Keys:
3184: + -mat_type seqbaij - sets the matrix type to "seqbaij" during a call to MatSetFromOptions()
3185: - -mat_baij_mult_version version - indicate the version of the matrix-vector product to use (0 often indicates using BLAS)

3187:    Level: beginner

3189:    Notes:
3190:     MatSetOptions(,MAT_STRUCTURE_ONLY,PETSC_TRUE) may be called for this matrix type. In this no
3191:     space is allocated for the nonzero entries and any entries passed with MatSetValues() are ignored

3193:    Run with -info to see what version of the matrix-vector product is being used

3195: .seealso: MatCreateSeqBAIJ()
3196: M*/

3198: PETSC_INTERN PetscErrorCode MatConvert_SeqBAIJ_SeqBSTRM(Mat, MatType,MatReuse,Mat*);

3200: PETSC_EXTERN PetscErrorCode MatCreate_SeqBAIJ(Mat B)
3201: {
3203:   PetscMPIInt    size;
3204:   Mat_SeqBAIJ    *b;

3207:   MPI_Comm_size(PetscObjectComm((PetscObject)B),&size);
3208:   if (size > 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Comm must be of size 1");

3210:   PetscNewLog(B,&b);
3211:   B->data = (void*)b;
3212:   PetscMemcpy(B->ops,&MatOps_Values,sizeof(struct _MatOps));

3214:   b->row          = NULL;
3215:   b->col          = NULL;
3216:   b->icol         = NULL;
3217:   b->reallocs     = 0;
3218:   b->saved_values = NULL;

3220:   b->roworiented        = PETSC_TRUE;
3221:   b->nonew              = 0;
3222:   b->diag               = NULL;
3223:   B->spptr              = NULL;
3224:   B->info.nz_unneeded   = (PetscReal)b->maxnz*b->bs2;
3225:   b->keepnonzeropattern = PETSC_FALSE;

3227:   PetscObjectComposeFunction((PetscObject)B,"MatSeqBAIJGetArray_C",MatSeqBAIJGetArray_SeqBAIJ);
3228:   PetscObjectComposeFunction((PetscObject)B,"MatSeqBAIJRestoreArray_C",MatSeqBAIJRestoreArray_SeqBAIJ);
3229:   PetscObjectComposeFunction((PetscObject)B,"MatStoreValues_C",MatStoreValues_SeqBAIJ);
3230:   PetscObjectComposeFunction((PetscObject)B,"MatRetrieveValues_C",MatRetrieveValues_SeqBAIJ);
3231:   PetscObjectComposeFunction((PetscObject)B,"MatSeqBAIJSetColumnIndices_C",MatSeqBAIJSetColumnIndices_SeqBAIJ);
3232:   PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqbaij_seqaij_C",MatConvert_SeqBAIJ_SeqAIJ);
3233:   PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqbaij_seqsbaij_C",MatConvert_SeqBAIJ_SeqSBAIJ);
3234:   PetscObjectComposeFunction((PetscObject)B,"MatSeqBAIJSetPreallocation_C",MatSeqBAIJSetPreallocation_SeqBAIJ);
3235:   PetscObjectComposeFunction((PetscObject)B,"MatSeqBAIJSetPreallocationCSR_C",MatSeqBAIJSetPreallocationCSR_SeqBAIJ);
3236:   PetscObjectComposeFunction((PetscObject)B,"MatIsTranspose_C",MatIsTranspose_SeqBAIJ);
3237: #if defined(PETSC_HAVE_HYPRE)
3238:   PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqbaij_hypre_C",MatConvert_AIJ_HYPRE);
3239: #endif
3240:   PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqbaij_is_C",MatConvert_XAIJ_IS);
3241:   PetscObjectChangeTypeName((PetscObject)B,MATSEQBAIJ);
3242:   return(0);
3243: }

3245: PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat C,Mat A,MatDuplicateOption cpvalues,PetscBool mallocmatspace)
3246: {
3247:   Mat_SeqBAIJ    *c = (Mat_SeqBAIJ*)C->data,*a = (Mat_SeqBAIJ*)A->data;
3249:   PetscInt       i,mbs = a->mbs,nz = a->nz,bs2 = a->bs2;

3252:   if (a->i[mbs] != nz) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupt matrix");

3254:   if (cpvalues == MAT_SHARE_NONZERO_PATTERN) {
3255:     c->imax           = a->imax;
3256:     c->ilen           = a->ilen;
3257:     c->free_imax_ilen = PETSC_FALSE;
3258:   } else {
3259:     PetscMalloc2(mbs,&c->imax,mbs,&c->ilen);
3260:     PetscLogObjectMemory((PetscObject)C,2*mbs*sizeof(PetscInt));
3261:     for (i=0; i<mbs; i++) {
3262:       c->imax[i] = a->imax[i];
3263:       c->ilen[i] = a->ilen[i];
3264:     }
3265:     c->free_imax_ilen = PETSC_TRUE;
3266:   }

3268:   /* allocate the matrix space */
3269:   if (mallocmatspace) {
3270:     if (cpvalues == MAT_SHARE_NONZERO_PATTERN) {
3271:       PetscCalloc1(bs2*nz,&c->a);
3272:       PetscLogObjectMemory((PetscObject)C,a->i[mbs]*bs2*sizeof(PetscScalar));

3274:       c->i            = a->i;
3275:       c->j            = a->j;
3276:       c->singlemalloc = PETSC_FALSE;
3277:       c->free_a       = PETSC_TRUE;
3278:       c->free_ij      = PETSC_FALSE;
3279:       c->parent       = A;
3280:       C->preallocated = PETSC_TRUE;
3281:       C->assembled    = PETSC_TRUE;

3283:       PetscObjectReference((PetscObject)A);
3284:       MatSetOption(A,MAT_NEW_NONZERO_LOCATION_ERR,PETSC_TRUE);
3285:       MatSetOption(C,MAT_NEW_NONZERO_LOCATION_ERR,PETSC_TRUE);
3286:     } else {
3287:       PetscMalloc3(bs2*nz,&c->a,nz,&c->j,mbs+1,&c->i);
3288:       PetscLogObjectMemory((PetscObject)C,a->i[mbs]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(mbs+1)*sizeof(PetscInt));

3290:       c->singlemalloc = PETSC_TRUE;
3291:       c->free_a       = PETSC_TRUE;
3292:       c->free_ij      = PETSC_TRUE;

3294:       PetscArraycpy(c->i,a->i,mbs+1);
3295:       if (mbs > 0) {
3296:         PetscArraycpy(c->j,a->j,nz);
3297:         if (cpvalues == MAT_COPY_VALUES) {
3298:           PetscArraycpy(c->a,a->a,bs2*nz);
3299:         } else {
3300:           PetscArrayzero(c->a,bs2*nz);
3301:         }
3302:       }
3303:       C->preallocated = PETSC_TRUE;
3304:       C->assembled    = PETSC_TRUE;
3305:     }
3306:   }

3308:   c->roworiented = a->roworiented;
3309:   c->nonew       = a->nonew;

3311:   PetscLayoutReference(A->rmap,&C->rmap);
3312:   PetscLayoutReference(A->cmap,&C->cmap);

3314:   c->bs2         = a->bs2;
3315:   c->mbs         = a->mbs;
3316:   c->nbs         = a->nbs;

3318:   if (a->diag) {
3319:     if (cpvalues == MAT_SHARE_NONZERO_PATTERN) {
3320:       c->diag      = a->diag;
3321:       c->free_diag = PETSC_FALSE;
3322:     } else {
3323:       PetscMalloc1(mbs+1,&c->diag);
3324:       PetscLogObjectMemory((PetscObject)C,(mbs+1)*sizeof(PetscInt));
3325:       for (i=0; i<mbs; i++) c->diag[i] = a->diag[i];
3326:       c->free_diag = PETSC_TRUE;
3327:     }
3328:   } else c->diag = NULL;

3330:   c->nz         = a->nz;
3331:   c->maxnz      = a->nz;         /* Since we allocate exactly the right amount */
3332:   c->solve_work = NULL;
3333:   c->mult_work  = NULL;
3334:   c->sor_workt  = NULL;
3335:   c->sor_work   = NULL;

3337:   c->compressedrow.use   = a->compressedrow.use;
3338:   c->compressedrow.nrows = a->compressedrow.nrows;
3339:   if (a->compressedrow.use) {
3340:     i    = a->compressedrow.nrows;
3341:     PetscMalloc2(i+1,&c->compressedrow.i,i+1,&c->compressedrow.rindex);
3342:     PetscLogObjectMemory((PetscObject)C,(2*i+1)*sizeof(PetscInt));
3343:     PetscArraycpy(c->compressedrow.i,a->compressedrow.i,i+1);
3344:     PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,i);
3345:   } else {
3346:     c->compressedrow.use    = PETSC_FALSE;
3347:     c->compressedrow.i      = NULL;
3348:     c->compressedrow.rindex = NULL;
3349:   }
3350:   C->nonzerostate = A->nonzerostate;

3352:   PetscFunctionListDuplicate(((PetscObject)A)->qlist,&((PetscObject)C)->qlist);
3353:   return(0);
3354: }

3356: PetscErrorCode MatDuplicate_SeqBAIJ(Mat A,MatDuplicateOption cpvalues,Mat *B)
3357: {

3361:   MatCreate(PetscObjectComm((PetscObject)A),B);
3362:   MatSetSizes(*B,A->rmap->N,A->cmap->n,A->rmap->N,A->cmap->n);
3363:   MatSetType(*B,MATSEQBAIJ);
3364:   MatDuplicateNoCreate_SeqBAIJ(*B,A,cpvalues,PETSC_TRUE);
3365:   return(0);
3366: }

3368: /* Used for both SeqBAIJ and SeqSBAIJ matrices */
3369: PetscErrorCode MatLoad_SeqBAIJ_Binary(Mat mat,PetscViewer viewer)
3370: {
3371:   PetscInt       header[4],M,N,nz,bs,m,n,mbs,nbs,rows,cols,sum,i,j,k;
3372:   PetscInt       *rowidxs,*colidxs;
3373:   PetscScalar    *matvals;

3377:   PetscViewerSetUp(viewer);

3379:   /* read matrix header */
3380:   PetscViewerBinaryRead(viewer,header,4,NULL,PETSC_INT);
3381:   if (header[0] != MAT_FILE_CLASSID) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_FILE_UNEXPECTED,"Not a matrix object in file");
3382:   M = header[1]; N = header[2]; nz = header[3];
3383:   if (M < 0) SETERRQ1(PetscObjectComm((PetscObject)viewer),PETSC_ERR_FILE_UNEXPECTED,"Matrix row size (%D) in file is negative",M);
3384:   if (N < 0) SETERRQ1(PetscObjectComm((PetscObject)viewer),PETSC_ERR_FILE_UNEXPECTED,"Matrix column size (%D) in file is negative",N);
3385:   if (nz < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_FILE_UNEXPECTED,"Matrix stored in special format on disk, cannot load as SeqBAIJ");

3387:   /* set block sizes from the viewer's .info file */
3388:   MatLoad_Binary_BlockSizes(mat,viewer);
3389:   /* set local and global sizes if not set already */
3390:   if (mat->rmap->n < 0) mat->rmap->n = M;
3391:   if (mat->cmap->n < 0) mat->cmap->n = N;
3392:   if (mat->rmap->N < 0) mat->rmap->N = M;
3393:   if (mat->cmap->N < 0) mat->cmap->N = N;
3394:   PetscLayoutSetUp(mat->rmap);
3395:   PetscLayoutSetUp(mat->cmap);

3397:   /* check if the matrix sizes are correct */
3398:   MatGetSize(mat,&rows,&cols);
3399:   if (M != rows || N != cols) SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_FILE_UNEXPECTED, "Matrix in file of different sizes (%D, %D) than the input matrix (%D, %D)",M,N,rows,cols);
3400:   MatGetBlockSize(mat,&bs);
3401:   MatGetLocalSize(mat,&m,&n);
3402:   mbs = m/bs; nbs = n/bs;

3404:   /* read in row lengths, column indices and nonzero values */
3405:   PetscMalloc1(m+1,&rowidxs);
3406:   PetscViewerBinaryRead(viewer,rowidxs+1,m,NULL,PETSC_INT);
3407:   rowidxs[0] = 0; for (i=0; i<m; i++) rowidxs[i+1] += rowidxs[i];
3408:   sum = rowidxs[m];
3409:   if (sum != nz) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_FILE_UNEXPECTED,"Inconsistent matrix data in file: nonzeros = %D, sum-row-lengths = %D\n",nz,sum);

3411:   /* read in column indices and nonzero values */
3412:   PetscMalloc2(rowidxs[m],&colidxs,nz,&matvals);
3413:   PetscViewerBinaryRead(viewer,colidxs,rowidxs[m],NULL,PETSC_INT);
3414:   PetscViewerBinaryRead(viewer,matvals,rowidxs[m],NULL,PETSC_SCALAR);

3416:   { /* preallocate matrix storage */
3417:     PetscBT   bt; /* helper bit set to count nonzeros */
3418:     PetscInt  *nnz;
3419:     PetscBool sbaij;

3421:     PetscBTCreate(nbs,&bt);
3422:     PetscCalloc1(mbs,&nnz);
3423:     PetscObjectTypeCompare((PetscObject)mat,MATSEQSBAIJ,&sbaij);
3424:     for (i=0; i<mbs; i++) {
3425:       PetscBTMemzero(nbs,bt);
3426:       for (k=0; k<bs; k++) {
3427:         PetscInt row = bs*i + k;
3428:         for (j=rowidxs[row]; j<rowidxs[row+1]; j++) {
3429:           PetscInt col = colidxs[j];
3430:           if (!sbaij || col >= row)
3431:             if (!PetscBTLookupSet(bt,col/bs)) nnz[i]++;
3432:         }
3433:       }
3434:     }
3435:     PetscBTDestroy(&bt);
3436:     MatSeqBAIJSetPreallocation(mat,bs,0,nnz);
3437:     MatSeqSBAIJSetPreallocation(mat,bs,0,nnz);
3438:     PetscFree(nnz);
3439:   }

3441:   /* store matrix values */
3442:   for (i=0; i<m; i++) {
3443:     PetscInt row = i, s = rowidxs[i], e = rowidxs[i+1];
3444:     (*mat->ops->setvalues)(mat,1,&row,e-s,colidxs+s,matvals+s,INSERT_VALUES);
3445:   }

3447:   PetscFree(rowidxs);
3448:   PetscFree2(colidxs,matvals);
3449:   MatAssemblyBegin(mat,MAT_FINAL_ASSEMBLY);
3450:   MatAssemblyEnd(mat,MAT_FINAL_ASSEMBLY);
3451:   return(0);
3452: }

3454: PetscErrorCode MatLoad_SeqBAIJ(Mat mat,PetscViewer viewer)
3455: {
3457:   PetscBool      isbinary;

3460:   PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERBINARY,&isbinary);
3461:   if (!isbinary) SETERRQ2(PetscObjectComm((PetscObject)viewer),PETSC_ERR_SUP,"Viewer type %s not yet supported for reading %s matrices",((PetscObject)viewer)->type_name,((PetscObject)mat)->type_name);
3462:   MatLoad_SeqBAIJ_Binary(mat,viewer);
3463:   return(0);
3464: }

3466: /*@C
3467:    MatCreateSeqBAIJ - Creates a sparse matrix in block AIJ (block
3468:    compressed row) format.  For good matrix assembly performance the
3469:    user should preallocate the matrix storage by setting the parameter nz
3470:    (or the array nnz).  By setting these parameters accurately, performance
3471:    during matrix assembly can be increased by more than a factor of 50.

3473:    Collective

3475:    Input Parameters:
3476: +  comm - MPI communicator, set to PETSC_COMM_SELF
3477: .  bs - size of block, the blocks are ALWAYS square. One can use MatSetBlockSizes() to set a different row and column blocksize but the row
3478:           blocksize always defines the size of the blocks. The column blocksize sets the blocksize of the vectors obtained with MatCreateVecs()
3479: .  m - number of rows
3480: .  n - number of columns
3481: .  nz - number of nonzero blocks  per block row (same for all rows)
3482: -  nnz - array containing the number of nonzero blocks in the various block rows
3483:          (possibly different for each block row) or NULL

3485:    Output Parameter:
3486: .  A - the matrix

3488:    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
3489:    MatXXXXSetPreallocation() paradigm instead of this routine directly.
3490:    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

3492:    Options Database Keys:
3493: +   -mat_no_unroll - uses code that does not unroll the loops in the
3494:                      block calculations (much slower)
3495: -    -mat_block_size - size of the blocks to use

3497:    Level: intermediate

3499:    Notes:
3500:    The number of rows and columns must be divisible by blocksize.

3502:    If the nnz parameter is given then the nz parameter is ignored

3504:    A nonzero block is any block that as 1 or more nonzeros in it

3506:    The block AIJ format is fully compatible with standard Fortran 77
3507:    storage.  That is, the stored row and column indices can begin at
3508:    either one (as in Fortran) or zero.  See the users' manual for details.

3510:    Specify the preallocated storage with either nz or nnz (not both).
3511:    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
3512:    allocation.  See Users-Manual: ch_mat for details.
3513:    matrices.

3515: .seealso: MatCreate(), MatCreateSeqAIJ(), MatSetValues(), MatCreateBAIJ()
3516: @*/
3517: PetscErrorCode  MatCreateSeqBAIJ(MPI_Comm comm,PetscInt bs,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
3518: {

3522:   MatCreate(comm,A);
3523:   MatSetSizes(*A,m,n,m,n);
3524:   MatSetType(*A,MATSEQBAIJ);
3525:   MatSeqBAIJSetPreallocation(*A,bs,nz,(PetscInt*)nnz);
3526:   return(0);
3527: }

3529: /*@C
3530:    MatSeqBAIJSetPreallocation - Sets the block size and expected nonzeros
3531:    per row in the matrix. For good matrix assembly performance the
3532:    user should preallocate the matrix storage by setting the parameter nz
3533:    (or the array nnz).  By setting these parameters accurately, performance
3534:    during matrix assembly can be increased by more than a factor of 50.

3536:    Collective

3538:    Input Parameters:
3539: +  B - the matrix
3540: .  bs - size of block, the blocks are ALWAYS square. One can use MatSetBlockSizes() to set a different row and column blocksize but the row
3541:           blocksize always defines the size of the blocks. The column blocksize sets the blocksize of the vectors obtained with MatCreateVecs()
3542: .  nz - number of block nonzeros per block row (same for all rows)
3543: -  nnz - array containing the number of block nonzeros in the various block rows
3544:          (possibly different for each block row) or NULL

3546:    Options Database Keys:
3547: +   -mat_no_unroll - uses code that does not unroll the loops in the
3548:                      block calculations (much slower)
3549: -   -mat_block_size - size of the blocks to use

3551:    Level: intermediate

3553:    Notes:
3554:    If the nnz parameter is given then the nz parameter is ignored

3556:    You can call MatGetInfo() to get information on how effective the preallocation was;
3557:    for example the fields mallocs,nz_allocated,nz_used,nz_unneeded;
3558:    You can also run with the option -info and look for messages with the string
3559:    malloc in them to see if additional memory allocation was needed.

3561:    The block AIJ format is fully compatible with standard Fortran 77
3562:    storage.  That is, the stored row and column indices can begin at
3563:    either one (as in Fortran) or zero.  See the users' manual for details.

3565:    Specify the preallocated storage with either nz or nnz (not both).
3566:    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
3567:    allocation.  See Users-Manual: ch_mat for details.

3569: .seealso: MatCreate(), MatCreateSeqAIJ(), MatSetValues(), MatCreateBAIJ(), MatGetInfo()
3570: @*/
3571: PetscErrorCode  MatSeqBAIJSetPreallocation(Mat B,PetscInt bs,PetscInt nz,const PetscInt nnz[])
3572: {

3579:   PetscTryMethod(B,"MatSeqBAIJSetPreallocation_C",(Mat,PetscInt,PetscInt,const PetscInt[]),(B,bs,nz,nnz));
3580:   return(0);
3581: }

3583: /*@C
3584:    MatSeqBAIJSetPreallocationCSR - Creates a sparse parallel matrix in BAIJ format using the given nonzero structure and (optional) numerical values

3586:    Collective

3588:    Input Parameters:
3589: +  B - the matrix
3590: .  i - the indices into j for the start of each local row (starts with zero)
3591: .  j - the column indices for each local row (starts with zero) these must be sorted for each row
3592: -  v - optional values in the matrix

3594:    Level: advanced

3596:    Notes:
3597:    The order of the entries in values is specified by the MatOption MAT_ROW_ORIENTED.  For example, C programs
3598:    may want to use the default MAT_ROW_ORIENTED=PETSC_TRUE and use an array v[nnz][bs][bs] where the second index is
3599:    over rows within a block and the last index is over columns within a block row.  Fortran programs will likely set
3600:    MAT_ROW_ORIENTED=PETSC_FALSE and use a Fortran array v(bs,bs,nnz) in which the first index is over rows within a
3601:    block column and the second index is over columns within a block.

3603:    Though this routine has Preallocation() in the name it also sets the exact nonzero locations of the matrix entries and usually the numerical values as well

3605: .seealso: MatCreate(), MatCreateSeqBAIJ(), MatSetValues(), MatSeqBAIJSetPreallocation(), MATSEQBAIJ
3606: @*/
3607: PetscErrorCode  MatSeqBAIJSetPreallocationCSR(Mat B,PetscInt bs,const PetscInt i[],const PetscInt j[], const PetscScalar v[])
3608: {

3615:   PetscTryMethod(B,"MatSeqBAIJSetPreallocationCSR_C",(Mat,PetscInt,const PetscInt[],const PetscInt[],const PetscScalar[]),(B,bs,i,j,v));
3616:   return(0);
3617: }

3619: /*@
3620:      MatCreateSeqBAIJWithArrays - Creates an sequential BAIJ matrix using matrix elements provided by the user.

3622:      Collective

3624:    Input Parameters:
3625: +  comm - must be an MPI communicator of size 1
3626: .  bs - size of block
3627: .  m - number of rows
3628: .  n - number of columns
3629: .  i - row indices; that is i[0] = 0, i[row] = i[row-1] + number of elements in that row block row of the matrix
3630: .  j - column indices
3631: -  a - matrix values

3633:    Output Parameter:
3634: .  mat - the matrix

3636:    Level: advanced

3638:    Notes:
3639:        The i, j, and a arrays are not copied by this routine, the user must free these arrays
3640:     once the matrix is destroyed

3642:        You cannot set new nonzero locations into this matrix, that will generate an error.

3644:        The i and j indices are 0 based

3646:        When block size is greater than 1 the matrix values must be stored using the BAIJ storage format (see the BAIJ code to determine this).

3648:       The order of the entries in values is the same as the block compressed sparse row storage format; that is, it is
3649:       the same as a three dimensional array in Fortran values(bs,bs,nnz) that contains the first column of the first
3650:       block, followed by the second column of the first block etc etc.  That is, the blocks are contiguous in memory
3651:       with column-major ordering within blocks.

3653: .seealso: MatCreate(), MatCreateBAIJ(), MatCreateSeqBAIJ()

3655: @*/
3656: PetscErrorCode  MatCreateSeqBAIJWithArrays(MPI_Comm comm,PetscInt bs,PetscInt m,PetscInt n,PetscInt i[],PetscInt j[],PetscScalar a[],Mat *mat)
3657: {
3659:   PetscInt       ii;
3660:   Mat_SeqBAIJ    *baij;

3663:   if (bs != 1) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size %D > 1 is not supported yet",bs);
3664:   if (m > 0 && i[0]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"i (row indices) must start with 0");

3666:   MatCreate(comm,mat);
3667:   MatSetSizes(*mat,m,n,m,n);
3668:   MatSetType(*mat,MATSEQBAIJ);
3669:   MatSeqBAIJSetPreallocation(*mat,bs,MAT_SKIP_ALLOCATION,NULL);
3670:   baij = (Mat_SeqBAIJ*)(*mat)->data;
3671:   PetscMalloc2(m,&baij->imax,m,&baij->ilen);
3672:   PetscLogObjectMemory((PetscObject)*mat,2*m*sizeof(PetscInt));

3674:   baij->i = i;
3675:   baij->j = j;
3676:   baij->a = a;

3678:   baij->singlemalloc = PETSC_FALSE;
3679:   baij->nonew        = -1;             /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
3680:   baij->free_a       = PETSC_FALSE;
3681:   baij->free_ij      = PETSC_FALSE;

3683:   for (ii=0; ii<m; ii++) {
3684:     baij->ilen[ii] = baij->imax[ii] = i[ii+1] - i[ii];
3685:     if (PetscUnlikelyDebug(i[ii+1] - i[ii] < 0)) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative row length in i (row indices) row = %d length = %d",ii,i[ii+1] - i[ii]);
3686:   }
3687:   if (PetscDefined(USE_DEBUG)) {
3688:     for (ii=0; ii<baij->i[m]; ii++) {
3689:       if (j[ii] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative column index at location = %d index = %d",ii,j[ii]);
3690:       if (j[ii] > n - 1) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Column index to large at location = %d index = %d",ii,j[ii]);
3691:     }
3692:   }

3694:   MatAssemblyBegin(*mat,MAT_FINAL_ASSEMBLY);
3695:   MatAssemblyEnd(*mat,MAT_FINAL_ASSEMBLY);
3696:   return(0);
3697: }

3699: PetscErrorCode MatCreateMPIMatConcatenateSeqMat_SeqBAIJ(MPI_Comm comm,Mat inmat,PetscInt n,MatReuse scall,Mat *outmat)
3700: {
3702:   PetscMPIInt    size;

3705:   MPI_Comm_size(comm,&size);
3706:   if (size == 1 && scall == MAT_REUSE_MATRIX) {
3707:     MatCopy(inmat,*outmat,SAME_NONZERO_PATTERN);
3708:   } else {
3709:     MatCreateMPIMatConcatenateSeqMat_MPIBAIJ(comm,inmat,n,scall,outmat);
3710:   }
3711:   return(0);
3712: }