Actual source code: sbaijfact2.c


  2: /*
  3:     Factorization code for SBAIJ format.
  4: */

  6: #include <../src/mat/impls/sbaij/seq/sbaij.h>
  7: #include <../src/mat/impls/baij/seq/baij.h>
  8: #include <petsc/private/kernels/blockinvert.h>

 10: PetscErrorCode MatSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
 11: {
 12:   Mat_SeqSBAIJ      *a   =(Mat_SeqSBAIJ*)A->data;
 13:   IS                isrow=a->row;
 14:   PetscInt          mbs  =a->mbs,*ai=a->i,*aj=a->j;
 15:   PetscErrorCode    ierr;
 16:   const PetscInt    *r;
 17:   PetscInt          nz,*vj,k,idx,k1;
 18:   PetscInt          bs =A->rmap->bs,bs2 = a->bs2;
 19:   const MatScalar   *aa=a->a,*v,*diag;
 20:   PetscScalar       *x,*xk,*xj,*xk_tmp,*t;
 21:   const PetscScalar *b;

 24:   VecGetArrayRead(bb,&b);
 25:   VecGetArray(xx,&x);
 26:   t    = a->solve_work;
 27:   ISGetIndices(isrow,&r);
 28:   PetscMalloc1(bs,&xk_tmp);

 30:   /* solve U^T * D * y = b by forward substitution */
 31:   xk = t;
 32:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
 33:     idx = bs*r[k];
 34:     for (k1=0; k1<bs; k1++) *xk++ = b[idx+k1];
 35:   }
 36:   for (k=0; k<mbs; k++) {
 37:     v    = aa + bs2*ai[k];
 38:     xk   = t + k*bs;    /* Dk*xk = k-th block of x */
 39:     PetscArraycpy(xk_tmp,xk,bs); /* xk_tmp <- xk */
 40:     nz   = ai[k+1] - ai[k];
 41:     vj   = aj + ai[k];
 42:     xj   = t + (*vj)*bs; /* *vj-th block of x, *vj>k */
 43:     while (nz--) {
 44:       /* x(:) += U(k,:)^T*(Dk*xk) */
 45:       PetscKernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
 46:       vj++; xj = t + (*vj)*bs;
 47:       v       += bs2;
 48:     }
 49:     /* xk = inv(Dk)*(Dk*xk) */
 50:     diag = aa+k*bs2;                            /* ptr to inv(Dk) */
 51:     PetscKernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
 52:   }

 54:   /* solve U*x = y by back substitution */
 55:   for (k=mbs-1; k>=0; k--) {
 56:     v  = aa + bs2*ai[k];
 57:     xk = t + k*bs;        /* xk */
 58:     nz = ai[k+1] - ai[k];
 59:     vj = aj + ai[k];
 60:     xj = t + (*vj)*bs;
 61:     while (nz--) {
 62:       /* xk += U(k,:)*x(:) */
 63:       PetscKernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
 64:       vj++;
 65:       v += bs2; xj = t + (*vj)*bs;
 66:     }
 67:     idx = bs*r[k];
 68:     for (k1=0; k1<bs; k1++) x[idx+k1] = *xk++;
 69:   }

 71:   PetscFree(xk_tmp);
 72:   ISRestoreIndices(isrow,&r);
 73:   VecRestoreArrayRead(bb,&b);
 74:   VecRestoreArray(xx,&x);
 75:   PetscLogFlops(4.0*bs2*a->nz -(bs+2.0*bs2)*mbs);
 76:   return(0);
 77: }

 79: PetscErrorCode MatForwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
 80: {
 82:   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"not implemented yet");
 83: }

 85: PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
 86: {
 88:   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not yet implemented");
 89: }

 91: PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscInt bs,PetscScalar *x)
 92: {
 93:   PetscErrorCode  ierr;
 94:   PetscInt        nz,k;
 95:   const PetscInt  *vj,bs2 = bs*bs;
 96:   const MatScalar *v,*diag;
 97:   PetscScalar     *xk,*xj,*xk_tmp;

100:   PetscMalloc1(bs,&xk_tmp);
101:   for (k=0; k<mbs; k++) {
102:     v    = aa + bs2*ai[k];
103:     xk   = x + k*bs;    /* Dk*xk = k-th block of x */
104:     PetscArraycpy(xk_tmp,xk,bs); /* xk_tmp <- xk */
105:     nz   = ai[k+1] - ai[k];
106:     vj   = aj + ai[k];
107:     xj   = x + (*vj)*bs; /* *vj-th block of x, *vj>k */
108:     while (nz--) {
109:       /* x(:) += U(k,:)^T*(Dk*xk) */
110:       PetscKernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
111:       vj++; xj = x + (*vj)*bs;
112:       v       += bs2;
113:     }
114:     /* xk = inv(Dk)*(Dk*xk) */
115:     diag = aa+k*bs2;                            /* ptr to inv(Dk) */
116:     PetscKernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
117:   }
118:   PetscFree(xk_tmp);
119:   return(0);
120: }

122: PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscInt bs,PetscScalar *x)
123: {
124:   PetscInt        nz,k;
125:   const PetscInt  *vj,bs2 = bs*bs;
126:   const MatScalar *v;
127:   PetscScalar     *xk,*xj;

130:   for (k=mbs-1; k>=0; k--) {
131:     v  = aa + bs2*ai[k];
132:     xk = x + k*bs;        /* xk */
133:     nz = ai[k+1] - ai[k];
134:     vj = aj + ai[k];
135:     xj = x + (*vj)*bs;
136:     while (nz--) {
137:       /* xk += U(k,:)*x(:) */
138:       PetscKernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
139:       vj++;
140:       v += bs2; xj = x + (*vj)*bs;
141:     }
142:   }
143:   return(0);
144: }

146: PetscErrorCode MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
147: {
148:   Mat_SeqSBAIJ      *a=(Mat_SeqSBAIJ*)A->data;
149:   PetscErrorCode    ierr;
150:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
151:   PetscInt          bs =A->rmap->bs;
152:   const MatScalar   *aa=a->a;
153:   PetscScalar       *x;
154:   const PetscScalar *b;

157:   VecGetArrayRead(bb,&b);
158:   VecGetArray(xx,&x);

160:   /* solve U^T * D * y = b by forward substitution */
161:   PetscArraycpy(x,b,bs*mbs); /* x <- b */
162:   MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);

164:   /* solve U*x = y by back substitution */
165:   MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);

167:   VecRestoreArrayRead(bb,&b);
168:   VecRestoreArray(xx,&x);
169:   PetscLogFlops(4.0*a->bs2*a->nz - (bs+2.0*a->bs2)*mbs);
170:   return(0);
171: }

173: PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
174: {
175:   Mat_SeqSBAIJ      *a=(Mat_SeqSBAIJ*)A->data;
176:   PetscErrorCode    ierr;
177:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
178:   PetscInt          bs =A->rmap->bs;
179:   const MatScalar   *aa=a->a;
180:   const PetscScalar *b;
181:   PetscScalar       *x;

184:   VecGetArrayRead(bb,&b);
185:   VecGetArray(xx,&x);
186:   PetscArraycpy(x,b,bs*mbs); /* x <- b */
187:   MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);
188:   VecRestoreArrayRead(bb,&b);
189:   VecRestoreArray(xx,&x);
190:   PetscLogFlops(2.0*a->bs2*a->nz - bs*mbs);
191:   return(0);
192: }

194: PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
195: {
196:   Mat_SeqSBAIJ      *a=(Mat_SeqSBAIJ*)A->data;
197:   PetscErrorCode    ierr;
198:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
199:   PetscInt          bs =A->rmap->bs;
200:   const MatScalar   *aa=a->a;
201:   const PetscScalar *b;
202:   PetscScalar       *x;

205:   VecGetArrayRead(bb,&b);
206:   VecGetArray(xx,&x);
207:   PetscArraycpy(x,b,bs*mbs);
208:   MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);
209:   VecRestoreArrayRead(bb,&b);
210:   VecRestoreArray(xx,&x);
211:   PetscLogFlops(2.0*a->bs2*(a->nz-mbs));
212:   return(0);
213: }

215: PetscErrorCode MatSolve_SeqSBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
216: {
217:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
218:   IS                isrow=a->row;
219:   const PetscInt    mbs  =a->mbs,*ai=a->i,*aj=a->j,*r,*vj;
220:   PetscErrorCode    ierr;
221:   PetscInt          nz,k,idx;
222:   const MatScalar   *aa=a->a,*v,*d;
223:   PetscScalar       *x,x0,x1,x2,x3,x4,x5,x6,*t,*tp;
224:   const PetscScalar *b;

227:   VecGetArrayRead(bb,&b);
228:   VecGetArray(xx,&x);
229:   t    = a->solve_work;
230:   ISGetIndices(isrow,&r);

232:   /* solve U^T * D * y = b by forward substitution */
233:   tp = t;
234:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
235:     idx   = 7*r[k];
236:     tp[0] = b[idx];
237:     tp[1] = b[idx+1];
238:     tp[2] = b[idx+2];
239:     tp[3] = b[idx+3];
240:     tp[4] = b[idx+4];
241:     tp[5] = b[idx+5];
242:     tp[6] = b[idx+6];
243:     tp   += 7;
244:   }

246:   for (k=0; k<mbs; k++) {
247:     v  = aa + 49*ai[k];
248:     vj = aj + ai[k];
249:     tp = t + k*7;
250:     x0 =tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; x6=tp[6];
251:     nz = ai[k+1] - ai[k];
252:     tp = t + (*vj)*7;
253:     while (nz--) {
254:       tp[0]+=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
255:       tp[1]+=  v[7]*x0 +  v[8]*x1 +  v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
256:       tp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
257:       tp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
258:       tp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
259:       tp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
260:       tp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
261:       vj++;
262:       tp = t + (*vj)*7;
263:       v += 49;
264:     }

266:     /* xk = inv(Dk)*(Dk*xk) */
267:     d     = aa+k*49;       /* ptr to inv(Dk) */
268:     tp    = t + k*7;
269:     tp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
270:     tp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
271:     tp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
272:     tp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
273:     tp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
274:     tp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
275:     tp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
276:   }

278:   /* solve U*x = y by back substitution */
279:   for (k=mbs-1; k>=0; k--) {
280:     v  = aa + 49*ai[k];
281:     vj = aj + ai[k];
282:     tp = t + k*7;
283:     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5];  x6=tp[6]; /* xk */
284:     nz = ai[k+1] - ai[k];

286:     tp = t + (*vj)*7;
287:     while (nz--) {
288:       /* xk += U(k,:)*x(:) */
289:       x0 += v[0]*tp[0] + v[7]*tp[1] + v[14]*tp[2] + v[21]*tp[3] + v[28]*tp[4] + v[35]*tp[5] + v[42]*tp[6];
290:       x1 += v[1]*tp[0] + v[8]*tp[1] + v[15]*tp[2] + v[22]*tp[3] + v[29]*tp[4] + v[36]*tp[5] + v[43]*tp[6];
291:       x2 += v[2]*tp[0] + v[9]*tp[1] + v[16]*tp[2] + v[23]*tp[3] + v[30]*tp[4] + v[37]*tp[5] + v[44]*tp[6];
292:       x3 += v[3]*tp[0]+ v[10]*tp[1] + v[17]*tp[2] + v[24]*tp[3] + v[31]*tp[4] + v[38]*tp[5] + v[45]*tp[6];
293:       x4 += v[4]*tp[0]+ v[11]*tp[1] + v[18]*tp[2] + v[25]*tp[3] + v[32]*tp[4] + v[39]*tp[5] + v[46]*tp[6];
294:       x5 += v[5]*tp[0]+ v[12]*tp[1] + v[19]*tp[2] + v[26]*tp[3] + v[33]*tp[4] + v[40]*tp[5] + v[47]*tp[6];
295:       x6 += v[6]*tp[0]+ v[13]*tp[1] + v[20]*tp[2] + v[27]*tp[3] + v[34]*tp[4] + v[41]*tp[5] + v[48]*tp[6];
296:       vj++;
297:       tp = t + (*vj)*7;
298:       v += 49;
299:     }
300:     tp       = t + k*7;
301:     tp[0]    = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5; tp[6]=x6;
302:     idx      = 7*r[k];
303:     x[idx]   = x0;
304:     x[idx+1] = x1;
305:     x[idx+2] = x2;
306:     x[idx+3] = x3;
307:     x[idx+4] = x4;
308:     x[idx+5] = x5;
309:     x[idx+6] = x6;
310:   }

312:   ISRestoreIndices(isrow,&r);
313:   VecRestoreArrayRead(bb,&b);
314:   VecRestoreArray(xx,&x);
315:   PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
316:   return(0);
317: }

319: PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
320: {
321:   const MatScalar *v,*d;
322:   PetscScalar     *xp,x0,x1,x2,x3,x4,x5,x6;
323:   PetscInt        nz,k;
324:   const PetscInt  *vj;

327:   for (k=0; k<mbs; k++) {
328:     v  = aa + 49*ai[k];
329:     xp = x + k*7;
330:     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* Dk*xk = k-th block of x */
331:     nz = ai[k+1] - ai[k];
332:     vj = aj + ai[k];
333:     xp = x + (*vj)*7;
334:     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
335:     PetscPrefetchBlock(v+49*nz,49*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
336:     while (nz--) {
337:       /* x(:) += U(k,:)^T*(Dk*xk) */
338:       xp[0]+=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
339:       xp[1]+=  v[7]*x0 +  v[8]*x1 +  v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
340:       xp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
341:       xp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
342:       xp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
343:       xp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
344:       xp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
345:       vj++;
346:       xp = x + (*vj)*7;
347:       v += 49;
348:     }
349:     /* xk = inv(Dk)*(Dk*xk) */
350:     d     = aa+k*49;       /* ptr to inv(Dk) */
351:     xp    = x + k*7;
352:     xp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
353:     xp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
354:     xp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
355:     xp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
356:     xp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
357:     xp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
358:     xp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
359:   }
360:   return(0);
361: }

363: PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
364: {
365:   const MatScalar *v;
366:   PetscScalar     *xp,x0,x1,x2,x3,x4,x5,x6;
367:   PetscInt        nz,k;
368:   const PetscInt  *vj;

371:   for (k=mbs-1; k>=0; k--) {
372:     v  = aa + 49*ai[k];
373:     xp = x + k*7;
374:     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* xk */
375:     nz = ai[k+1] - ai[k];
376:     vj = aj + ai[k];
377:     xp = x + (*vj)*7;
378:     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
379:     PetscPrefetchBlock(v-49*nz,49*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
380:     while (nz--) {
381:       /* xk += U(k,:)*x(:) */
382:       x0 += v[0]*xp[0] + v[7]*xp[1] + v[14]*xp[2] + v[21]*xp[3] + v[28]*xp[4] + v[35]*xp[5] + v[42]*xp[6];
383:       x1 += v[1]*xp[0] + v[8]*xp[1] + v[15]*xp[2] + v[22]*xp[3] + v[29]*xp[4] + v[36]*xp[5] + v[43]*xp[6];
384:       x2 += v[2]*xp[0] + v[9]*xp[1] + v[16]*xp[2] + v[23]*xp[3] + v[30]*xp[4] + v[37]*xp[5] + v[44]*xp[6];
385:       x3 += v[3]*xp[0]+ v[10]*xp[1] + v[17]*xp[2] + v[24]*xp[3] + v[31]*xp[4] + v[38]*xp[5] + v[45]*xp[6];
386:       x4 += v[4]*xp[0]+ v[11]*xp[1] + v[18]*xp[2] + v[25]*xp[3] + v[32]*xp[4] + v[39]*xp[5] + v[46]*xp[6];
387:       x5 += v[5]*xp[0]+ v[12]*xp[1] + v[19]*xp[2] + v[26]*xp[3] + v[33]*xp[4] + v[40]*xp[5] + v[47]*xp[6];
388:       x6 += v[6]*xp[0]+ v[13]*xp[1] + v[20]*xp[2] + v[27]*xp[3] + v[34]*xp[4] + v[41]*xp[5] + v[48]*xp[6];
389:       vj++;
390:       v += 49; xp = x + (*vj)*7;
391:     }
392:     xp   = x + k*7;
393:     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5; xp[6]=x6;
394:   }
395:   return(0);
396: }

398: PetscErrorCode MatSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
399: {
400:   Mat_SeqSBAIJ      *a=(Mat_SeqSBAIJ*)A->data;
401:   PetscErrorCode    ierr;
402:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
403:   const MatScalar   *aa=a->a;
404:   PetscScalar       *x;
405:   const PetscScalar *b;

408:   VecGetArrayRead(bb,&b);
409:   VecGetArray(xx,&x);

411:   /* solve U^T * D * y = b by forward substitution */
412:   PetscArraycpy(x,b,7*mbs); /* x <- b */
413:   MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);

415:   /* solve U*x = y by back substitution */
416:   MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);

418:   VecRestoreArrayRead(bb,&b);
419:   VecRestoreArray(xx,&x);
420:   PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
421:   return(0);
422: }

424: PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
425: {
426:   Mat_SeqSBAIJ      *a=(Mat_SeqSBAIJ*)A->data;
427:   PetscErrorCode    ierr;
428:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
429:   const MatScalar   *aa=a->a;
430:   PetscScalar       *x;
431:   const PetscScalar *b;

434:   VecGetArrayRead(bb,&b);
435:   VecGetArray(xx,&x);
436:   PetscArraycpy(x,b,7*mbs);
437:   MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);
438:   VecRestoreArrayRead(bb,&b);
439:   VecRestoreArray(xx,&x);
440:   PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);
441:   return(0);
442: }

444: PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
445: {
446:   Mat_SeqSBAIJ      *a=(Mat_SeqSBAIJ*)A->data;
447:   PetscErrorCode    ierr;
448:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
449:   const MatScalar   *aa=a->a;
450:   PetscScalar       *x;
451:   const PetscScalar *b;

454:   VecGetArrayRead(bb,&b);
455:   VecGetArray(xx,&x);
456:   PetscArraycpy(x,b,7*mbs);
457:   MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);
458:   VecRestoreArrayRead(bb,&b);
459:   VecRestoreArray(xx,&x);
460:   PetscLogFlops(2.0*a->bs2*(a->nz-mbs));
461:   return(0);
462: }

464: PetscErrorCode MatSolve_SeqSBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
465: {
466:   Mat_SeqSBAIJ      *a   =(Mat_SeqSBAIJ*)A->data;
467:   IS                isrow=a->row;
468:   const PetscInt    mbs  =a->mbs,*ai=a->i,*aj=a->j,*r,*vj;
469:   PetscErrorCode    ierr;
470:   PetscInt          nz,k,idx;
471:   const MatScalar   *aa=a->a,*v,*d;
472:   PetscScalar       *x,x0,x1,x2,x3,x4,x5,*t,*tp;
473:   const PetscScalar *b;

476:   VecGetArrayRead(bb,&b);
477:   VecGetArray(xx,&x);
478:   t    = a->solve_work;
479:   ISGetIndices(isrow,&r);

481:   /* solve U^T * D * y = b by forward substitution */
482:   tp = t;
483:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
484:     idx   = 6*r[k];
485:     tp[0] = b[idx];
486:     tp[1] = b[idx+1];
487:     tp[2] = b[idx+2];
488:     tp[3] = b[idx+3];
489:     tp[4] = b[idx+4];
490:     tp[5] = b[idx+5];
491:     tp   += 6;
492:   }

494:   for (k=0; k<mbs; k++) {
495:     v  = aa + 36*ai[k];
496:     vj = aj + ai[k];
497:     tp = t + k*6;
498:     x0 =tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5];
499:     nz = ai[k+1] - ai[k];
500:     tp = t + (*vj)*6;
501:     while (nz--) {
502:       tp[0] +=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
503:       tp[1] +=  v[6]*x0 +  v[7]*x1 +  v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
504:       tp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
505:       tp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
506:       tp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
507:       tp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
508:       vj++;
509:       tp = t + (*vj)*6;
510:       v += 36;
511:     }

513:     /* xk = inv(Dk)*(Dk*xk) */
514:     d     = aa+k*36;       /* ptr to inv(Dk) */
515:     tp    = t + k*6;
516:     tp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
517:     tp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
518:     tp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
519:     tp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
520:     tp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
521:     tp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
522:   }

524:   /* solve U*x = y by back substitution */
525:   for (k=mbs-1; k>=0; k--) {
526:     v  = aa + 36*ai[k];
527:     vj = aj + ai[k];
528:     tp = t + k*6;
529:     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; /* xk */
530:     nz = ai[k+1] - ai[k];

532:     tp = t + (*vj)*6;
533:     while (nz--) {
534:       /* xk += U(k,:)*x(:) */
535:       x0 += v[0]*tp[0] + v[6]*tp[1] + v[12]*tp[2] + v[18]*tp[3] + v[24]*tp[4] + v[30]*tp[5];
536:       x1 += v[1]*tp[0] + v[7]*tp[1] + v[13]*tp[2] + v[19]*tp[3] + v[25]*tp[4] + v[31]*tp[5];
537:       x2 += v[2]*tp[0] + v[8]*tp[1] + v[14]*tp[2] + v[20]*tp[3] + v[26]*tp[4] + v[32]*tp[5];
538:       x3 += v[3]*tp[0] + v[9]*tp[1] + v[15]*tp[2] + v[21]*tp[3] + v[27]*tp[4] + v[33]*tp[5];
539:       x4 += v[4]*tp[0]+ v[10]*tp[1] + v[16]*tp[2] + v[22]*tp[3] + v[28]*tp[4] + v[34]*tp[5];
540:       x5 += v[5]*tp[0]+ v[11]*tp[1] + v[17]*tp[2] + v[23]*tp[3] + v[29]*tp[4] + v[35]*tp[5];
541:       vj++;
542:       tp = t + (*vj)*6;
543:       v += 36;
544:     }
545:     tp       = t + k*6;
546:     tp[0]    = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5;
547:     idx      = 6*r[k];
548:     x[idx]   = x0;
549:     x[idx+1] = x1;
550:     x[idx+2] = x2;
551:     x[idx+3] = x3;
552:     x[idx+4] = x4;
553:     x[idx+5] = x5;
554:   }

556:   ISRestoreIndices(isrow,&r);
557:   VecRestoreArrayRead(bb,&b);
558:   VecRestoreArray(xx,&x);
559:   PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
560:   return(0);
561: }

563: PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
564: {
565:   const MatScalar *v,*d;
566:   PetscScalar     *xp,x0,x1,x2,x3,x4,x5;
567:   PetscInt        nz,k;
568:   const PetscInt  *vj;

571:   for (k=0; k<mbs; k++) {
572:     v  = aa + 36*ai[k];
573:     xp = x + k*6;
574:     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* Dk*xk = k-th block of x */
575:     nz = ai[k+1] - ai[k];
576:     vj = aj + ai[k];
577:     xp = x + (*vj)*6;
578:     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
579:     PetscPrefetchBlock(v+36*nz,36*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
580:     while (nz--) {
581:       /* x(:) += U(k,:)^T*(Dk*xk) */
582:       xp[0] +=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
583:       xp[1] +=  v[6]*x0 +  v[7]*x1 +  v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
584:       xp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
585:       xp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
586:       xp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
587:       xp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
588:       vj++;
589:       xp = x + (*vj)*6;
590:       v += 36;
591:     }
592:     /* xk = inv(Dk)*(Dk*xk) */
593:     d     = aa+k*36;       /* ptr to inv(Dk) */
594:     xp    = x + k*6;
595:     xp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
596:     xp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
597:     xp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
598:     xp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
599:     xp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
600:     xp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
601:   }
602:   return(0);
603: }
604: PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
605: {
606:   const MatScalar   *v;
607:   PetscScalar       *xp,x0,x1,x2,x3,x4,x5;
608:   PetscInt          nz,k;
609:   const PetscInt    *vj;

612:   for (k=mbs-1; k>=0; k--) {
613:     v  = aa + 36*ai[k];
614:     xp = x + k*6;
615:     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* xk */
616:     nz = ai[k+1] - ai[k];
617:     vj = aj + ai[k];
618:     xp = x + (*vj)*6;
619:     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
620:     PetscPrefetchBlock(v-36*nz,36*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
621:     while (nz--) {
622:       /* xk += U(k,:)*x(:) */
623:       x0 += v[0]*xp[0] + v[6]*xp[1] + v[12]*xp[2] + v[18]*xp[3] + v[24]*xp[4] + v[30]*xp[5];
624:       x1 += v[1]*xp[0] + v[7]*xp[1] + v[13]*xp[2] + v[19]*xp[3] + v[25]*xp[4] + v[31]*xp[5];
625:       x2 += v[2]*xp[0] + v[8]*xp[1] + v[14]*xp[2] + v[20]*xp[3] + v[26]*xp[4] + v[32]*xp[5];
626:       x3 += v[3]*xp[0] + v[9]*xp[1] + v[15]*xp[2] + v[21]*xp[3] + v[27]*xp[4] + v[33]*xp[5];
627:       x4 += v[4]*xp[0]+ v[10]*xp[1] + v[16]*xp[2] + v[22]*xp[3] + v[28]*xp[4] + v[34]*xp[5];
628:       x5 += v[5]*xp[0]+ v[11]*xp[1] + v[17]*xp[2] + v[23]*xp[3] + v[29]*xp[4] + v[35]*xp[5];
629:       vj++;
630:       v += 36; xp = x + (*vj)*6;
631:     }
632:     xp   = x + k*6;
633:     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5;
634:   }
635:   return(0);
636: }


639: PetscErrorCode MatSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
640: {
641:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
642:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
643:   const MatScalar   *aa=a->a;
644:   PetscScalar       *x;
645:   const PetscScalar *b;
646:   PetscErrorCode    ierr;

649:   VecGetArrayRead(bb,&b);
650:   VecGetArray(xx,&x);

652:   /* solve U^T * D * y = b by forward substitution */
653:   PetscArraycpy(x,b,6*mbs); /* x <- b */
654:   MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);

656:   /* solve U*x = y by back substitution */
657:   MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);

659:   VecRestoreArrayRead(bb,&b);
660:   VecRestoreArray(xx,&x);
661:   PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
662:   return(0);
663: }

665: PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
666: {
667:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
668:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
669:   const MatScalar   *aa=a->a;
670:   PetscScalar       *x;
671:   const PetscScalar *b;
672:   PetscErrorCode    ierr;

675:   VecGetArrayRead(bb,&b);
676:   VecGetArray(xx,&x);
677:   PetscArraycpy(x,b,6*mbs); /* x <- b */
678:   MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);
679:   VecRestoreArrayRead(bb,&b);
680:   VecRestoreArray(xx,&x);
681:   PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);
682:   return(0);
683: }

685: PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
686: {
687:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
688:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
689:   const MatScalar   *aa=a->a;
690:   PetscScalar       *x;
691:   const PetscScalar *b;
692:   PetscErrorCode    ierr;

695:   VecGetArrayRead(bb,&b);
696:   VecGetArray(xx,&x);
697:   PetscArraycpy(x,b,6*mbs); /* x <- b */
698:   MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);
699:   VecRestoreArrayRead(bb,&b);
700:   VecRestoreArray(xx,&x);
701:   PetscLogFlops(2.0*a->bs2*(a->nz - mbs));
702:   return(0);
703: }

705: PetscErrorCode MatSolve_SeqSBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
706: {
707:   Mat_SeqSBAIJ      *a=(Mat_SeqSBAIJ*)A->data;
708:   IS                isrow=a->row;
709:   const PetscInt    mbs  =a->mbs,*ai=a->i,*aj=a->j;
710:   PetscErrorCode    ierr;
711:   const PetscInt    *r,*vj;
712:   PetscInt          nz,k,idx;
713:   const MatScalar   *aa=a->a,*v,*diag;
714:   PetscScalar       *x,x0,x1,x2,x3,x4,*t,*tp;
715:   const PetscScalar *b;

718:   VecGetArrayRead(bb,&b);
719:   VecGetArray(xx,&x);
720:   t    = a->solve_work;
721:   ISGetIndices(isrow,&r);

723:   /* solve U^T * D * y = b by forward substitution */
724:   tp = t;
725:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
726:     idx   = 5*r[k];
727:     tp[0] = b[idx];
728:     tp[1] = b[idx+1];
729:     tp[2] = b[idx+2];
730:     tp[3] = b[idx+3];
731:     tp[4] = b[idx+4];
732:     tp   += 5;
733:   }

735:   for (k=0; k<mbs; k++) {
736:     v  = aa + 25*ai[k];
737:     vj = aj + ai[k];
738:     tp = t + k*5;
739:     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4];
740:     nz = ai[k+1] - ai[k];

742:     tp = t + (*vj)*5;
743:     while (nz--) {
744:       tp[0] +=  v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4;
745:       tp[1] +=  v[5]*x0 + v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4;
746:       tp[2] += v[10]*x0+ v[11]*x1+ v[12]*x2+ v[13]*x3+ v[14]*x4;
747:       tp[3] += v[15]*x0+ v[16]*x1+ v[17]*x2+ v[18]*x3+ v[19]*x4;
748:       tp[4] += v[20]*x0+ v[21]*x1+ v[22]*x2+ v[23]*x3+ v[24]*x4;
749:       vj++;
750:       tp = t + (*vj)*5;
751:       v += 25;
752:     }

754:     /* xk = inv(Dk)*(Dk*xk) */
755:     diag  = aa+k*25;          /* ptr to inv(Dk) */
756:     tp    = t + k*5;
757:     tp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
758:     tp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
759:     tp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
760:     tp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
761:     tp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
762:   }

764:   /* solve U*x = y by back substitution */
765:   for (k=mbs-1; k>=0; k--) {
766:     v  = aa + 25*ai[k];
767:     vj = aj + ai[k];
768:     tp = t + k*5;
769:     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; /* xk */
770:     nz = ai[k+1] - ai[k];

772:     tp = t + (*vj)*5;
773:     while (nz--) {
774:       /* xk += U(k,:)*x(:) */
775:       x0 += v[0]*tp[0] + v[5]*tp[1] + v[10]*tp[2] + v[15]*tp[3] + v[20]*tp[4];
776:       x1 += v[1]*tp[0] + v[6]*tp[1] + v[11]*tp[2] + v[16]*tp[3] + v[21]*tp[4];
777:       x2 += v[2]*tp[0] + v[7]*tp[1] + v[12]*tp[2] + v[17]*tp[3] + v[22]*tp[4];
778:       x3 += v[3]*tp[0] + v[8]*tp[1] + v[13]*tp[2] + v[18]*tp[3] + v[23]*tp[4];
779:       x4 += v[4]*tp[0] + v[9]*tp[1] + v[14]*tp[2] + v[19]*tp[3] + v[24]*tp[4];
780:       vj++;
781:       tp = t + (*vj)*5;
782:       v += 25;
783:     }
784:     tp       = t + k*5;
785:     tp[0]    = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4;
786:     idx      = 5*r[k];
787:     x[idx]   = x0;
788:     x[idx+1] = x1;
789:     x[idx+2] = x2;
790:     x[idx+3] = x3;
791:     x[idx+4] = x4;
792:   }

794:   ISRestoreIndices(isrow,&r);
795:   VecRestoreArrayRead(bb,&b);
796:   VecRestoreArray(xx,&x);
797:   PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
798:   return(0);
799: }

801: PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
802: {
803:   const MatScalar *v,*diag;
804:   PetscScalar     *xp,x0,x1,x2,x3,x4;
805:   PetscInt        nz,k;
806:   const PetscInt  *vj;

809:   for (k=0; k<mbs; k++) {
810:     v  = aa + 25*ai[k];
811:     xp = x + k*5;
812:     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4];/* Dk*xk = k-th block of x */
813:     nz = ai[k+1] - ai[k];
814:     vj = aj + ai[k];
815:     xp = x + (*vj)*5;
816:     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
817:     PetscPrefetchBlock(v+25*nz,25*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
818:     while (nz--) {
819:       /* x(:) += U(k,:)^T*(Dk*xk) */
820:       xp[0] +=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4;
821:       xp[1] +=  v[5]*x0 +  v[6]*x1 +  v[7]*x2 + v[8]*x3 + v[9]*x4;
822:       xp[2] += v[10]*x0 + v[11]*x1 + v[12]*x2+ v[13]*x3+ v[14]*x4;
823:       xp[3] += v[15]*x0 + v[16]*x1 + v[17]*x2+ v[18]*x3+ v[19]*x4;
824:       xp[4] += v[20]*x0 + v[21]*x1 + v[22]*x2+ v[23]*x3+ v[24]*x4;
825:       vj++;
826:       xp = x + (*vj)*5;
827:       v += 25;
828:     }
829:     /* xk = inv(Dk)*(Dk*xk) */
830:     diag  = aa+k*25;         /* ptr to inv(Dk) */
831:     xp    = x + k*5;
832:     xp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
833:     xp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
834:     xp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
835:     xp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
836:     xp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
837:   }
838:   return(0);
839: }

841: PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
842: {
843:   const MatScalar *v;
844:   PetscScalar     *xp,x0,x1,x2,x3,x4;
845:   PetscInt        nz,k;
846:   const PetscInt  *vj;

849:   for (k=mbs-1; k>=0; k--) {
850:     v  = aa + 25*ai[k];
851:     xp = x + k*5;
852:     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; /* xk */
853:     nz = ai[k+1] - ai[k];
854:     vj = aj + ai[k];
855:     xp = x + (*vj)*5;
856:     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
857:     PetscPrefetchBlock(v-25*nz,25*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
858:     while (nz--) {
859:       /* xk += U(k,:)*x(:) */
860:       x0 += v[0]*xp[0] + v[5]*xp[1] + v[10]*xp[2] + v[15]*xp[3] + v[20]*xp[4];
861:       x1 += v[1]*xp[0] + v[6]*xp[1] + v[11]*xp[2] + v[16]*xp[3] + v[21]*xp[4];
862:       x2 += v[2]*xp[0] + v[7]*xp[1] + v[12]*xp[2] + v[17]*xp[3] + v[22]*xp[4];
863:       x3 += v[3]*xp[0] + v[8]*xp[1] + v[13]*xp[2] + v[18]*xp[3] + v[23]*xp[4];
864:       x4 += v[4]*xp[0] + v[9]*xp[1] + v[14]*xp[2] + v[19]*xp[3] + v[24]*xp[4];
865:       vj++;
866:       v += 25; xp = x + (*vj)*5;
867:     }
868:     xp   = x + k*5;
869:     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4;
870:   }
871:   return(0);
872: }

874: PetscErrorCode MatSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
875: {
876:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
877:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
878:   const MatScalar   *aa=a->a;
879:   PetscScalar       *x;
880:   const PetscScalar *b;
881:   PetscErrorCode    ierr;

884:   VecGetArrayRead(bb,&b);
885:   VecGetArray(xx,&x);

887:   /* solve U^T * D * y = b by forward substitution */
888:   PetscArraycpy(x,b,5*mbs); /* x <- b */
889:   MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);

891:   /* solve U*x = y by back substitution */
892:   MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);

894:   VecRestoreArrayRead(bb,&b);
895:   VecRestoreArray(xx,&x);
896:   PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
897:   return(0);
898: }

900: PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
901: {
902:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
903:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
904:   const MatScalar   *aa=a->a;
905:   PetscScalar       *x;
906:   const PetscScalar *b;
907:   PetscErrorCode    ierr;

910:   VecGetArrayRead(bb,&b);
911:   VecGetArray(xx,&x);
912:   PetscArraycpy(x,b,5*mbs); /* x <- b */
913:   MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);
914:   VecRestoreArrayRead(bb,&b);
915:   VecRestoreArray(xx,&x);
916:   PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);
917:   return(0);
918: }

920: PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
921: {
922:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
923:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
924:   const MatScalar   *aa=a->a;
925:   PetscScalar       *x;
926:   const PetscScalar *b;
927:   PetscErrorCode    ierr;

930:   VecGetArrayRead(bb,&b);
931:   VecGetArray(xx,&x);
932:   PetscArraycpy(x,b,5*mbs);
933:   MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);
934:   VecRestoreArrayRead(bb,&b);
935:   VecRestoreArray(xx,&x);
936:   PetscLogFlops(2.0*a->bs2*(a->nz-mbs));
937:   return(0);
938: }

940: PetscErrorCode MatSolve_SeqSBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
941: {
942:   Mat_SeqSBAIJ      *a   =(Mat_SeqSBAIJ*)A->data;
943:   IS                isrow=a->row;
944:   const PetscInt    mbs  =a->mbs,*ai=a->i,*aj=a->j;
945:   PetscErrorCode    ierr;
946:   const PetscInt    *r,*vj;
947:   PetscInt          nz,k,idx;
948:   const MatScalar   *aa=a->a,*v,*diag;
949:   PetscScalar       *x,x0,x1,x2,x3,*t,*tp;
950:   const PetscScalar *b;

953:   VecGetArrayRead(bb,&b);
954:   VecGetArray(xx,&x);
955:   t    = a->solve_work;
956:   ISGetIndices(isrow,&r);

958:   /* solve U^T * D * y = b by forward substitution */
959:   tp = t;
960:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
961:     idx   = 4*r[k];
962:     tp[0] = b[idx];
963:     tp[1] = b[idx+1];
964:     tp[2] = b[idx+2];
965:     tp[3] = b[idx+3];
966:     tp   += 4;
967:   }

969:   for (k=0; k<mbs; k++) {
970:     v  = aa + 16*ai[k];
971:     vj = aj + ai[k];
972:     tp = t + k*4;
973:     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3];
974:     nz = ai[k+1] - ai[k];

976:     tp = t + (*vj)*4;
977:     while (nz--) {
978:       tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
979:       tp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
980:       tp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
981:       tp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
982:       vj++;
983:       tp = t + (*vj)*4;
984:       v += 16;
985:     }

987:     /* xk = inv(Dk)*(Dk*xk) */
988:     diag  = aa+k*16;          /* ptr to inv(Dk) */
989:     tp    = t + k*4;
990:     tp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
991:     tp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
992:     tp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
993:     tp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
994:   }

996:   /* solve U*x = y by back substitution */
997:   for (k=mbs-1; k>=0; k--) {
998:     v  = aa + 16*ai[k];
999:     vj = aj + ai[k];
1000:     tp = t + k*4;
1001:     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; /* xk */
1002:     nz = ai[k+1] - ai[k];

1004:     tp = t + (*vj)*4;
1005:     while (nz--) {
1006:       /* xk += U(k,:)*x(:) */
1007:       x0 += v[0]*tp[0] + v[4]*tp[1] + v[8]*tp[2] + v[12]*tp[3];
1008:       x1 += v[1]*tp[0] + v[5]*tp[1] + v[9]*tp[2] + v[13]*tp[3];
1009:       x2 += v[2]*tp[0] + v[6]*tp[1]+ v[10]*tp[2] + v[14]*tp[3];
1010:       x3 += v[3]*tp[0] + v[7]*tp[1]+ v[11]*tp[2] + v[15]*tp[3];
1011:       vj++;
1012:       tp = t + (*vj)*4;
1013:       v += 16;
1014:     }
1015:     tp       = t + k*4;
1016:     tp[0]    =x0; tp[1]=x1; tp[2]=x2; tp[3]=x3;
1017:     idx      = 4*r[k];
1018:     x[idx]   = x0;
1019:     x[idx+1] = x1;
1020:     x[idx+2] = x2;
1021:     x[idx+3] = x3;
1022:   }

1024:   ISRestoreIndices(isrow,&r);
1025:   VecRestoreArrayRead(bb,&b);
1026:   VecRestoreArray(xx,&x);
1027:   PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
1028:   return(0);
1029: }

1031: PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1032: {
1033:   const MatScalar *v,*diag;
1034:   PetscScalar     *xp,x0,x1,x2,x3;
1035:   PetscInt        nz,k;
1036:   const PetscInt  *vj;

1039:   for (k=0; k<mbs; k++) {
1040:     v  = aa + 16*ai[k];
1041:     xp = x + k*4;
1042:     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* Dk*xk = k-th block of x */
1043:     nz = ai[k+1] - ai[k];
1044:     vj = aj + ai[k];
1045:     xp = x + (*vj)*4;
1046:     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
1047:     PetscPrefetchBlock(v+16*nz,16*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1048:     while (nz--) {
1049:       /* x(:) += U(k,:)^T*(Dk*xk) */
1050:       xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
1051:       xp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
1052:       xp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
1053:       xp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
1054:       vj++;
1055:       xp = x + (*vj)*4;
1056:       v += 16;
1057:     }
1058:     /* xk = inv(Dk)*(Dk*xk) */
1059:     diag  = aa+k*16;         /* ptr to inv(Dk) */
1060:     xp    = x + k*4;
1061:     xp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
1062:     xp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
1063:     xp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
1064:     xp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
1065:   }
1066:   return(0);
1067: }

1069: PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1070: {
1071:   const MatScalar *v;
1072:   PetscScalar     *xp,x0,x1,x2,x3;
1073:   PetscInt        nz,k;
1074:   const PetscInt  *vj;

1077:   for (k=mbs-1; k>=0; k--) {
1078:     v  = aa + 16*ai[k];
1079:     xp = x + k*4;
1080:     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* xk */
1081:     nz = ai[k+1] - ai[k];
1082:     vj = aj + ai[k];
1083:     xp = x + (*vj)*4;
1084:     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
1085:     PetscPrefetchBlock(v-16*nz,16*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1086:     while (nz--) {
1087:       /* xk += U(k,:)*x(:) */
1088:       x0 += v[0]*xp[0] + v[4]*xp[1] + v[8]*xp[2] + v[12]*xp[3];
1089:       x1 += v[1]*xp[0] + v[5]*xp[1] + v[9]*xp[2] + v[13]*xp[3];
1090:       x2 += v[2]*xp[0] + v[6]*xp[1]+ v[10]*xp[2] + v[14]*xp[3];
1091:       x3 += v[3]*xp[0] + v[7]*xp[1]+ v[11]*xp[2] + v[15]*xp[3];
1092:       vj++;
1093:       v += 16; xp = x + (*vj)*4;
1094:     }
1095:     xp    = x + k*4;
1096:     xp[0] = x0; xp[1] = x1; xp[2] = x2; xp[3] = x3;
1097:   }
1098:   return(0);
1099: }

1101: PetscErrorCode MatSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1102: {
1103:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1104:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1105:   const MatScalar   *aa=a->a;
1106:   PetscScalar       *x;
1107:   const PetscScalar *b;
1108:   PetscErrorCode    ierr;

1111:   VecGetArrayRead(bb,&b);
1112:   VecGetArray(xx,&x);

1114:   /* solve U^T * D * y = b by forward substitution */
1115:   PetscArraycpy(x,b,4*mbs); /* x <- b */
1116:   MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);

1118:   /* solve U*x = y by back substitution */
1119:   MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);
1120:   VecRestoreArrayRead(bb,&b);
1121:   VecRestoreArray(xx,&x);
1122:   PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
1123:   return(0);
1124: }

1126: PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1127: {
1128:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1129:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1130:   const MatScalar   *aa=a->a;
1131:   PetscScalar       *x;
1132:   const PetscScalar *b;
1133:   PetscErrorCode    ierr;

1136:   VecGetArrayRead(bb,&b);
1137:   VecGetArray(xx,&x);
1138:   PetscArraycpy(x,b,4*mbs); /* x <- b */
1139:   MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);
1140:   VecRestoreArrayRead(bb,&b);
1141:   VecRestoreArray(xx,&x);
1142:   PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);
1143:   return(0);
1144: }

1146: PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1147: {
1148:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1149:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1150:   const MatScalar   *aa=a->a;
1151:   PetscScalar       *x;
1152:   const PetscScalar *b;
1153:   PetscErrorCode    ierr;

1156:   VecGetArrayRead(bb,&b);
1157:   VecGetArray(xx,&x);
1158:   PetscArraycpy(x,b,4*mbs);
1159:   MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);
1160:   VecRestoreArrayRead(bb,&b);
1161:   VecRestoreArray(xx,&x);
1162:   PetscLogFlops(2.0*a->bs2*(a->nz-mbs));
1163:   return(0);
1164: }

1166: PetscErrorCode MatSolve_SeqSBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
1167: {
1168:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1169:   IS                isrow=a->row;
1170:   const PetscInt    mbs  =a->mbs,*ai=a->i,*aj=a->j;
1171:   PetscErrorCode    ierr;
1172:   const PetscInt    *r;
1173:   PetscInt          nz,k,idx;
1174:   const PetscInt    *vj;
1175:   const MatScalar   *aa=a->a,*v,*diag;
1176:   PetscScalar       *x,x0,x1,x2,*t,*tp;
1177:   const PetscScalar *b;

1180:   VecGetArrayRead(bb,&b);
1181:   VecGetArray(xx,&x);
1182:   t    = a->solve_work;
1183:   ISGetIndices(isrow,&r);

1185:   /* solve U^T * D * y = b by forward substitution */
1186:   tp = t;
1187:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
1188:     idx   = 3*r[k];
1189:     tp[0] = b[idx];
1190:     tp[1] = b[idx+1];
1191:     tp[2] = b[idx+2];
1192:     tp   += 3;
1193:   }

1195:   for (k=0; k<mbs; k++) {
1196:     v  = aa + 9*ai[k];
1197:     vj = aj + ai[k];
1198:     tp = t + k*3;
1199:     x0 = tp[0]; x1 = tp[1]; x2 = tp[2];
1200:     nz = ai[k+1] - ai[k];

1202:     tp = t + (*vj)*3;
1203:     while (nz--) {
1204:       tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
1205:       tp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
1206:       tp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
1207:       vj++;
1208:       tp = t + (*vj)*3;
1209:       v += 9;
1210:     }

1212:     /* xk = inv(Dk)*(Dk*xk) */
1213:     diag  = aa+k*9;          /* ptr to inv(Dk) */
1214:     tp    = t + k*3;
1215:     tp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
1216:     tp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
1217:     tp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
1218:   }

1220:   /* solve U*x = y by back substitution */
1221:   for (k=mbs-1; k>=0; k--) {
1222:     v  = aa + 9*ai[k];
1223:     vj = aj + ai[k];
1224:     tp = t + k*3;
1225:     x0 = tp[0]; x1 = tp[1]; x2 = tp[2];  /* xk */
1226:     nz = ai[k+1] - ai[k];

1228:     tp = t + (*vj)*3;
1229:     while (nz--) {
1230:       /* xk += U(k,:)*x(:) */
1231:       x0 += v[0]*tp[0] + v[3]*tp[1] + v[6]*tp[2];
1232:       x1 += v[1]*tp[0] + v[4]*tp[1] + v[7]*tp[2];
1233:       x2 += v[2]*tp[0] + v[5]*tp[1] + v[8]*tp[2];
1234:       vj++;
1235:       tp = t + (*vj)*3;
1236:       v += 9;
1237:     }
1238:     tp       = t + k*3;
1239:     tp[0]    = x0; tp[1] = x1; tp[2] = x2;
1240:     idx      = 3*r[k];
1241:     x[idx]   = x0;
1242:     x[idx+1] = x1;
1243:     x[idx+2] = x2;
1244:   }

1246:   ISRestoreIndices(isrow,&r);
1247:   VecRestoreArrayRead(bb,&b);
1248:   VecRestoreArray(xx,&x);
1249:   PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
1250:   return(0);
1251: }

1253: PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1254: {
1255:   const MatScalar *v,*diag;
1256:   PetscScalar     *xp,x0,x1,x2;
1257:   PetscInt        nz,k;
1258:   const PetscInt  *vj;

1261:   for (k=0; k<mbs; k++) {
1262:     v  = aa + 9*ai[k];
1263:     xp = x + k*3;
1264:     x0 = xp[0]; x1 = xp[1]; x2 = xp[2]; /* Dk*xk = k-th block of x */
1265:     nz = ai[k+1] - ai[k];
1266:     vj = aj + ai[k];
1267:     xp = x + (*vj)*3;
1268:     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);    /* Indices for the next row (assumes same size as this one) */
1269:     PetscPrefetchBlock(v+9*nz,9*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1270:     while (nz--) {
1271:       /* x(:) += U(k,:)^T*(Dk*xk) */
1272:       xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
1273:       xp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
1274:       xp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
1275:       vj++;
1276:       xp = x + (*vj)*3;
1277:       v += 9;
1278:     }
1279:     /* xk = inv(Dk)*(Dk*xk) */
1280:     diag  = aa+k*9;         /* ptr to inv(Dk) */
1281:     xp    = x + k*3;
1282:     xp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
1283:     xp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
1284:     xp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
1285:   }
1286:   return(0);
1287: }

1289: PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1290: {
1291:   const MatScalar *v;
1292:   PetscScalar     *xp,x0,x1,x2;
1293:   PetscInt        nz,k;
1294:   const PetscInt  *vj;

1297:   for (k=mbs-1; k>=0; k--) {
1298:     v  = aa + 9*ai[k];
1299:     xp = x + k*3;
1300:     x0 = xp[0]; x1 = xp[1]; x2 = xp[2];  /* xk */
1301:     nz = ai[k+1] - ai[k];
1302:     vj = aj + ai[k];
1303:     xp = x + (*vj)*3;
1304:     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);    /* Indices for the next row (assumes same size as this one) */
1305:     PetscPrefetchBlock(v-9*nz,9*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1306:     while (nz--) {
1307:       /* xk += U(k,:)*x(:) */
1308:       x0 += v[0]*xp[0] + v[3]*xp[1] + v[6]*xp[2];
1309:       x1 += v[1]*xp[0] + v[4]*xp[1] + v[7]*xp[2];
1310:       x2 += v[2]*xp[0] + v[5]*xp[1] + v[8]*xp[2];
1311:       vj++;
1312:       v += 9; xp = x + (*vj)*3;
1313:     }
1314:     xp    = x + k*3;
1315:     xp[0] = x0; xp[1] = x1; xp[2] = x2;
1316:   }
1317:   return(0);
1318: }

1320: PetscErrorCode MatSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1321: {
1322:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1323:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1324:   const MatScalar   *aa=a->a;
1325:   PetscScalar       *x;
1326:   const PetscScalar *b;
1327:   PetscErrorCode    ierr;

1330:   VecGetArrayRead(bb,&b);
1331:   VecGetArray(xx,&x);

1333:   /* solve U^T * D * y = b by forward substitution */
1334:   PetscArraycpy(x,b,3*mbs);
1335:   MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);

1337:   /* solve U*x = y by back substitution */
1338:   MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);

1340:   VecRestoreArrayRead(bb,&b);
1341:   VecRestoreArray(xx,&x);
1342:   PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
1343:   return(0);
1344: }

1346: PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1347: {
1348:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1349:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1350:   const MatScalar   *aa=a->a;
1351:   PetscScalar       *x;
1352:   const PetscScalar *b;
1353:   PetscErrorCode    ierr;

1356:   VecGetArrayRead(bb,&b);
1357:   VecGetArray(xx,&x);
1358:   PetscArraycpy(x,b,3*mbs);
1359:   MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);
1360:   VecRestoreArrayRead(bb,&b);
1361:   VecRestoreArray(xx,&x);
1362:   PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);
1363:   return(0);
1364: }

1366: PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1367: {
1368:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1369:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1370:   const MatScalar   *aa=a->a;
1371:   PetscScalar       *x;
1372:   const PetscScalar *b;
1373:   PetscErrorCode    ierr;

1376:   VecGetArrayRead(bb,&b);
1377:   VecGetArray(xx,&x);
1378:   PetscArraycpy(x,b,3*mbs);
1379:   MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);
1380:   VecRestoreArrayRead(bb,&b);
1381:   VecRestoreArray(xx,&x);
1382:   PetscLogFlops(2.0*a->bs2*(a->nz-mbs));
1383:   return(0);
1384: }

1386: PetscErrorCode MatSolve_SeqSBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
1387: {
1388:   Mat_SeqSBAIJ      *a   =(Mat_SeqSBAIJ*)A->data;
1389:   IS                isrow=a->row;
1390:   const PetscInt    mbs  =a->mbs,*ai=a->i,*aj=a->j;
1391:   PetscErrorCode    ierr;
1392:   const PetscInt    *r,*vj;
1393:   PetscInt          nz,k,k2,idx;
1394:   const MatScalar   *aa=a->a,*v,*diag;
1395:   PetscScalar       *x,x0,x1,*t;
1396:   const PetscScalar *b;

1399:   VecGetArrayRead(bb,&b);
1400:   VecGetArray(xx,&x);
1401:   t    = a->solve_work;
1402:   ISGetIndices(isrow,&r);

1404:   /* solve U^T * D * y = perm(b) by forward substitution */
1405:   for (k=0; k<mbs; k++) {  /* t <- perm(b) */
1406:     idx      = 2*r[k];
1407:     t[k*2]   = b[idx];
1408:     t[k*2+1] = b[idx+1];
1409:   }
1410:   for (k=0; k<mbs; k++) {
1411:     v  = aa + 4*ai[k];
1412:     vj = aj + ai[k];
1413:     k2 = k*2;
1414:     x0 = t[k2]; x1 = t[k2+1];
1415:     nz = ai[k+1] - ai[k];
1416:     while (nz--) {
1417:       t[(*vj)*2]   += v[0]*x0 + v[1]*x1;
1418:       t[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1419:       vj++; v      += 4;
1420:     }
1421:     diag    = aa+k*4; /* ptr to inv(Dk) */
1422:     t[k2]   = diag[0]*x0 + diag[2]*x1;
1423:     t[k2+1] = diag[1]*x0 + diag[3]*x1;
1424:   }

1426:   /* solve U*x = y by back substitution */
1427:   for (k=mbs-1; k>=0; k--) {
1428:     v  = aa + 4*ai[k];
1429:     vj = aj + ai[k];
1430:     k2 = k*2;
1431:     x0 = t[k2]; x1 = t[k2+1];
1432:     nz = ai[k+1] - ai[k];
1433:     while (nz--) {
1434:       x0 += v[0]*t[(*vj)*2] + v[2]*t[(*vj)*2+1];
1435:       x1 += v[1]*t[(*vj)*2] + v[3]*t[(*vj)*2+1];
1436:       vj++;
1437:       v += 4;
1438:     }
1439:     t[k2]    = x0;
1440:     t[k2+1]  = x1;
1441:     idx      = 2*r[k];
1442:     x[idx]   = x0;
1443:     x[idx+1] = x1;
1444:   }

1446:   ISRestoreIndices(isrow,&r);
1447:   VecRestoreArrayRead(bb,&b);
1448:   VecRestoreArray(xx,&x);
1449:   PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
1450:   return(0);
1451: }

1453: PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1454: {
1455:   const MatScalar *v,*diag;
1456:   PetscScalar     x0,x1;
1457:   PetscInt        nz,k,k2;
1458:   const PetscInt  *vj;

1461:   for (k=0; k<mbs; k++) {
1462:     v  = aa + 4*ai[k];
1463:     vj = aj + ai[k];
1464:     k2 = k*2;
1465:     x0 = x[k2]; x1 = x[k2+1];  /* Dk*xk = k-th block of x */
1466:     nz = ai[k+1] - ai[k];
1467:     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);    /* Indices for the next row (assumes same size as this one) */
1468:     PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1469:     while (nz--) {
1470:       /* x(:) += U(k,:)^T*(Dk*xk) */
1471:       x[(*vj)*2]   += v[0]*x0 + v[1]*x1;
1472:       x[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1473:       vj++; v      += 4;
1474:     }
1475:     /* xk = inv(Dk)*(Dk*xk) */
1476:     diag    = aa+k*4;       /* ptr to inv(Dk) */
1477:     x[k2]   = diag[0]*x0 + diag[2]*x1;
1478:     x[k2+1] = diag[1]*x0 + diag[3]*x1;
1479:   }
1480:   return(0);
1481: }

1483: PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1484: {
1485:   const MatScalar *v;
1486:   PetscScalar     x0,x1;
1487:   PetscInt        nz,k,k2;
1488:   const PetscInt  *vj;

1491:   for (k=mbs-1; k>=0; k--) {
1492:     v  = aa + 4*ai[k];
1493:     vj = aj + ai[k];
1494:     k2 = k*2;
1495:     x0 = x[k2]; x1 = x[k2+1];  /* xk */
1496:     nz = ai[k+1] - ai[k];
1497:     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);    /* Indices for the next row (assumes same size as this one) */
1498:     PetscPrefetchBlock(v-4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1499:     while (nz--) {
1500:       /* xk += U(k,:)*x(:) */
1501:       x0 += v[0]*x[(*vj)*2] + v[2]*x[(*vj)*2+1];
1502:       x1 += v[1]*x[(*vj)*2] + v[3]*x[(*vj)*2+1];
1503:       vj++;
1504:       v += 4;
1505:     }
1506:     x[k2]   = x0;
1507:     x[k2+1] = x1;
1508:   }
1509:   return(0);
1510: }

1512: PetscErrorCode MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1513: {
1514:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1515:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1516:   const MatScalar   *aa=a->a;
1517:   PetscScalar       *x;
1518:   const PetscScalar *b;
1519:   PetscErrorCode    ierr;

1522:   VecGetArrayRead(bb,&b);
1523:   VecGetArray(xx,&x);

1525:   /* solve U^T * D * y = b by forward substitution */
1526:   PetscArraycpy(x,b,2*mbs);
1527:   MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);

1529:   /* solve U*x = y by back substitution */
1530:   MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);

1532:   VecRestoreArrayRead(bb,&b);
1533:   VecRestoreArray(xx,&x);
1534:   PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
1535:   return(0);
1536: }

1538: PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1539: {
1540:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1541:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1542:   const MatScalar   *aa=a->a;
1543:   PetscScalar       *x;
1544:   const PetscScalar *b;
1545:   PetscErrorCode    ierr;

1548:   VecGetArrayRead(bb,&b);
1549:   VecGetArray(xx,&x);
1550:   PetscArraycpy(x,b,2*mbs);
1551:   MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);
1552:   VecRestoreArrayRead(bb,&b);
1553:   VecRestoreArray(xx,&x);
1554:   PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);
1555:   return(0);
1556: }

1558: PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1559: {
1560:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1561:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1562:   const MatScalar   *aa=a->a;
1563:   PetscScalar       *x;
1564:   const PetscScalar *b;
1565:   PetscErrorCode    ierr;

1568:   VecGetArrayRead(bb,&b);
1569:   VecGetArray(xx,&x);
1570:   PetscArraycpy(x,b,2*mbs);
1571:   MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);
1572:   VecRestoreArrayRead(bb,&b);
1573:   VecRestoreArray(xx,&x);
1574:   PetscLogFlops(2.0*a->bs2*(a->nz - mbs));
1575:   return(0);
1576: }

1578: PetscErrorCode MatSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1579: {
1580:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ*)A->data;
1581:   IS                isrow=a->row;
1582:   PetscErrorCode    ierr;
1583:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag=a->diag;
1584:   const MatScalar   *aa=a->a,*v;
1585:   const PetscScalar *b;
1586:   PetscScalar       *x,xk,*t;
1587:   PetscInt          nz,k,j;

1590:   VecGetArrayRead(bb,&b);
1591:   VecGetArray(xx,&x);
1592:   t    = a->solve_work;
1593:   ISGetIndices(isrow,&rp);

1595:   /* solve U^T*D*y = perm(b) by forward substitution */
1596:   for (k=0; k<mbs; k++) t[k] = b[rp[k]];
1597:   for (k=0; k<mbs; k++) {
1598:     v  = aa + ai[k];
1599:     vj = aj + ai[k];
1600:     xk = t[k];
1601:     nz = ai[k+1] - ai[k] - 1;
1602:     for (j=0; j<nz; j++) t[vj[j]] += v[j]*xk;
1603:     t[k] = xk*v[nz];   /* v[nz] = 1/D(k) */
1604:   }

1606:   /* solve U*perm(x) = y by back substitution */
1607:   for (k=mbs-1; k>=0; k--) {
1608:     v  = aa + adiag[k] - 1;
1609:     vj = aj + adiag[k] - 1;
1610:     nz = ai[k+1] - ai[k] - 1;
1611:     for (j=0; j<nz; j++) t[k] += v[-j]*t[vj[-j]];
1612:     x[rp[k]] = t[k];
1613:   }

1615:   ISRestoreIndices(isrow,&rp);
1616:   VecRestoreArrayRead(bb,&b);
1617:   VecRestoreArray(xx,&x);
1618:   PetscLogFlops(4.0*a->nz - 3.0*mbs);
1619:   return(0);
1620: }

1622: PetscErrorCode MatSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1623: {
1624:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ*)A->data;
1625:   IS                isrow=a->row;
1626:   PetscErrorCode    ierr;
1627:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1628:   const MatScalar   *aa=a->a,*v;
1629:   PetscScalar       *x,xk,*t;
1630:   const PetscScalar *b;
1631:   PetscInt          nz,k;

1634:   VecGetArrayRead(bb,&b);
1635:   VecGetArray(xx,&x);
1636:   t    = a->solve_work;
1637:   ISGetIndices(isrow,&rp);

1639:   /* solve U^T*D*y = perm(b) by forward substitution */
1640:   for (k=0; k<mbs; k++) t[k] = b[rp[k]];
1641:   for (k=0; k<mbs; k++) {
1642:     v  = aa + ai[k] + 1;
1643:     vj = aj + ai[k] + 1;
1644:     xk = t[k];
1645:     nz = ai[k+1] - ai[k] - 1;
1646:     while (nz--) t[*vj++] += (*v++) * xk;
1647:     t[k] = xk*aa[ai[k]];  /* aa[k] = 1/D(k) */
1648:   }

1650:   /* solve U*perm(x) = y by back substitution */
1651:   for (k=mbs-1; k>=0; k--) {
1652:     v  = aa + ai[k] + 1;
1653:     vj = aj + ai[k] + 1;
1654:     nz = ai[k+1] - ai[k] - 1;
1655:     while (nz--) t[k] += (*v++) * t[*vj++];
1656:     x[rp[k]] = t[k];
1657:   }

1659:   ISRestoreIndices(isrow,&rp);
1660:   VecRestoreArrayRead(bb,&b);
1661:   VecRestoreArray(xx,&x);
1662:   PetscLogFlops(4.0*a->nz - 3*mbs);
1663:   return(0);
1664: }

1666: PetscErrorCode MatForwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1667: {
1668:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ*)A->data;
1669:   IS                isrow=a->row;
1670:   PetscErrorCode    ierr;
1671:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag=a->diag;
1672:   const MatScalar   *aa=a->a,*v;
1673:   PetscReal         diagk;
1674:   PetscScalar       *x,xk;
1675:   const PetscScalar *b;
1676:   PetscInt          nz,k;

1679:   /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1680:   VecGetArrayRead(bb,&b);
1681:   VecGetArray(xx,&x);
1682:   ISGetIndices(isrow,&rp);

1684:   for (k=0; k<mbs; k++) x[k] = b[rp[k]];
1685:   for (k=0; k<mbs; k++) {
1686:     v  = aa + ai[k];
1687:     vj = aj + ai[k];
1688:     xk = x[k];
1689:     nz = ai[k+1] - ai[k] - 1;
1690:     while (nz--) x[*vj++] += (*v++) * xk;

1692:     diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */
1693:     if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1694:     x[k] = xk*PetscSqrtReal(diagk);
1695:   }
1696:   ISRestoreIndices(isrow,&rp);
1697:   VecRestoreArrayRead(bb,&b);
1698:   VecRestoreArray(xx,&x);
1699:   PetscLogFlops(2.0*a->nz - mbs);
1700:   return(0);
1701: }

1703: PetscErrorCode MatForwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1704: {
1705:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ*)A->data;
1706:   IS                isrow=a->row;
1707:   PetscErrorCode    ierr;
1708:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1709:   const MatScalar   *aa=a->a,*v;
1710:   PetscReal         diagk;
1711:   PetscScalar       *x,xk;
1712:   const PetscScalar *b;
1713:   PetscInt          nz,k;

1716:   /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1717:   VecGetArrayRead(bb,&b);
1718:   VecGetArray(xx,&x);
1719:   ISGetIndices(isrow,&rp);

1721:   for (k=0; k<mbs; k++) x[k] = b[rp[k]];
1722:   for (k=0; k<mbs; k++) {
1723:     v  = aa + ai[k] + 1;
1724:     vj = aj + ai[k] + 1;
1725:     xk = x[k];
1726:     nz = ai[k+1] - ai[k] - 1;
1727:     while (nz--) x[*vj++] += (*v++) * xk;

1729:     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
1730:     if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1731:     x[k] = xk*PetscSqrtReal(diagk);
1732:   }
1733:   ISRestoreIndices(isrow,&rp);
1734:   VecRestoreArrayRead(bb,&b);
1735:   VecRestoreArray(xx,&x);
1736:   PetscLogFlops(2.0*a->nz - mbs);
1737:   return(0);
1738: }

1740: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1741: {
1742:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ*)A->data;
1743:   IS                isrow=a->row;
1744:   PetscErrorCode    ierr;
1745:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag=a->diag;
1746:   const MatScalar   *aa=a->a,*v;
1747:   PetscReal         diagk;
1748:   PetscScalar       *x,*t;
1749:   const PetscScalar *b;
1750:   PetscInt          nz,k;

1753:   /* solve D^(1/2)*U*perm(x) = b by back substitution */
1754:   VecGetArrayRead(bb,&b);
1755:   VecGetArray(xx,&x);
1756:   t    = a->solve_work;
1757:   ISGetIndices(isrow,&rp);

1759:   for (k=mbs-1; k>=0; k--) {
1760:     v     = aa + ai[k];
1761:     vj    = aj + ai[k];
1762:     diagk = PetscRealPart(aa[adiag[k]]);
1763:     if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1764:     t[k] = b[k] * PetscSqrtReal(diagk);
1765:     nz   = ai[k+1] - ai[k] - 1;
1766:     while (nz--) t[k] += (*v++) * t[*vj++];
1767:     x[rp[k]] = t[k];
1768:   }
1769:   ISRestoreIndices(isrow,&rp);
1770:   VecRestoreArrayRead(bb,&b);
1771:   VecRestoreArray(xx,&x);
1772:   PetscLogFlops(2.0*a->nz - mbs);
1773:   return(0);
1774: }

1776: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1777: {
1778:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ*)A->data;
1779:   IS                isrow=a->row;
1780:   PetscErrorCode    ierr;
1781:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1782:   const MatScalar   *aa=a->a,*v;
1783:   PetscReal         diagk;
1784:   PetscScalar       *x,*t;
1785:   const PetscScalar *b;
1786:   PetscInt          nz,k;

1789:   /* solve D^(1/2)*U*perm(x) = b by back substitution */
1790:   VecGetArrayRead(bb,&b);
1791:   VecGetArray(xx,&x);
1792:   t    = a->solve_work;
1793:   ISGetIndices(isrow,&rp);

1795:   for (k=mbs-1; k>=0; k--) {
1796:     v     = aa + ai[k] + 1;
1797:     vj    = aj + ai[k] + 1;
1798:     diagk = PetscRealPart(aa[ai[k]]);
1799:     if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1800:     t[k] = b[k] * PetscSqrtReal(diagk);
1801:     nz   = ai[k+1] - ai[k] - 1;
1802:     while (nz--) t[k] += (*v++) * t[*vj++];
1803:     x[rp[k]] = t[k];
1804:   }
1805:   ISRestoreIndices(isrow,&rp);
1806:   VecRestoreArrayRead(bb,&b);
1807:   VecRestoreArray(xx,&x);
1808:   PetscLogFlops(2.0*a->nz - mbs);
1809:   return(0);
1810: }

1812: PetscErrorCode MatSolves_SeqSBAIJ_1(Mat A,Vecs bb,Vecs xx)
1813: {
1814:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;

1818:   if (A->rmap->bs == 1) {
1819:     MatSolve_SeqSBAIJ_1(A,bb->v,xx->v);
1820:   } else {
1821:     IS                isrow=a->row;
1822:     const PetscInt    *vj,mbs=a->mbs,*ai=a->i,*aj=a->j,*rp;
1823:     const MatScalar   *aa=a->a,*v;
1824:     PetscScalar       *x,*t;
1825:     const PetscScalar *b;
1826:     PetscInt          nz,k,n,i,j;

1828:     if (bb->n > a->solves_work_n) {
1829:       PetscFree(a->solves_work);
1830:       PetscMalloc1(bb->n*A->rmap->N,&a->solves_work);
1831:       a->solves_work_n = bb->n;
1832:     }
1833:     n    = bb->n;
1834:     VecGetArrayRead(bb->v,&b);
1835:     VecGetArray(xx->v,&x);
1836:     t    = a->solves_work;

1838:     ISGetIndices(isrow,&rp);

1840:     /* solve U^T*D*y = perm(b) by forward substitution */
1841:     for (k=0; k<mbs; k++) {
1842:       for (i=0; i<n; i++) t[n*k+i] = b[rp[k]+i*mbs]; /* values are stored interlaced in t */
1843:     }
1844:     for (k=0; k<mbs; k++) {
1845:       v  = aa + ai[k];
1846:       vj = aj + ai[k];
1847:       nz = ai[k+1] - ai[k] - 1;
1848:       for (j=0; j<nz; j++) {
1849:         for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i];
1850:         v++;vj++;
1851:       }
1852:       for (i=0; i<n; i++) t[n*k+i] *= aa[nz];  /* note: aa[nz] = 1/D(k) */
1853:     }

1855:     /* solve U*perm(x) = y by back substitution */
1856:     for (k=mbs-1; k>=0; k--) {
1857:       v  = aa + ai[k] - 1;
1858:       vj = aj + ai[k] - 1;
1859:       nz = ai[k+1] - ai[k] - 1;
1860:       for (j=0; j<nz; j++) {
1861:         for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i];
1862:         v++;vj++;
1863:       }
1864:       for (i=0; i<n; i++) x[rp[k]+i*mbs] = t[n*k+i];
1865:     }

1867:     ISRestoreIndices(isrow,&rp);
1868:     VecRestoreArrayRead(bb->v,&b);
1869:     VecRestoreArray(xx->v,&x);
1870:     PetscLogFlops(bb->n*(4.0*a->nz - 3.0*mbs));
1871:   }
1872:   return(0);
1873: }

1875: PetscErrorCode MatSolves_SeqSBAIJ_1_inplace(Mat A,Vecs bb,Vecs xx)
1876: {
1877:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;

1881:   if (A->rmap->bs == 1) {
1882:     MatSolve_SeqSBAIJ_1_inplace(A,bb->v,xx->v);
1883:   } else {
1884:     IS                isrow=a->row;
1885:     const PetscInt    *vj,mbs=a->mbs,*ai=a->i,*aj=a->j,*rp;
1886:     const MatScalar   *aa=a->a,*v;
1887:     PetscScalar       *x,*t;
1888:     const PetscScalar *b;
1889:     PetscInt          nz,k,n,i;

1891:     if (bb->n > a->solves_work_n) {
1892:       PetscFree(a->solves_work);
1893:       PetscMalloc1(bb->n*A->rmap->N,&a->solves_work);
1894:       a->solves_work_n = bb->n;
1895:     }
1896:     n    = bb->n;
1897:     VecGetArrayRead(bb->v,&b);
1898:     VecGetArray(xx->v,&x);
1899:     t    = a->solves_work;

1901:     ISGetIndices(isrow,&rp);

1903:     /* solve U^T*D*y = perm(b) by forward substitution */
1904:     for (k=0; k<mbs; k++) {
1905:       for (i=0; i<n; i++) t[n*k+i] = b[rp[k]+i*mbs];  /* values are stored interlaced in t */
1906:     }
1907:     for (k=0; k<mbs; k++) {
1908:       v  = aa + ai[k];
1909:       vj = aj + ai[k];
1910:       nz = ai[k+1] - ai[k];
1911:       while (nz--) {
1912:         for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i];
1913:         v++;vj++;
1914:       }
1915:       for (i=0; i<n; i++) t[n*k+i] *= aa[k];  /* note: aa[k] = 1/D(k) */
1916:     }

1918:     /* solve U*perm(x) = y by back substitution */
1919:     for (k=mbs-1; k>=0; k--) {
1920:       v  = aa + ai[k];
1921:       vj = aj + ai[k];
1922:       nz = ai[k+1] - ai[k];
1923:       while (nz--) {
1924:         for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i];
1925:         v++;vj++;
1926:       }
1927:       for (i=0; i<n; i++) x[rp[k]+i*mbs] = t[n*k+i];
1928:     }

1930:     ISRestoreIndices(isrow,&rp);
1931:     VecRestoreArrayRead(bb->v,&b);
1932:     VecRestoreArray(xx->v,&x);
1933:     PetscLogFlops(bb->n*(4.0*a->nz - 3.0*mbs));
1934:   }
1935:   return(0);
1936: }

1938: PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
1939: {
1940:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1941:   PetscErrorCode    ierr;
1942:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*vj,*adiag=a->diag;
1943:   const MatScalar   *aa=a->a,*v;
1944:   const PetscScalar *b;
1945:   PetscScalar       *x,xi;
1946:   PetscInt          nz,i,j;

1949:   VecGetArrayRead(bb,&b);
1950:   VecGetArray(xx,&x);
1951:   /* solve U^T*D*y = b by forward substitution */
1952:   PetscArraycpy(x,b,mbs);
1953:   for (i=0; i<mbs; i++) {
1954:     v  = aa + ai[i];
1955:     vj = aj + ai[i];
1956:     xi = x[i];
1957:     nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
1958:     for (j=0; j<nz; j++) x[vj[j]] += v[j]*xi;
1959:     x[i] = xi*v[nz];  /* v[nz] = aa[diag[i]] = 1/D(i) */
1960:   }
1961:   /* solve U*x = y by backward substitution */
1962:   for (i=mbs-2; i>=0; i--) {
1963:     xi = x[i];
1964:     v  = aa + adiag[i] - 1; /* end of row i, excluding diag */
1965:     vj = aj + adiag[i] - 1;
1966:     nz = ai[i+1] - ai[i] - 1;
1967:     for (j=0; j<nz; j++) xi += v[-j]*x[vj[-j]];
1968:     x[i] = xi;
1969:   }
1970:   VecRestoreArrayRead(bb,&b);
1971:   VecRestoreArray(xx,&x);
1972:   PetscLogFlops(4.0*a->nz - 3*mbs);
1973:   return(0);
1974: }

1976: PetscErrorCode MatMatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Mat B,Mat X)
1977: {
1978:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1979:   PetscErrorCode    ierr;
1980:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*vj,*adiag=a->diag;
1981:   const MatScalar   *aa=a->a,*v;
1982:   const PetscScalar *b;
1983:   PetscScalar       *x,xi;
1984:   PetscInt          nz,i,j,neq,ldb,ldx;
1985:   PetscBool         isdense;

1988:   if (!mbs) return(0);
1989:   PetscObjectTypeCompare((PetscObject)B,MATSEQDENSE,&isdense);
1990:   if (!isdense) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"B matrix must be a SeqDense matrix");
1991:   if (X != B) {
1992:     PetscObjectTypeCompare((PetscObject)X,MATSEQDENSE,&isdense);
1993:     if (!isdense) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"X matrix must be a SeqDense matrix");
1994:   }
1995:   MatDenseGetArrayRead(B,&b);
1996:   MatDenseGetLDA(B,&ldb);
1997:   MatDenseGetArray(X,&x);
1998:   MatDenseGetLDA(X,&ldx);
1999:   for (neq=0; neq<B->cmap->n; neq++) {
2000:     /* solve U^T*D*y = b by forward substitution */
2001:     PetscArraycpy(x,b,mbs);
2002:     for (i=0; i<mbs; i++) {
2003:       v  = aa + ai[i];
2004:       vj = aj + ai[i];
2005:       xi = x[i];
2006:       nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
2007:       for (j=0; j<nz; j++) x[vj[j]] += v[j]*xi;
2008:       x[i] = xi*v[nz];  /* v[nz] = aa[diag[i]] = 1/D(i) */
2009:     }
2010:     /* solve U*x = y by backward substitution */
2011:     for (i=mbs-2; i>=0; i--) {
2012:       xi = x[i];
2013:       v  = aa + adiag[i] - 1; /* end of row i, excluding diag */
2014:       vj = aj + adiag[i] - 1;
2015:       nz = ai[i+1] - ai[i] - 1;
2016:       for (j=0; j<nz; j++) xi += v[-j]*x[vj[-j]];
2017:       x[i] = xi;
2018:     }
2019:     b += ldb;
2020:     x += ldx;
2021:   }
2022:   MatDenseRestoreArrayRead(B,&b);
2023:   MatDenseRestoreArray(X,&x);
2024:   PetscLogFlops(B->cmap->n*(4.0*a->nz - 3*mbs));
2025:   return(0);
2026: }

2028: PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2029: {
2030:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
2031:   PetscErrorCode    ierr;
2032:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*vj;
2033:   const MatScalar   *aa=a->a,*v;
2034:   PetscScalar       *x,xk;
2035:   const PetscScalar *b;
2036:   PetscInt          nz,k;

2039:   VecGetArrayRead(bb,&b);
2040:   VecGetArray(xx,&x);

2042:   /* solve U^T*D*y = b by forward substitution */
2043:   PetscArraycpy(x,b,mbs);
2044:   for (k=0; k<mbs; k++) {
2045:     v  = aa + ai[k] + 1;
2046:     vj = aj + ai[k] + 1;
2047:     xk = x[k];
2048:     nz = ai[k+1] - ai[k] - 1;     /* exclude diag[k] */
2049:     while (nz--) x[*vj++] += (*v++) * xk;
2050:     x[k] = xk*aa[ai[k]];  /* note: aa[diag[k]] = 1/D(k) */
2051:   }

2053:   /* solve U*x = y by back substitution */
2054:   for (k=mbs-2; k>=0; k--) {
2055:     v  = aa + ai[k] + 1;
2056:     vj = aj + ai[k] + 1;
2057:     xk = x[k];
2058:     nz = ai[k+1] - ai[k] - 1;
2059:     while (nz--) {
2060:       xk += (*v++) * x[*vj++];
2061:     }
2062:     x[k] = xk;
2063:   }

2065:   VecRestoreArrayRead(bb,&b);
2066:   VecRestoreArray(xx,&x);
2067:   PetscLogFlops(4.0*a->nz - 3*mbs);
2068:   return(0);
2069: }

2071: PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2072: {
2073:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
2074:   PetscErrorCode    ierr;
2075:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vj;
2076:   const MatScalar   *aa=a->a,*v;
2077:   PetscReal         diagk;
2078:   PetscScalar       *x;
2079:   const PetscScalar *b;
2080:   PetscInt          nz,k;

2083:   /* solve U^T*D^(1/2)*x = b by forward substitution */
2084:   VecGetArrayRead(bb,&b);
2085:   VecGetArray(xx,&x);
2086:   PetscArraycpy(x,b,mbs);
2087:   for (k=0; k<mbs; k++) {
2088:     v  = aa + ai[k];
2089:     vj = aj + ai[k];
2090:     nz = ai[k+1] - ai[k] - 1;     /* exclude diag[k] */
2091:     while (nz--) x[*vj++] += (*v++) * x[k];
2092:     diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[adiag[k]] = 1/D(k) */
2093:     if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal (%g,%g) must be real and nonnegative",PetscRealPart(aa[adiag[k]]),PetscImaginaryPart(aa[adiag[k]]));
2094:     x[k] *= PetscSqrtReal(diagk);
2095:   }
2096:   VecRestoreArrayRead(bb,&b);
2097:   VecRestoreArray(xx,&x);
2098:   PetscLogFlops(2.0*a->nz - mbs);
2099:   return(0);
2100: }

2102: PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2103: {
2104:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
2105:   PetscErrorCode    ierr;
2106:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*vj;
2107:   const MatScalar   *aa=a->a,*v;
2108:   PetscReal         diagk;
2109:   PetscScalar       *x;
2110:   const PetscScalar *b;
2111:   PetscInt          nz,k;

2114:   /* solve U^T*D^(1/2)*x = b by forward substitution */
2115:   VecGetArrayRead(bb,&b);
2116:   VecGetArray(xx,&x);
2117:   PetscArraycpy(x,b,mbs);
2118:   for (k=0; k<mbs; k++) {
2119:     v  = aa + ai[k] + 1;
2120:     vj = aj + ai[k] + 1;
2121:     nz = ai[k+1] - ai[k] - 1;     /* exclude diag[k] */
2122:     while (nz--) x[*vj++] += (*v++) * x[k];
2123:     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
2124:     if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal (%g,%g) must be real and nonnegative",PetscRealPart(aa[ai[k]]),PetscImaginaryPart(aa[ai[k]]));
2125:     x[k] *= PetscSqrtReal(diagk);
2126:   }
2127:   VecRestoreArrayRead(bb,&b);
2128:   VecRestoreArray(xx,&x);
2129:   PetscLogFlops(2.0*a->nz - mbs);
2130:   return(0);
2131: }

2133: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2134: {
2135:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
2136:   PetscErrorCode    ierr;
2137:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vj;
2138:   const MatScalar   *aa=a->a,*v;
2139:   PetscReal         diagk;
2140:   PetscScalar       *x;
2141:   const PetscScalar *b;
2142:   PetscInt          nz,k;

2145:   /* solve D^(1/2)*U*x = b by back substitution */
2146:   VecGetArrayRead(bb,&b);
2147:   VecGetArray(xx,&x);

2149:   for (k=mbs-1; k>=0; k--) {
2150:     v     = aa + ai[k];
2151:     vj    = aj + ai[k];
2152:     diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */
2153:     if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
2154:     x[k] = PetscSqrtReal(diagk)*b[k];
2155:     nz   = ai[k+1] - ai[k] - 1;
2156:     while (nz--) x[k] += (*v++) * x[*vj++];
2157:   }
2158:   VecRestoreArrayRead(bb,&b);
2159:   VecRestoreArray(xx,&x);
2160:   PetscLogFlops(2.0*a->nz - mbs);
2161:   return(0);
2162: }

2164: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2165: {
2166:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
2167:   PetscErrorCode    ierr;
2168:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*vj;
2169:   const MatScalar   *aa=a->a,*v;
2170:   PetscReal         diagk;
2171:   PetscScalar       *x;
2172:   const PetscScalar *b;
2173:   PetscInt          nz,k;

2176:   /* solve D^(1/2)*U*x = b by back substitution */
2177:   VecGetArrayRead(bb,&b);
2178:   VecGetArray(xx,&x);

2180:   for (k=mbs-1; k>=0; k--) {
2181:     v     = aa + ai[k] + 1;
2182:     vj    = aj + ai[k] + 1;
2183:     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
2184:     if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
2185:     x[k] = PetscSqrtReal(diagk)*b[k];
2186:     nz   = ai[k+1] - ai[k] - 1;
2187:     while (nz--) x[k] += (*v++) * x[*vj++];
2188:   }
2189:   VecRestoreArrayRead(bb,&b);
2190:   VecRestoreArray(xx,&x);
2191:   PetscLogFlops(2.0*a->nz - mbs);
2192:   return(0);
2193: }

2195: /* Use Modified Sparse Row storage for u and ju, see Saad pp.85 */
2196: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_MSR(Mat B,Mat A,IS perm,const MatFactorInfo *info)
2197: {
2198:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b;
2200:   const PetscInt *rip,mbs = a->mbs,*ai,*aj;
2201:   PetscInt       *jutmp,bs = A->rmap->bs,i;
2202:   PetscInt       m,reallocs = 0,*levtmp;
2203:   PetscInt       *prowl,*q,jmin,jmax,juidx,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd;
2204:   PetscInt       incrlev,*lev,shift,prow,nz;
2205:   PetscReal      f = info->fill,levels = info->levels;
2206:   PetscBool      perm_identity;

2209:   /* check whether perm is the identity mapping */
2210:   ISIdentity(perm,&perm_identity);

2212:   if (perm_identity) {
2213:     a->permute = PETSC_FALSE;
2214:     ai         = a->i; aj = a->j;
2215:   } else { /*  non-trivial permutation */
2216:     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
2217:   }

2219:   /* initialization */
2220:   ISGetIndices(perm,&rip);
2221:   umax  = (PetscInt)(f*ai[mbs] + 1);
2222:   PetscMalloc1(umax,&lev);
2223:   umax += mbs + 1;
2224:   shift = mbs + 1;
2225:   PetscMalloc1(mbs+1,&iu);
2226:   PetscMalloc1(umax,&ju);
2227:   iu[0] = mbs + 1;
2228:   juidx = mbs + 1;
2229:   /* prowl: linked list for pivot row */
2230:   PetscMalloc3(mbs,&prowl,mbs,&q,mbs,&levtmp);
2231:   /* q: linked list for col index */

2233:   for (i=0; i<mbs; i++) {
2234:     prowl[i] = mbs;
2235:     q[i]     = 0;
2236:   }

2238:   /* for each row k */
2239:   for (k=0; k<mbs; k++) {
2240:     nzk  = 0;
2241:     q[k] = mbs;
2242:     /* copy current row into linked list */
2243:     nz = ai[rip[k]+1] - ai[rip[k]];
2244:     j  = ai[rip[k]];
2245:     while (nz--) {
2246:       vj = rip[aj[j++]];
2247:       if (vj > k) {
2248:         qm = k;
2249:         do {
2250:           m = qm; qm = q[m];
2251:         } while (qm < vj);
2252:         if (qm == vj) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Duplicate entry in A\n");
2253:         nzk++;
2254:         q[m]       = vj;
2255:         q[vj]      = qm;
2256:         levtmp[vj] = 0;   /* initialize lev for nonzero element */
2257:       }
2258:     }

2260:     /* modify nonzero structure of k-th row by computing fill-in
2261:        for each row prow to be merged in */
2262:     prow = k;
2263:     prow = prowl[prow]; /* next pivot row (== 0 for symbolic factorization) */

2265:     while (prow < k) {
2266:       /* merge row prow into k-th row */
2267:       jmin = iu[prow] + 1;
2268:       jmax = iu[prow+1];
2269:       qm   = k;
2270:       for (j=jmin; j<jmax; j++) {
2271:         incrlev = lev[j-shift] + 1;
2272:         if (incrlev > levels) continue;
2273:         vj = ju[j];
2274:         do {
2275:           m = qm; qm = q[m];
2276:         } while (qm < vj);
2277:         if (qm != vj) {      /* a fill */
2278:           nzk++; q[m] = vj; q[vj] = qm; qm = vj;
2279:           levtmp[vj] = incrlev;
2280:         } else if (levtmp[vj] > incrlev) levtmp[vj] = incrlev;
2281:       }
2282:       prow = prowl[prow]; /* next pivot row */
2283:     }

2285:     /* add k to row list for first nonzero element in k-th row */
2286:     if (nzk > 1) {
2287:       i        = q[k]; /* col value of first nonzero element in k_th row of U */
2288:       prowl[k] = prowl[i]; prowl[i] = k;
2289:     }
2290:     iu[k+1] = iu[k] + nzk;

2292:     /* allocate more space to ju and lev if needed */
2293:     if (iu[k+1] > umax) {
2294:       /* estimate how much additional space we will need */
2295:       /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
2296:       /* just double the memory each time */
2297:       maxadd = umax;
2298:       if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
2299:       umax += maxadd;

2301:       /* allocate a longer ju */
2302:       PetscMalloc1(umax,&jutmp);
2303:       PetscArraycpy(jutmp,ju,iu[k]);
2304:       PetscFree(ju);
2305:       ju   = jutmp;

2307:       PetscMalloc1(umax,&jutmp);
2308:       PetscArraycpy(jutmp,lev,iu[k]-shift);
2309:       PetscFree(lev);
2310:       lev       = jutmp;
2311:       reallocs += 2; /* count how many times we realloc */
2312:     }

2314:     /* save nonzero structure of k-th row in ju */
2315:     i=k;
2316:     while (nzk--) {
2317:       i                = q[i];
2318:       ju[juidx]        = i;
2319:       lev[juidx-shift] = levtmp[i];
2320:       juidx++;
2321:     }
2322:   }

2324: #if defined(PETSC_USE_INFO)
2325:   if (ai[mbs] != 0) {
2326:     PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
2327:     PetscInfo3(A,"Reallocs %D Fill ratio:given %g needed %g\n",reallocs,(double)f,(double)af);
2328:     PetscInfo1(A,"Run with -pc_factor_fill %g or use \n",(double)af);
2329:     PetscInfo1(A,"PCFactorSetFill(pc,%g);\n",(double)af);
2330:     PetscInfo(A,"for best performance.\n");
2331:   } else {
2332:     PetscInfo(A,"Empty matrix\n");
2333:   }
2334: #endif

2336:   ISRestoreIndices(perm,&rip);
2337:   PetscFree3(prowl,q,levtmp);
2338:   PetscFree(lev);

2340:   /* put together the new matrix */
2341:   MatSeqSBAIJSetPreallocation(B,bs,0,NULL);

2343:   /* PetscLogObjectParent((PetscObject)B,(PetscObject)iperm); */
2344:   b    = (Mat_SeqSBAIJ*)(B)->data;
2345:   PetscFree2(b->imax,b->ilen);

2347:   b->singlemalloc = PETSC_FALSE;
2348:   b->free_a       = PETSC_TRUE;
2349:   b->free_ij      = PETSC_TRUE;
2350:   /* the next line frees the default space generated by the Create() */
2351:   PetscFree3(b->a,b->j,b->i);
2352:   PetscMalloc1((iu[mbs]+1)*a->bs2,&b->a);
2353:   b->j    = ju;
2354:   b->i    = iu;
2355:   b->diag = NULL;
2356:   b->ilen = NULL;
2357:   b->imax = NULL;

2359:   ISDestroy(&b->row);
2360:   ISDestroy(&b->icol);
2361:   b->row  = perm;
2362:   b->icol = perm;
2363:   PetscObjectReference((PetscObject)perm);
2364:   PetscObjectReference((PetscObject)perm);
2365:   PetscMalloc1(bs*mbs+bs,&b->solve_work);
2366:   /* In b structure:  Free imax, ilen, old a, old j.
2367:      Allocate idnew, solve_work, new a, new j */
2368:   PetscLogObjectMemory((PetscObject)B,(iu[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));
2369:   b->maxnz = b->nz = iu[mbs];

2371:   (B)->info.factor_mallocs   = reallocs;
2372:   (B)->info.fill_ratio_given = f;
2373:   if (ai[mbs] != 0) {
2374:     (B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
2375:   } else {
2376:     (B)->info.fill_ratio_needed = 0.0;
2377:   }
2378:   MatSeqSBAIJSetNumericFactorization_inplace(B,perm_identity);
2379:   return(0);
2380: }

2382: /*
2383:   See MatICCFactorSymbolic_SeqAIJ() for description of its data structure
2384: */
2385: #include <petscbt.h>
2386: #include <../src/mat/utils/freespace.h>
2387: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
2388: {
2389:   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b;
2390:   PetscErrorCode     ierr;
2391:   PetscBool          perm_identity,free_ij = PETSC_TRUE,missing;
2392:   PetscInt           bs=A->rmap->bs,am=a->mbs,d,*ai=a->i,*aj= a->j;
2393:   const PetscInt     *rip;
2394:   PetscInt           reallocs=0,i,*ui,*udiag,*cols;
2395:   PetscInt           jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow;
2396:   PetscInt           nlnk,*lnk,*lnk_lvl=NULL,ncols,*uj,**uj_ptr,**uj_lvl_ptr;
2397:   PetscReal          fill          =info->fill,levels=info->levels;
2398:   PetscFreeSpaceList free_space    =NULL,current_space=NULL;
2399:   PetscFreeSpaceList free_space_lvl=NULL,current_space_lvl=NULL;
2400:   PetscBT            lnkbt;

2403:   if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n);
2404:   MatMissingDiagonal(A,&missing,&d);
2405:   if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);
2406:   if (bs > 1) {
2407:     MatICCFactorSymbolic_SeqSBAIJ_inplace(fact,A,perm,info);
2408:     return(0);
2409:   }

2411:   /* check whether perm is the identity mapping */
2412:   ISIdentity(perm,&perm_identity);
2413:   if (!perm_identity) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
2414:   a->permute = PETSC_FALSE;

2416:   PetscMalloc1(am+1,&ui);
2417:   PetscMalloc1(am+1,&udiag);
2418:   ui[0] = 0;

2420:   /* ICC(0) without matrix ordering: simply rearrange column indices */
2421:   if (!levels) {
2422:     /* reuse the column pointers and row offsets for memory savings */
2423:     for (i=0; i<am; i++) {
2424:       ncols    = ai[i+1] - ai[i];
2425:       ui[i+1]  = ui[i] + ncols;
2426:       udiag[i] = ui[i+1] - 1; /* points to the last entry of U(i,:) */
2427:     }
2428:     PetscMalloc1(ui[am]+1,&uj);
2429:     cols = uj;
2430:     for (i=0; i<am; i++) {
2431:       aj    = a->j + ai[i] + 1; /* 1st entry of U(i,:) without diagonal */
2432:       ncols = ai[i+1] - ai[i] -1;
2433:       for (j=0; j<ncols; j++) *cols++ = aj[j];
2434:       *cols++ = i; /* diagonal is located as the last entry of U(i,:) */
2435:     }
2436:   } else { /* case: levels>0 */
2437:     ISGetIndices(perm,&rip);

2439:     /* initialization */
2440:     /* jl: linked list for storing indices of the pivot rows
2441:        il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
2442:     PetscMalloc4(am,&uj_ptr,am,&uj_lvl_ptr,am,&il,am,&jl);
2443:     for (i=0; i<am; i++) {
2444:       jl[i] = am; il[i] = 0;
2445:     }

2447:     /* create and initialize a linked list for storing column indices of the active row k */
2448:     nlnk = am + 1;
2449:     PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt);

2451:     /* initial FreeSpace size is fill*(ai[am]+1) */
2452:     PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space);

2454:     current_space = free_space;

2456:     PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space_lvl);

2458:     current_space_lvl = free_space_lvl;

2460:     for (k=0; k<am; k++) {  /* for each active row k */
2461:       /* initialize lnk by the column indices of row k */
2462:       nzk   = 0;
2463:       ncols = ai[k+1] - ai[k];
2464:       if (!ncols) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Empty row %D in matrix ",k);
2465:       cols = aj+ai[k];
2466:       PetscIncompleteLLInit(ncols,cols,am,rip,nlnk,lnk,lnk_lvl,lnkbt);
2467:       nzk += nlnk;

2469:       /* update lnk by computing fill-in for each pivot row to be merged in */
2470:       prow = jl[k]; /* 1st pivot row */

2472:       while (prow < k) {
2473:         nextprow = jl[prow];

2475:         /* merge prow into k-th row */
2476:         jmin  = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
2477:         jmax  = ui[prow+1];
2478:         ncols = jmax-jmin;
2479:         i     = jmin - ui[prow];

2481:         cols = uj_ptr[prow] + i;  /* points to the 2nd nzero entry in U(prow,k:am-1) */
2482:         uj   = uj_lvl_ptr[prow] + i;  /* levels of cols */
2483:         j    = *(uj - 1);
2484:         PetscICCLLAddSorted(ncols,cols,levels,uj,am,nlnk,lnk,lnk_lvl,lnkbt,j);
2485:         nzk += nlnk;

2487:         /* update il and jl for prow */
2488:         if (jmin < jmax) {
2489:           il[prow] = jmin;

2491:           j = *cols; jl[prow] = jl[j]; jl[j] = prow;
2492:         }
2493:         prow = nextprow;
2494:       }

2496:       /* if free space is not available, make more free space */
2497:       if (current_space->local_remaining<nzk) {
2498:         i    = am - k + 1; /* num of unfactored rows */
2499:         i    = PetscIntMultTruncate(i,PetscMin(nzk, i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
2500:         PetscFreeSpaceGet(i,&current_space);
2501:         PetscFreeSpaceGet(i,&current_space_lvl);
2502:         reallocs++;
2503:       }

2505:       /* copy data into free_space and free_space_lvl, then initialize lnk */
2506:       if (nzk == 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Empty row %D in ICC matrix factor",k);
2507:       PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);

2509:       /* add the k-th row into il and jl */
2510:       if (nzk > 1) {
2511:         i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
2512:         jl[k] = jl[i]; jl[i] = k;
2513:         il[k] = ui[k] + 1;
2514:       }
2515:       uj_ptr[k]     = current_space->array;
2516:       uj_lvl_ptr[k] = current_space_lvl->array;

2518:       current_space->array               += nzk;
2519:       current_space->local_used          += nzk;
2520:       current_space->local_remaining     -= nzk;
2521:       current_space_lvl->array           += nzk;
2522:       current_space_lvl->local_used      += nzk;
2523:       current_space_lvl->local_remaining -= nzk;

2525:       ui[k+1] = ui[k] + nzk;
2526:     }

2528:     ISRestoreIndices(perm,&rip);
2529:     PetscFree4(uj_ptr,uj_lvl_ptr,il,jl);

2531:     /* destroy list of free space and other temporary array(s) */
2532:     PetscMalloc1(ui[am]+1,&uj);
2533:     PetscFreeSpaceContiguous_Cholesky(&free_space,uj,am,ui,udiag); /* store matrix factor  */
2534:     PetscIncompleteLLDestroy(lnk,lnkbt);
2535:     PetscFreeSpaceDestroy(free_space_lvl);

2537:   } /* end of case: levels>0 || (levels=0 && !perm_identity) */

2539:   /* put together the new matrix in MATSEQSBAIJ format */
2540:   MatSeqSBAIJSetPreallocation(fact,bs,MAT_SKIP_ALLOCATION,NULL);

2542:   b    = (Mat_SeqSBAIJ*)(fact)->data;
2543:   PetscFree2(b->imax,b->ilen);

2545:   b->singlemalloc = PETSC_FALSE;
2546:   b->free_a       = PETSC_TRUE;
2547:   b->free_ij      = free_ij;

2549:   PetscMalloc1(ui[am]+1,&b->a);

2551:   b->j         = uj;
2552:   b->i         = ui;
2553:   b->diag      = udiag;
2554:   b->free_diag = PETSC_TRUE;
2555:   b->ilen      = NULL;
2556:   b->imax      = NULL;
2557:   b->row       = perm;
2558:   b->col       = perm;

2560:   PetscObjectReference((PetscObject)perm);
2561:   PetscObjectReference((PetscObject)perm);

2563:   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */

2565:   PetscMalloc1(am+1,&b->solve_work);
2566:   PetscLogObjectMemory((PetscObject)fact,ui[am]*(sizeof(PetscInt)+sizeof(MatScalar)));

2568:   b->maxnz = b->nz = ui[am];

2570:   fact->info.factor_mallocs   = reallocs;
2571:   fact->info.fill_ratio_given = fill;
2572:   if (ai[am] != 0) {
2573:     fact->info.fill_ratio_needed = ((PetscReal)ui[am])/ai[am];
2574:   } else {
2575:     fact->info.fill_ratio_needed = 0.0;
2576:   }
2577: #if defined(PETSC_USE_INFO)
2578:   if (ai[am] != 0) {
2579:     PetscReal af = fact->info.fill_ratio_needed;
2580:     PetscInfo3(A,"Reallocs %D Fill ratio:given %g needed %g\n",reallocs,(double)fill,(double)af);
2581:     PetscInfo1(A,"Run with -pc_factor_fill %g or use \n",(double)af);
2582:     PetscInfo1(A,"PCFactorSetFill(pc,%g) for best performance.\n",(double)af);
2583:   } else {
2584:     PetscInfo(A,"Empty matrix\n");
2585:   }
2586: #endif
2587:   fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
2588:   return(0);
2589: }

2591: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_inplace(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
2592: {
2593:   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data;
2594:   Mat_SeqSBAIJ       *b;
2595:   PetscErrorCode     ierr;
2596:   PetscBool          perm_identity,free_ij = PETSC_TRUE;
2597:   PetscInt           bs=A->rmap->bs,am=a->mbs;
2598:   const PetscInt     *cols,*rip,*ai=a->i,*aj=a->j;
2599:   PetscInt           reallocs=0,i,*ui;
2600:   PetscInt           jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow;
2601:   PetscInt           nlnk,*lnk,*lnk_lvl=NULL,ncols,*cols_lvl,*uj,**uj_ptr,**uj_lvl_ptr;
2602:   PetscReal          fill          =info->fill,levels=info->levels,ratio_needed;
2603:   PetscFreeSpaceList free_space    =NULL,current_space=NULL;
2604:   PetscFreeSpaceList free_space_lvl=NULL,current_space_lvl=NULL;
2605:   PetscBT            lnkbt;

2608:   /*
2609:    This code originally uses Modified Sparse Row (MSR) storage
2610:    (see page 85, "Iterative Methods ..." by Saad) for the output matrix B - bad choice!
2611:    Then it is rewritten so the factor B takes seqsbaij format. However the associated
2612:    MatCholeskyFactorNumeric_() have not been modified for the cases of bs>1,
2613:    thus the original code in MSR format is still used for these cases.
2614:    The code below should replace MatICCFactorSymbolic_SeqSBAIJ_MSR() whenever
2615:    MatCholeskyFactorNumeric_() is modified for using sbaij symbolic factor.
2616:   */
2617:   if (bs > 1) {
2618:     MatICCFactorSymbolic_SeqSBAIJ_MSR(fact,A,perm,info);
2619:     return(0);
2620:   }

2622:   /* check whether perm is the identity mapping */
2623:   ISIdentity(perm,&perm_identity);
2624:   if (!perm_identity) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
2625:   a->permute = PETSC_FALSE;

2627:   /* special case that simply copies fill pattern */
2628:   if (!levels) {
2629:     /* reuse the column pointers and row offsets for memory savings */
2630:     ui           = a->i;
2631:     uj           = a->j;
2632:     free_ij      = PETSC_FALSE;
2633:     ratio_needed = 1.0;
2634:   } else { /* case: levels>0 */
2635:     ISGetIndices(perm,&rip);

2637:     /* initialization */
2638:     PetscMalloc1(am+1,&ui);
2639:     ui[0] = 0;

2641:     /* jl: linked list for storing indices of the pivot rows
2642:        il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
2643:     PetscMalloc4(am,&uj_ptr,am,&uj_lvl_ptr,am,&il,am,&jl);
2644:     for (i=0; i<am; i++) {
2645:       jl[i] = am; il[i] = 0;
2646:     }

2648:     /* create and initialize a linked list for storing column indices of the active row k */
2649:     nlnk = am + 1;
2650:     PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt);

2652:     /* initial FreeSpace size is fill*(ai[am]+1) */
2653:     PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space);

2655:     current_space = free_space;

2657:     PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space_lvl);

2659:     current_space_lvl = free_space_lvl;

2661:     for (k=0; k<am; k++) {  /* for each active row k */
2662:       /* initialize lnk by the column indices of row rip[k] */
2663:       nzk   = 0;
2664:       ncols = ai[rip[k]+1] - ai[rip[k]];
2665:       cols  = aj+ai[rip[k]];
2666:       PetscIncompleteLLInit(ncols,cols,am,rip,nlnk,lnk,lnk_lvl,lnkbt);
2667:       nzk  += nlnk;

2669:       /* update lnk by computing fill-in for each pivot row to be merged in */
2670:       prow = jl[k]; /* 1st pivot row */

2672:       while (prow < k) {
2673:         nextprow = jl[prow];

2675:         /* merge prow into k-th row */
2676:         jmin     = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
2677:         jmax     = ui[prow+1];
2678:         ncols    = jmax-jmin;
2679:         i        = jmin - ui[prow];
2680:         cols     = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
2681:         j        = *(uj_lvl_ptr[prow] + i - 1);
2682:         cols_lvl = uj_lvl_ptr[prow]+i;
2683:         PetscICCLLAddSorted(ncols,cols,levels,cols_lvl,am,nlnk,lnk,lnk_lvl,lnkbt,j);
2684:         nzk     += nlnk;

2686:         /* update il and jl for prow */
2687:         if (jmin < jmax) {
2688:           il[prow] = jmin;
2689:           j        = *cols;
2690:           jl[prow] = jl[j];
2691:           jl[j]    = prow;
2692:         }
2693:         prow = nextprow;
2694:       }

2696:       /* if free space is not available, make more free space */
2697:       if (current_space->local_remaining<nzk) {
2698:         i    = am - k + 1; /* num of unfactored rows */
2699:         i    = PetscMin(PetscIntMultTruncate(i,nzk), PetscIntMultTruncate(i,i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
2700:         PetscFreeSpaceGet(i,&current_space);
2701:         PetscFreeSpaceGet(i,&current_space_lvl);
2702:         reallocs++;
2703:       }

2705:       /* copy data into free_space and free_space_lvl, then initialize lnk */
2706:       PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);

2708:       /* add the k-th row into il and jl */
2709:       if (nzk-1 > 0) {
2710:         i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
2711:         jl[k] = jl[i]; jl[i] = k;
2712:         il[k] = ui[k] + 1;
2713:       }
2714:       uj_ptr[k]     = current_space->array;
2715:       uj_lvl_ptr[k] = current_space_lvl->array;

2717:       current_space->array               += nzk;
2718:       current_space->local_used          += nzk;
2719:       current_space->local_remaining     -= nzk;
2720:       current_space_lvl->array           += nzk;
2721:       current_space_lvl->local_used      += nzk;
2722:       current_space_lvl->local_remaining -= nzk;

2724:       ui[k+1] = ui[k] + nzk;
2725:     }

2727:     ISRestoreIndices(perm,&rip);
2728:     PetscFree4(uj_ptr,uj_lvl_ptr,il,jl);

2730:     /* destroy list of free space and other temporary array(s) */
2731:     PetscMalloc1(ui[am]+1,&uj);
2732:     PetscFreeSpaceContiguous(&free_space,uj);
2733:     PetscIncompleteLLDestroy(lnk,lnkbt);
2734:     PetscFreeSpaceDestroy(free_space_lvl);
2735:     if (ai[am] != 0) {
2736:       ratio_needed = ((PetscReal)ui[am])/((PetscReal)ai[am]);
2737:     } else {
2738:       ratio_needed = 0.0;
2739:     }
2740:   } /* end of case: levels>0 || (levels=0 && !perm_identity) */

2742:   /* put together the new matrix in MATSEQSBAIJ format */
2743:   MatSeqSBAIJSetPreallocation(fact,bs,MAT_SKIP_ALLOCATION,NULL);

2745:   b = (Mat_SeqSBAIJ*)(fact)->data;

2747:   PetscFree2(b->imax,b->ilen);

2749:   b->singlemalloc = PETSC_FALSE;
2750:   b->free_a       = PETSC_TRUE;
2751:   b->free_ij      = free_ij;

2753:   PetscMalloc1(ui[am]+1,&b->a);

2755:   b->j             = uj;
2756:   b->i             = ui;
2757:   b->diag          = NULL;
2758:   b->ilen          = NULL;
2759:   b->imax          = NULL;
2760:   b->row           = perm;
2761:   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */

2763:   PetscObjectReference((PetscObject)perm);
2764:   b->icol = perm;
2765:   PetscObjectReference((PetscObject)perm);
2766:   PetscMalloc1(am+1,&b->solve_work);

2768:   b->maxnz = b->nz = ui[am];

2770:   fact->info.factor_mallocs    = reallocs;
2771:   fact->info.fill_ratio_given  = fill;
2772:   fact->info.fill_ratio_needed = ratio_needed;
2773: #if defined(PETSC_USE_INFO)
2774:   if (ai[am] != 0) {
2775:     PetscReal af = fact->info.fill_ratio_needed;
2776:     PetscInfo3(A,"Reallocs %D Fill ratio:given %g needed %g\n",reallocs,(double)fill,(double)af);
2777:     PetscInfo1(A,"Run with -pc_factor_fill %g or use \n",(double)af);
2778:     PetscInfo1(A,"PCFactorSetFill(pc,%g) for best performance.\n",(double)af);
2779:   } else {
2780:     PetscInfo(A,"Empty matrix\n");
2781:   }
2782: #endif
2783:   if (perm_identity) {
2784:     fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering_inplace;
2785:   } else {
2786:     fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_inplace;
2787:   }
2788:   return(0);
2789: }