Actual source code: dvec2.c
1: /*
2: Defines some vector operation functions that are shared by
3: sequential and parallel vectors.
4: */
5: #include <../src/vec/vec/impls/dvecimpl.h>
6: #include <petsc/private/kernels/petscaxpy.h>
8: #if defined(PETSC_USE_FORTRAN_KERNEL_MDOT)
9: #include <../src/vec/vec/impls/seq/ftn-kernels/fmdot.h>
10: PetscErrorCode VecMDot_Seq(Vec xin, PetscInt nv, const Vec yin[], PetscScalar *z)
11: {
12: const PetscInt n = xin->map->n;
13: PetscInt i = nv, nv_rem = nv & 0x3;
14: PetscScalar sum0 = 0., sum1 = 0., sum2 = 0., sum3;
15: const PetscScalar *yy0, *yy1, *yy2, *yy3, *x;
16: Vec *yy = (Vec *)yin;
18: PetscFunctionBegin;
19: PetscCall(VecGetArrayRead(xin, &x));
20: switch (nv_rem) {
21: case 3:
22: PetscCall(VecGetArrayRead(yy[0], &yy0));
23: PetscCall(VecGetArrayRead(yy[1], &yy1));
24: PetscCall(VecGetArrayRead(yy[2], &yy2));
25: fortranmdot3_(x, yy0, yy1, yy2, &n, &sum0, &sum1, &sum2);
26: PetscCall(VecRestoreArrayRead(yy[0], &yy0));
27: PetscCall(VecRestoreArrayRead(yy[1], &yy1));
28: PetscCall(VecRestoreArrayRead(yy[2], &yy2));
29: z[0] = sum0;
30: z[1] = sum1;
31: z[2] = sum2;
32: break;
33: case 2:
34: PetscCall(VecGetArrayRead(yy[0], &yy0));
35: PetscCall(VecGetArrayRead(yy[1], &yy1));
36: fortranmdot2_(x, yy0, yy1, &n, &sum0, &sum1);
37: PetscCall(VecRestoreArrayRead(yy[0], &yy0));
38: PetscCall(VecRestoreArrayRead(yy[1], &yy1));
39: z[0] = sum0;
40: z[1] = sum1;
41: break;
42: case 1:
43: PetscCall(VecGetArrayRead(yy[0], &yy0));
44: fortranmdot1_(x, yy0, &n, &sum0);
45: PetscCall(VecRestoreArrayRead(yy[0], &yy0));
46: z[0] = sum0;
47: break;
48: case 0:
49: break;
50: }
51: z += nv_rem;
52: i -= nv_rem;
53: yy += nv_rem;
55: while (i > 0) {
56: sum0 = 0.;
57: sum1 = 0.;
58: sum2 = 0.;
59: sum3 = 0.;
60: PetscCall(VecGetArrayRead(yy[0], &yy0));
61: PetscCall(VecGetArrayRead(yy[1], &yy1));
62: PetscCall(VecGetArrayRead(yy[2], &yy2));
63: PetscCall(VecGetArrayRead(yy[3], &yy3));
64: fortranmdot4_(x, yy0, yy1, yy2, yy3, &n, &sum0, &sum1, &sum2, &sum3);
65: PetscCall(VecRestoreArrayRead(yy[0], &yy0));
66: PetscCall(VecRestoreArrayRead(yy[1], &yy1));
67: PetscCall(VecRestoreArrayRead(yy[2], &yy2));
68: PetscCall(VecRestoreArrayRead(yy[3], &yy3));
69: yy += 4;
70: z[0] = sum0;
71: z[1] = sum1;
72: z[2] = sum2;
73: z[3] = sum3;
74: z += 4;
75: i -= 4;
76: }
77: PetscCall(VecRestoreArrayRead(xin, &x));
78: PetscCall(PetscLogFlops(PetscMax(nv * (2.0 * n - 1), 0.0)));
79: PetscFunctionReturn(PETSC_SUCCESS);
80: }
82: #else
83: PetscErrorCode VecMDot_Seq(Vec xin, PetscInt nv, const Vec yin[], PetscScalar *z)
84: {
85: const PetscInt n = xin->map->n;
86: PetscInt i = nv, j = n, nv_rem = nv & 0x3, j_rem;
87: PetscScalar sum0 = 0.0, sum1 = 0.0, sum2 = 0.0, sum3, x0, x1, x2, x3;
88: const PetscScalar *yy0, *yy1, *yy2, *yy3, *x, *xbase;
89: const Vec *yy = (Vec *)yin;
91: PetscFunctionBegin;
92: if (n == 0) {
93: PetscCall(PetscArrayzero(z, nv));
94: PetscFunctionReturn(PETSC_SUCCESS);
95: }
96: PetscCall(VecGetArrayRead(xin, &xbase));
97: x = xbase;
98: switch (nv_rem) {
99: case 3:
100: PetscCall(VecGetArrayRead(yy[0], &yy0));
101: PetscCall(VecGetArrayRead(yy[1], &yy1));
102: PetscCall(VecGetArrayRead(yy[2], &yy2));
103: switch (j_rem = j & 0x3) {
104: case 3:
105: x2 = x[2];
106: sum0 += x2 * PetscConj(yy0[2]);
107: sum1 += x2 * PetscConj(yy1[2]);
108: sum2 += x2 * PetscConj(yy2[2]); /* fall through */
109: case 2:
110: x1 = x[1];
111: sum0 += x1 * PetscConj(yy0[1]);
112: sum1 += x1 * PetscConj(yy1[1]);
113: sum2 += x1 * PetscConj(yy2[1]); /* fall through */
114: case 1:
115: x0 = x[0];
116: sum0 += x0 * PetscConj(yy0[0]);
117: sum1 += x0 * PetscConj(yy1[0]);
118: sum2 += x0 * PetscConj(yy2[0]); /* fall through */
119: case 0:
120: x += j_rem;
121: yy0 += j_rem;
122: yy1 += j_rem;
123: yy2 += j_rem;
124: j -= j_rem;
125: break;
126: }
127: while (j > 0) {
128: x0 = x[0];
129: x1 = x[1];
130: x2 = x[2];
131: x3 = x[3];
132: x += 4;
134: sum0 += x0 * PetscConj(yy0[0]) + x1 * PetscConj(yy0[1]) + x2 * PetscConj(yy0[2]) + x3 * PetscConj(yy0[3]);
135: yy0 += 4;
136: sum1 += x0 * PetscConj(yy1[0]) + x1 * PetscConj(yy1[1]) + x2 * PetscConj(yy1[2]) + x3 * PetscConj(yy1[3]);
137: yy1 += 4;
138: sum2 += x0 * PetscConj(yy2[0]) + x1 * PetscConj(yy2[1]) + x2 * PetscConj(yy2[2]) + x3 * PetscConj(yy2[3]);
139: yy2 += 4;
140: j -= 4;
141: }
142: z[0] = sum0;
143: z[1] = sum1;
144: z[2] = sum2;
145: PetscCall(VecRestoreArrayRead(yy[0], &yy0));
146: PetscCall(VecRestoreArrayRead(yy[1], &yy1));
147: PetscCall(VecRestoreArrayRead(yy[2], &yy2));
148: break;
149: case 2:
150: PetscCall(VecGetArrayRead(yy[0], &yy0));
151: PetscCall(VecGetArrayRead(yy[1], &yy1));
152: switch (j_rem = j & 0x3) {
153: case 3:
154: x2 = x[2];
155: sum0 += x2 * PetscConj(yy0[2]);
156: sum1 += x2 * PetscConj(yy1[2]); /* fall through */
157: case 2:
158: x1 = x[1];
159: sum0 += x1 * PetscConj(yy0[1]);
160: sum1 += x1 * PetscConj(yy1[1]); /* fall through */
161: case 1:
162: x0 = x[0];
163: sum0 += x0 * PetscConj(yy0[0]);
164: sum1 += x0 * PetscConj(yy1[0]); /* fall through */
165: case 0:
166: x += j_rem;
167: yy0 += j_rem;
168: yy1 += j_rem;
169: j -= j_rem;
170: break;
171: }
172: while (j > 0) {
173: x0 = x[0];
174: x1 = x[1];
175: x2 = x[2];
176: x3 = x[3];
177: x += 4;
179: sum0 += x0 * PetscConj(yy0[0]) + x1 * PetscConj(yy0[1]) + x2 * PetscConj(yy0[2]) + x3 * PetscConj(yy0[3]);
180: yy0 += 4;
181: sum1 += x0 * PetscConj(yy1[0]) + x1 * PetscConj(yy1[1]) + x2 * PetscConj(yy1[2]) + x3 * PetscConj(yy1[3]);
182: yy1 += 4;
183: j -= 4;
184: }
185: z[0] = sum0;
186: z[1] = sum1;
188: PetscCall(VecRestoreArrayRead(yy[0], &yy0));
189: PetscCall(VecRestoreArrayRead(yy[1], &yy1));
190: break;
191: case 1:
192: PetscCall(VecGetArrayRead(yy[0], &yy0));
193: switch (j_rem = j & 0x3) {
194: case 3:
195: x2 = x[2];
196: sum0 += x2 * PetscConj(yy0[2]); /* fall through */
197: case 2:
198: x1 = x[1];
199: sum0 += x1 * PetscConj(yy0[1]); /* fall through */
200: case 1:
201: x0 = x[0];
202: sum0 += x0 * PetscConj(yy0[0]); /* fall through */
203: case 0:
204: x += j_rem;
205: yy0 += j_rem;
206: j -= j_rem;
207: break;
208: }
209: while (j > 0) {
210: sum0 += x[0] * PetscConj(yy0[0]) + x[1] * PetscConj(yy0[1]) + x[2] * PetscConj(yy0[2]) + x[3] * PetscConj(yy0[3]);
211: yy0 += 4;
212: j -= 4;
213: x += 4;
214: }
215: z[0] = sum0;
217: PetscCall(VecRestoreArrayRead(yy[0], &yy0));
218: break;
219: case 0:
220: break;
221: }
222: z += nv_rem;
223: i -= nv_rem;
224: yy += nv_rem;
226: while (i > 0) {
227: sum0 = 0.;
228: sum1 = 0.;
229: sum2 = 0.;
230: sum3 = 0.;
231: PetscCall(VecGetArrayRead(yy[0], &yy0));
232: PetscCall(VecGetArrayRead(yy[1], &yy1));
233: PetscCall(VecGetArrayRead(yy[2], &yy2));
234: PetscCall(VecGetArrayRead(yy[3], &yy3));
236: j = n;
237: x = xbase;
238: switch (j_rem = j & 0x3) {
239: case 3:
240: x2 = x[2];
241: sum0 += x2 * PetscConj(yy0[2]);
242: sum1 += x2 * PetscConj(yy1[2]);
243: sum2 += x2 * PetscConj(yy2[2]);
244: sum3 += x2 * PetscConj(yy3[2]); /* fall through */
245: case 2:
246: x1 = x[1];
247: sum0 += x1 * PetscConj(yy0[1]);
248: sum1 += x1 * PetscConj(yy1[1]);
249: sum2 += x1 * PetscConj(yy2[1]);
250: sum3 += x1 * PetscConj(yy3[1]); /* fall through */
251: case 1:
252: x0 = x[0];
253: sum0 += x0 * PetscConj(yy0[0]);
254: sum1 += x0 * PetscConj(yy1[0]);
255: sum2 += x0 * PetscConj(yy2[0]);
256: sum3 += x0 * PetscConj(yy3[0]); /* fall through */
257: case 0:
258: x += j_rem;
259: yy0 += j_rem;
260: yy1 += j_rem;
261: yy2 += j_rem;
262: yy3 += j_rem;
263: j -= j_rem;
264: break;
265: }
266: while (j > 0) {
267: x0 = x[0];
268: x1 = x[1];
269: x2 = x[2];
270: x3 = x[3];
271: x += 4;
273: sum0 += x0 * PetscConj(yy0[0]) + x1 * PetscConj(yy0[1]) + x2 * PetscConj(yy0[2]) + x3 * PetscConj(yy0[3]);
274: yy0 += 4;
275: sum1 += x0 * PetscConj(yy1[0]) + x1 * PetscConj(yy1[1]) + x2 * PetscConj(yy1[2]) + x3 * PetscConj(yy1[3]);
276: yy1 += 4;
277: sum2 += x0 * PetscConj(yy2[0]) + x1 * PetscConj(yy2[1]) + x2 * PetscConj(yy2[2]) + x3 * PetscConj(yy2[3]);
278: yy2 += 4;
279: sum3 += x0 * PetscConj(yy3[0]) + x1 * PetscConj(yy3[1]) + x2 * PetscConj(yy3[2]) + x3 * PetscConj(yy3[3]);
280: yy3 += 4;
281: j -= 4;
282: }
283: z[0] = sum0;
284: z[1] = sum1;
285: z[2] = sum2;
286: z[3] = sum3;
287: z += 4;
288: i -= 4;
289: PetscCall(VecRestoreArrayRead(yy[0], &yy0));
290: PetscCall(VecRestoreArrayRead(yy[1], &yy1));
291: PetscCall(VecRestoreArrayRead(yy[2], &yy2));
292: PetscCall(VecRestoreArrayRead(yy[3], &yy3));
293: yy += 4;
294: }
295: PetscCall(VecRestoreArrayRead(xin, &xbase));
296: PetscCall(PetscLogFlops(PetscMax(nv * (2.0 * n - 1), 0.0)));
297: PetscFunctionReturn(PETSC_SUCCESS);
298: }
299: #endif
301: PetscErrorCode VecMTDot_Seq(Vec xin, PetscInt nv, const Vec yin[], PetscScalar *z)
302: {
303: const PetscInt n = xin->map->n;
304: PetscInt i = nv, j = n, nv_rem = nv & 0x3, j_rem;
305: PetscScalar sum0 = 0., sum1 = 0., sum2 = 0., sum3, x0, x1, x2, x3;
306: const PetscScalar *yy0, *yy1, *yy2, *yy3, *x, *xbase;
307: const Vec *yy = (Vec *)yin;
309: PetscFunctionBegin;
310: PetscCall(VecGetArrayRead(xin, &xbase));
311: x = xbase;
313: switch (nv_rem) {
314: case 3:
315: PetscCall(VecGetArrayRead(yy[0], &yy0));
316: PetscCall(VecGetArrayRead(yy[1], &yy1));
317: PetscCall(VecGetArrayRead(yy[2], &yy2));
318: switch (j_rem = j & 0x3) {
319: case 3:
320: x2 = x[2];
321: sum0 += x2 * yy0[2];
322: sum1 += x2 * yy1[2];
323: sum2 += x2 * yy2[2]; /* fall through */
324: case 2:
325: x1 = x[1];
326: sum0 += x1 * yy0[1];
327: sum1 += x1 * yy1[1];
328: sum2 += x1 * yy2[1]; /* fall through */
329: case 1:
330: x0 = x[0];
331: sum0 += x0 * yy0[0];
332: sum1 += x0 * yy1[0];
333: sum2 += x0 * yy2[0]; /* fall through */
334: case 0:
335: x += j_rem;
336: yy0 += j_rem;
337: yy1 += j_rem;
338: yy2 += j_rem;
339: j -= j_rem;
340: break;
341: }
342: while (j > 0) {
343: x0 = x[0];
344: x1 = x[1];
345: x2 = x[2];
346: x3 = x[3];
347: x += 4;
349: sum0 += x0 * yy0[0] + x1 * yy0[1] + x2 * yy0[2] + x3 * yy0[3];
350: yy0 += 4;
351: sum1 += x0 * yy1[0] + x1 * yy1[1] + x2 * yy1[2] + x3 * yy1[3];
352: yy1 += 4;
353: sum2 += x0 * yy2[0] + x1 * yy2[1] + x2 * yy2[2] + x3 * yy2[3];
354: yy2 += 4;
355: j -= 4;
356: }
357: z[0] = sum0;
358: z[1] = sum1;
359: z[2] = sum2;
360: PetscCall(VecRestoreArrayRead(yy[0], &yy0));
361: PetscCall(VecRestoreArrayRead(yy[1], &yy1));
362: PetscCall(VecRestoreArrayRead(yy[2], &yy2));
363: break;
364: case 2:
365: PetscCall(VecGetArrayRead(yy[0], &yy0));
366: PetscCall(VecGetArrayRead(yy[1], &yy1));
367: switch (j_rem = j & 0x3) {
368: case 3:
369: x2 = x[2];
370: sum0 += x2 * yy0[2];
371: sum1 += x2 * yy1[2]; /* fall through */
372: case 2:
373: x1 = x[1];
374: sum0 += x1 * yy0[1];
375: sum1 += x1 * yy1[1]; /* fall through */
376: case 1:
377: x0 = x[0];
378: sum0 += x0 * yy0[0];
379: sum1 += x0 * yy1[0]; /* fall through */
380: case 0:
381: x += j_rem;
382: yy0 += j_rem;
383: yy1 += j_rem;
384: j -= j_rem;
385: break;
386: }
387: while (j > 0) {
388: x0 = x[0];
389: x1 = x[1];
390: x2 = x[2];
391: x3 = x[3];
392: x += 4;
394: sum0 += x0 * yy0[0] + x1 * yy0[1] + x2 * yy0[2] + x3 * yy0[3];
395: yy0 += 4;
396: sum1 += x0 * yy1[0] + x1 * yy1[1] + x2 * yy1[2] + x3 * yy1[3];
397: yy1 += 4;
398: j -= 4;
399: }
400: z[0] = sum0;
401: z[1] = sum1;
403: PetscCall(VecRestoreArrayRead(yy[0], &yy0));
404: PetscCall(VecRestoreArrayRead(yy[1], &yy1));
405: break;
406: case 1:
407: PetscCall(VecGetArrayRead(yy[0], &yy0));
408: switch (j_rem = j & 0x3) {
409: case 3:
410: x2 = x[2];
411: sum0 += x2 * yy0[2]; /* fall through */
412: case 2:
413: x1 = x[1];
414: sum0 += x1 * yy0[1]; /* fall through */
415: case 1:
416: x0 = x[0];
417: sum0 += x0 * yy0[0]; /* fall through */
418: case 0:
419: x += j_rem;
420: yy0 += j_rem;
421: j -= j_rem;
422: break;
423: }
424: while (j > 0) {
425: sum0 += x[0] * yy0[0] + x[1] * yy0[1] + x[2] * yy0[2] + x[3] * yy0[3];
426: yy0 += 4;
427: j -= 4;
428: x += 4;
429: }
430: z[0] = sum0;
432: PetscCall(VecRestoreArrayRead(yy[0], &yy0));
433: break;
434: case 0:
435: break;
436: }
437: z += nv_rem;
438: i -= nv_rem;
439: yy += nv_rem;
441: while (i > 0) {
442: sum0 = 0.;
443: sum1 = 0.;
444: sum2 = 0.;
445: sum3 = 0.;
446: PetscCall(VecGetArrayRead(yy[0], &yy0));
447: PetscCall(VecGetArrayRead(yy[1], &yy1));
448: PetscCall(VecGetArrayRead(yy[2], &yy2));
449: PetscCall(VecGetArrayRead(yy[3], &yy3));
450: x = xbase;
452: j = n;
453: switch (j_rem = j & 0x3) {
454: case 3:
455: x2 = x[2];
456: sum0 += x2 * yy0[2];
457: sum1 += x2 * yy1[2];
458: sum2 += x2 * yy2[2];
459: sum3 += x2 * yy3[2]; /* fall through */
460: case 2:
461: x1 = x[1];
462: sum0 += x1 * yy0[1];
463: sum1 += x1 * yy1[1];
464: sum2 += x1 * yy2[1];
465: sum3 += x1 * yy3[1]; /* fall through */
466: case 1:
467: x0 = x[0];
468: sum0 += x0 * yy0[0];
469: sum1 += x0 * yy1[0];
470: sum2 += x0 * yy2[0];
471: sum3 += x0 * yy3[0]; /* fall through */
472: case 0:
473: x += j_rem;
474: yy0 += j_rem;
475: yy1 += j_rem;
476: yy2 += j_rem;
477: yy3 += j_rem;
478: j -= j_rem;
479: break;
480: }
481: while (j > 0) {
482: x0 = x[0];
483: x1 = x[1];
484: x2 = x[2];
485: x3 = x[3];
486: x += 4;
488: sum0 += x0 * yy0[0] + x1 * yy0[1] + x2 * yy0[2] + x3 * yy0[3];
489: yy0 += 4;
490: sum1 += x0 * yy1[0] + x1 * yy1[1] + x2 * yy1[2] + x3 * yy1[3];
491: yy1 += 4;
492: sum2 += x0 * yy2[0] + x1 * yy2[1] + x2 * yy2[2] + x3 * yy2[3];
493: yy2 += 4;
494: sum3 += x0 * yy3[0] + x1 * yy3[1] + x2 * yy3[2] + x3 * yy3[3];
495: yy3 += 4;
496: j -= 4;
497: }
498: z[0] = sum0;
499: z[1] = sum1;
500: z[2] = sum2;
501: z[3] = sum3;
502: z += 4;
503: i -= 4;
504: PetscCall(VecRestoreArrayRead(yy[0], &yy0));
505: PetscCall(VecRestoreArrayRead(yy[1], &yy1));
506: PetscCall(VecRestoreArrayRead(yy[2], &yy2));
507: PetscCall(VecRestoreArrayRead(yy[3], &yy3));
508: yy += 4;
509: }
510: PetscCall(VecRestoreArrayRead(xin, &xbase));
511: PetscCall(PetscLogFlops(PetscMax(nv * (2.0 * n - 1), 0.0)));
512: PetscFunctionReturn(PETSC_SUCCESS);
513: }
515: static PetscErrorCode VecMultiDot_Seq_GEMV(PetscBool conjugate, Vec xin, PetscInt nv, const Vec yin[], PetscScalar *z)
516: {
517: PetscInt i, j, nfail;
518: const PetscScalar *xarray, *yarray, *yfirst, *ynext;
519: PetscBool stop = PETSC_FALSE;
520: const char *trans = conjugate ? "C" : "T";
521: PetscInt64 lda = 0;
522: PetscBLASInt n, m;
524: PetscFunctionBegin;
525: if (xin->map->n == 0) { // otherwise BLAS complains illegal values (0) for lda
526: PetscCall(PetscArrayzero(z, nv));
527: PetscFunctionReturn(PETSC_SUCCESS);
528: }
530: // caller guarantees xin->map->n <= PETSC_BLAS_INT_MAX
531: PetscCall(PetscBLASIntCast(xin->map->n, &n));
532: PetscCall(VecGetArrayRead(xin, &xarray));
533: i = nfail = 0;
534: while (i < nv) {
535: stop = PETSC_FALSE;
536: PetscCall(VecGetArrayRead(yin[i], &yfirst));
537: yarray = yfirst;
538: for (j = i + 1; j < nv; j++) {
539: PetscCall(VecGetArrayRead(yin[j], &ynext));
540: if (j == i + 1) {
541: lda = ynext - yarray;
542: if (lda < 0 || lda > PETSC_BLAS_INT_MAX || lda - n > 64) stop = PETSC_TRUE;
543: } else if (lda * (j - i) != ynext - yarray) { // not in the same stride? if so, stop here
544: stop = PETSC_TRUE;
545: }
546: PetscCall(VecRestoreArrayRead(yin[j], &ynext));
547: if (stop) break;
548: }
549: PetscCall(VecRestoreArrayRead(yin[i], &yfirst));
551: // we found m vectors yin[i..j)
552: PetscCall(PetscBLASIntCast(j - i, &m));
553: if (m > 1) {
554: PetscBLASInt ione = 1, lda2 = (PetscBLASInt)lda; // the cast is safe since we've screened out those lda > PETSC_BLAS_INT_MAX above
555: PetscScalar one = 1, zero = 0;
557: PetscCallBLAS("BLASgemv", BLASgemv_(trans, &n, &m, &one, yarray, &lda2, xarray, &ione, &zero, z + i, &ione));
558: PetscCall(PetscLogFlops(PetscMax(m * (2.0 * n - 1), 0.0)));
559: } else {
560: if (nfail == 0) {
561: if (conjugate) PetscCall(VecDot_Seq(xin, yin[i], z + i));
562: else PetscCall(VecTDot_Seq(xin, yin[i], z + i));
563: nfail++;
564: } else break;
565: }
566: i = j;
567: }
568: PetscCall(VecRestoreArrayRead(xin, &xarray));
569: if (i < nv) { // finish the remaining if any
570: if (conjugate) PetscCall(VecMDot_Seq(xin, nv - i, yin + i, z + i));
571: else PetscCall(VecMTDot_Seq(xin, nv - i, yin + i, z + i));
572: }
573: PetscFunctionReturn(PETSC_SUCCESS);
574: }
576: PetscErrorCode VecMDot_Seq_GEMV(Vec xin, PetscInt nv, const Vec yin[], PetscScalar *z)
577: {
578: PetscFunctionBegin;
579: if (xin->map->n > PETSC_BLAS_INT_MAX) PetscCall(VecMDot_Seq(xin, nv, yin, z));
580: else PetscCall(VecMultiDot_Seq_GEMV(PETSC_TRUE, xin, nv, yin, z));
581: PetscFunctionReturn(PETSC_SUCCESS);
582: }
584: PetscErrorCode VecMTDot_Seq_GEMV(Vec xin, PetscInt nv, const Vec yin[], PetscScalar *z)
585: {
586: PetscFunctionBegin;
587: if (xin->map->n > PETSC_BLAS_INT_MAX) PetscCall(VecMTDot_Seq(xin, nv, yin, z));
588: else PetscCall(VecMultiDot_Seq_GEMV(PETSC_FALSE, xin, nv, yin, z));
589: PetscFunctionReturn(PETSC_SUCCESS);
590: }
592: static PetscErrorCode VecMinMax_Seq(Vec xin, PetscInt *idx, PetscReal *z, PetscReal minmax, int (*const cmp)(PetscReal, PetscReal))
593: {
594: const PetscInt n = xin->map->n;
595: PetscInt j = -1;
597: PetscFunctionBegin;
598: if (n) {
599: const PetscScalar *xx;
601: PetscCall(VecGetArrayRead(xin, &xx));
602: minmax = PetscRealPart(xx[(j = 0)]);
603: for (PetscInt i = 1; i < n; ++i) {
604: const PetscReal tmp = PetscRealPart(xx[i]);
606: if (cmp(tmp, minmax)) {
607: j = i;
608: minmax = tmp;
609: }
610: }
611: PetscCall(VecRestoreArrayRead(xin, &xx));
612: }
613: *z = minmax;
614: if (idx) *idx = j;
615: PetscFunctionReturn(PETSC_SUCCESS);
616: }
618: static int VecMax_Seq_GT(PetscReal l, PetscReal r)
619: {
620: return l > r;
621: }
623: PetscErrorCode VecMax_Seq(Vec xin, PetscInt *idx, PetscReal *z)
624: {
625: PetscFunctionBegin;
626: PetscCall(VecMinMax_Seq(xin, idx, z, PETSC_MIN_REAL, VecMax_Seq_GT));
627: PetscFunctionReturn(PETSC_SUCCESS);
628: }
630: static int VecMin_Seq_LT(PetscReal l, PetscReal r)
631: {
632: return l < r;
633: }
635: PetscErrorCode VecMin_Seq(Vec xin, PetscInt *idx, PetscReal *z)
636: {
637: PetscFunctionBegin;
638: PetscCall(VecMinMax_Seq(xin, idx, z, PETSC_MAX_REAL, VecMin_Seq_LT));
639: PetscFunctionReturn(PETSC_SUCCESS);
640: }
642: PetscErrorCode VecSet_Seq(Vec xin, PetscScalar alpha)
643: {
644: const PetscInt n = xin->map->n;
645: PetscScalar *xx;
647: PetscFunctionBegin;
648: PetscCall(VecGetArrayWrite(xin, &xx));
649: if (alpha == (PetscScalar)0.0) {
650: PetscCall(PetscArrayzero(xx, n));
651: } else {
652: for (PetscInt i = 0; i < n; i++) xx[i] = alpha;
653: }
654: PetscCall(VecRestoreArrayWrite(xin, &xx));
655: PetscFunctionReturn(PETSC_SUCCESS);
656: }
658: PetscErrorCode VecMAXPY_Seq(Vec xin, PetscInt nv, const PetscScalar *alpha, Vec *y)
659: {
660: const PetscInt j_rem = nv & 0x3, n = xin->map->n;
661: const PetscScalar *yptr[4];
662: PetscScalar *xx;
663: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
664: #pragma disjoint(*xx, **yptr, *aptr)
665: #endif
667: PetscFunctionBegin;
668: PetscCall(PetscLogFlops(nv * 2.0 * n));
669: PetscCall(VecGetArray(xin, &xx));
670: for (PetscInt i = 0; i < j_rem; ++i) PetscCall(VecGetArrayRead(y[i], yptr + i));
671: switch (j_rem) {
672: case 3:
673: PetscKernelAXPY3(xx, alpha[0], alpha[1], alpha[2], yptr[0], yptr[1], yptr[2], n);
674: break;
675: case 2:
676: PetscKernelAXPY2(xx, alpha[0], alpha[1], yptr[0], yptr[1], n);
677: break;
678: case 1:
679: PetscKernelAXPY(xx, alpha[0], yptr[0], n);
680: default:
681: break;
682: }
683: for (PetscInt i = 0; i < j_rem; ++i) PetscCall(VecRestoreArrayRead(y[i], yptr + i));
684: alpha += j_rem;
685: y += j_rem;
686: for (PetscInt j = j_rem, inc = 4; j < nv; j += inc, alpha += inc, y += inc) {
687: for (PetscInt i = 0; i < inc; ++i) PetscCall(VecGetArrayRead(y[i], yptr + i));
688: PetscKernelAXPY4(xx, alpha[0], alpha[1], alpha[2], alpha[3], yptr[0], yptr[1], yptr[2], yptr[3], n);
689: for (PetscInt i = 0; i < inc; ++i) PetscCall(VecRestoreArrayRead(y[i], yptr + i));
690: }
691: PetscCall(VecRestoreArray(xin, &xx));
692: PetscFunctionReturn(PETSC_SUCCESS);
693: }
695: /* y = y + sum alpha[i] x[i] */
696: PetscErrorCode VecMAXPY_Seq_GEMV(Vec yin, PetscInt nv, const PetscScalar alpha[], Vec xin[])
697: {
698: PetscInt i, j, nfail;
699: PetscScalar *yarray;
700: const PetscScalar *xfirst, *xnext, *xarray;
701: PetscBool stop = PETSC_FALSE;
702: PetscInt64 lda = 0;
703: PetscBLASInt n, m;
705: PetscFunctionBegin;
706: if (yin->map->n == 0 || yin->map->n > PETSC_BLAS_INT_MAX) {
707: PetscCall(VecMAXPY_Seq(yin, nv, alpha, xin));
708: PetscFunctionReturn(PETSC_SUCCESS);
709: }
711: PetscCall(PetscBLASIntCast(yin->map->n, &n));
712: PetscCall(VecGetArray(yin, &yarray));
713: i = nfail = 0;
714: while (i < nv) {
715: stop = PETSC_FALSE;
716: PetscCall(VecGetArrayRead(xin[i], &xfirst));
717: xarray = xfirst;
718: for (j = i + 1; j < nv; j++) {
719: PetscCall(VecGetArrayRead(xin[j], &xnext));
720: if (j == i + 1) {
721: lda = xnext - xarray;
722: if (lda < 0 || lda > PETSC_BLAS_INT_MAX || lda - n > 64) stop = PETSC_TRUE;
723: } else if (lda * (j - i) != xnext - xarray) { // not in the same stride? if so, stop here
724: stop = PETSC_TRUE;
725: }
726: PetscCall(VecRestoreArrayRead(xin[j], &xnext));
727: if (stop) break;
728: }
729: PetscCall(VecRestoreArrayRead(xin[i], &xfirst));
731: PetscCall(PetscBLASIntCast(j - i, &m));
732: if (m > 1) {
733: PetscBLASInt incx = 1, incy = 1, lda2 = (PetscBLASInt)lda; // the cast is safe since we've screened out those lda > PETSC_BLAS_INT_MAX above
734: PetscScalar one = 1;
735: PetscCallBLAS("BLASgemv", BLASgemv_("N", &n, &m, &one, xarray, &lda2, alpha + i, &incx, &one, yarray, &incy));
736: PetscCall(PetscLogFlops(m * 2.0 * n));
737: } else {
738: // we only allow falling back on VecAXPY once
739: if (nfail++ == 0) PetscCall(VecAXPY_Seq(yin, alpha[i], xin[i]));
740: else break; // break the while loop
741: }
742: i = j;
743: }
744: PetscCall(VecRestoreArray(yin, &yarray));
746: // finish the remaining if any
747: if (i < nv) PetscCall(VecMAXPY_Seq(yin, nv - i, alpha + i, xin + i));
748: PetscFunctionReturn(PETSC_SUCCESS);
749: }
751: #include <../src/vec/vec/impls/seq/ftn-kernels/faypx.h>
753: PetscErrorCode VecAYPX_Seq(Vec yin, PetscScalar alpha, Vec xin)
754: {
755: PetscFunctionBegin;
756: if (alpha == (PetscScalar)0.0) {
757: PetscCall(VecCopy(xin, yin));
758: } else if (alpha == (PetscScalar)1.0) {
759: PetscCall(VecAXPY_Seq(yin, alpha, xin));
760: } else {
761: const PetscInt n = yin->map->n;
762: const PetscScalar *xx;
763: PetscScalar *yy;
765: PetscCall(VecGetArrayRead(xin, &xx));
766: PetscCall(VecGetArray(yin, &yy));
767: if (alpha == (PetscScalar)-1.0) {
768: for (PetscInt i = 0; i < n; ++i) yy[i] = xx[i] - yy[i];
769: PetscCall(PetscLogFlops(n));
770: } else {
771: #if defined(PETSC_USE_FORTRAN_KERNEL_AYPX)
772: fortranaypx_(&n, &alpha, xx, yy);
773: #else
774: for (PetscInt i = 0; i < n; ++i) yy[i] = xx[i] + alpha * yy[i];
775: #endif
776: PetscCall(PetscLogFlops(2 * n));
777: }
778: PetscCall(VecRestoreArrayRead(xin, &xx));
779: PetscCall(VecRestoreArray(yin, &yy));
780: }
781: PetscFunctionReturn(PETSC_SUCCESS);
782: }
784: #include <../src/vec/vec/impls/seq/ftn-kernels/fwaxpy.h>
785: /*
786: IBM ESSL contains a routine dzaxpy() that is our WAXPY() but it appears
787: to be slower than a regular C loop. Hence,we do not include it.
788: void ?zaxpy(int*,PetscScalar*,PetscScalar*,int*,PetscScalar*,int*,PetscScalar*,int*);
789: */
791: PetscErrorCode VecWAXPY_Seq(Vec win, PetscScalar alpha, Vec xin, Vec yin)
792: {
793: const PetscInt n = win->map->n;
794: const PetscScalar *yy, *xx;
795: PetscScalar *ww;
797: PetscFunctionBegin;
798: PetscCall(VecGetArrayRead(xin, &xx));
799: PetscCall(VecGetArrayRead(yin, &yy));
800: PetscCall(VecGetArray(win, &ww));
801: if (alpha == (PetscScalar)1.0) {
802: PetscCall(PetscLogFlops(n));
803: /* could call BLAS axpy after call to memcopy, but may be slower */
804: for (PetscInt i = 0; i < n; i++) ww[i] = yy[i] + xx[i];
805: } else if (alpha == (PetscScalar)-1.0) {
806: PetscCall(PetscLogFlops(n));
807: for (PetscInt i = 0; i < n; i++) ww[i] = yy[i] - xx[i];
808: } else if (alpha == (PetscScalar)0.0) {
809: PetscCall(PetscArraycpy(ww, yy, n));
810: } else {
811: PetscCall(PetscLogFlops(2.0 * n));
812: #if defined(PETSC_USE_FORTRAN_KERNEL_WAXPY)
813: fortranwaxpy_(&n, &alpha, xx, yy, ww);
814: #else
815: for (PetscInt i = 0; i < n; i++) ww[i] = yy[i] + alpha * xx[i];
816: #endif
817: }
818: PetscCall(VecRestoreArrayRead(xin, &xx));
819: PetscCall(VecRestoreArrayRead(yin, &yy));
820: PetscCall(VecRestoreArray(win, &ww));
821: PetscFunctionReturn(PETSC_SUCCESS);
822: }
824: PetscErrorCode VecMaxPointwiseDivide_Seq(Vec xin, Vec yin, PetscReal *max)
825: {
826: const PetscInt n = xin->map->n;
827: const PetscScalar *xx, *yy;
828: PetscReal m = 0.0;
830: PetscFunctionBegin;
831: PetscCall(VecGetArrayRead(xin, &xx));
832: PetscCall(VecGetArrayRead(yin, &yy));
833: for (PetscInt i = 0; i < n; ++i) {
834: const PetscReal v = PetscAbsScalar(yy[i] == (PetscScalar)0.0 ? xx[i] : xx[i] / yy[i]);
836: // use a separate value to not re-evaluate side-effects
837: m = PetscMax(v, m);
838: }
839: PetscCall(VecRestoreArrayRead(xin, &xx));
840: PetscCall(VecRestoreArrayRead(yin, &yy));
841: PetscCall(PetscLogFlops(n));
842: *max = m;
843: PetscFunctionReturn(PETSC_SUCCESS);
844: }
846: PetscErrorCode VecPlaceArray_Seq(Vec vin, const PetscScalar *a)
847: {
848: Vec_Seq *v = (Vec_Seq *)vin->data;
850: PetscFunctionBegin;
851: PetscCheck(!v->unplacedarray, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "VecPlaceArray() was already called on this vector, without a call to VecResetArray()");
852: v->unplacedarray = v->array; /* save previous array so reset can bring it back */
853: v->array = (PetscScalar *)a;
854: PetscFunctionReturn(PETSC_SUCCESS);
855: }
857: PetscErrorCode VecReplaceArray_Seq(Vec vin, const PetscScalar *a)
858: {
859: Vec_Seq *v = (Vec_Seq *)vin->data;
861: PetscFunctionBegin;
862: PetscCall(PetscFree(v->array_allocated));
863: v->array_allocated = v->array = (PetscScalar *)a;
864: PetscFunctionReturn(PETSC_SUCCESS);
865: }