Skip to content

Commit c3d78df

Browse files
committed
* hvapprox.c (get_expected_value): Process blocks of 16 points.
1 parent 607edb5 commit c3d78df

1 file changed

Lines changed: 40 additions & 3 deletions

File tree

c/hvapprox.c

Lines changed: 40 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,14 @@ transform_and_filter(const double * restrict data, size_t * restrict npoints_p,
2828
dimension_t dim, const double * restrict ref,
2929
const bool * restrict maximise)
3030
{
31+
/* FIXME: This function is slow with 1M points. Some ideas for improvement:
32+
33+
- [ ] Skip ref - points if ref[] = { 0 }.
34+
- [ ] If all minimised or all maximised, do not check maximise[k].
35+
- [ ] GCC fails to vectorize any loops in this function. Process points in blocks
36+
for vectorization.
37+
- [ ] Maybe transpose the matrix so that each column can be processed independently.
38+
*/
3139
size_t npoints = *npoints_p;
3240
double * points = malloc(dim * npoints * sizeof(*points));
3341
size_t i, j;
@@ -53,18 +61,47 @@ transform_and_filter(const double * restrict data, size_t * restrict npoints_p,
5361
return points;
5462
}
5563

64+
__attribute__((hot))
5665
_attr_optimize_finite_math // Required so that GCC will vectorize the inner loop.
5766
static double
5867
get_expected_value(const double * restrict points, size_t npoints,
5968
dimension_t dim, const double * restrict w)
6069
{
61-
ASSUME(1 <= dim && dim <= MOOCORE_DIMENSION_MAX);
70+
ASSUME(2 <= dim && dim <= MOOCORE_DIMENSION_MAX);
6271
ASSUME(npoints > 0);
72+
/* This function is the most expensive for large number of points, thus we
73+
help the compiler vectorize it by processing blocks of 16 points at a
74+
time. */
75+
/* FIXME: A further improvement could be to use a transposed points matrix,
76+
but the computational savings would need to be measured. */
77+
enum { BLOCK_SIZE = 16 };
6378
// points >= 0 && w >=0 so max_s_w cannot be < 0.
64-
double max_s_w = -INFINITY;
65-
for (size_t i = 0; i < npoints; i++) {
79+
double max_s_w = 0;
80+
size_t i = 0;
81+
for (; i + BLOCK_SIZE <= npoints; i += BLOCK_SIZE) {
82+
double min_ratio[BLOCK_SIZE];
83+
for (size_t j = 0; j < BLOCK_SIZE; j++)
84+
min_ratio[j] = INFINITY;
85+
86+
const double * restrict base = points + i * dim;
87+
for (dimension_t k = 0; k < dim; k++) {
88+
double w_k = w[k];
89+
const double * restrict pk = base + k;
90+
for (size_t j = 0; j < BLOCK_SIZE; j++) {
91+
double ratio = pk[j * dim] * w_k;
92+
min_ratio[j] = MIN(ratio, min_ratio[j]);
93+
}
94+
}
95+
96+
for (size_t j = 0; j < BLOCK_SIZE; j++)
97+
max_s_w = MAX(max_s_w, min_ratio[j]);
98+
}
99+
// Scalar tail.
100+
for (; i < npoints; i++) {
66101
const double * restrict p = points + i * dim;
67102
double min_ratio = p[0] * w[0];
103+
if (likely(min_ratio <= max_s_w))
104+
continue;
68105
for (dimension_t k = 1; k < dim; k++) {
69106
double ratio = p[k] * w[k];
70107
min_ratio = MIN(min_ratio, ratio);

0 commit comments

Comments
 (0)