@@ -28,6 +28,14 @@ transform_and_filter(const double * restrict data, size_t * restrict npoints_p,
2828 dimension_t dim , const double * restrict ref ,
2929 const bool * restrict maximise )
3030{
31+ /* FIXME: This function is slow with 1M points. Some ideas for improvement:
32+
33+ - [ ] Skip ref - points if ref[] = { 0 }.
34+ - [ ] If all minimised or all maximised, do not check maximise[k].
35+ - [ ] GCC fails to vectorize any loops in this function. Process points in blocks
36+ for vectorization.
37+ - [ ] Maybe transpose the matrix so that each column can be processed independently.
38+ */
3139 size_t npoints = * npoints_p ;
3240 double * points = malloc (dim * npoints * sizeof (* points ));
3341 size_t i , j ;
@@ -53,18 +61,47 @@ transform_and_filter(const double * restrict data, size_t * restrict npoints_p,
5361 return points ;
5462}
5563
64+ __attribute__((hot ))
5665_attr_optimize_finite_math // Required so that GCC will vectorize the inner loop.
5766static double
5867get_expected_value (const double * restrict points , size_t npoints ,
5968 dimension_t dim , const double * restrict w )
6069{
61- ASSUME (1 <= dim && dim <= MOOCORE_DIMENSION_MAX );
70+ ASSUME (2 <= dim && dim <= MOOCORE_DIMENSION_MAX );
6271 ASSUME (npoints > 0 );
72+ /* This function is the most expensive for large number of points, thus we
73+ help the compiler vectorize it by processing blocks of 16 points at a
74+ time. */
75+ /* FIXME: A further improvement could be to use a transposed points matrix,
76+ but the computational savings would need to be measured. */
77+ enum { BLOCK_SIZE = 16 };
6378 // points >= 0 && w >=0 so max_s_w cannot be < 0.
64- double max_s_w = - INFINITY ;
65- for (size_t i = 0 ; i < npoints ; i ++ ) {
79+ double max_s_w = 0 ;
80+ size_t i = 0 ;
81+ for (; i + BLOCK_SIZE <= npoints ; i += BLOCK_SIZE ) {
82+ double min_ratio [BLOCK_SIZE ];
83+ for (size_t j = 0 ; j < BLOCK_SIZE ; j ++ )
84+ min_ratio [j ] = INFINITY ;
85+
86+ const double * restrict base = points + i * dim ;
87+ for (dimension_t k = 0 ; k < dim ; k ++ ) {
88+ double w_k = w [k ];
89+ const double * restrict pk = base + k ;
90+ for (size_t j = 0 ; j < BLOCK_SIZE ; j ++ ) {
91+ double ratio = pk [j * dim ] * w_k ;
92+ min_ratio [j ] = MIN (ratio , min_ratio [j ]);
93+ }
94+ }
95+
96+ for (size_t j = 0 ; j < BLOCK_SIZE ; j ++ )
97+ max_s_w = MAX (max_s_w , min_ratio [j ]);
98+ }
99+ // Scalar tail.
100+ for (; i < npoints ; i ++ ) {
66101 const double * restrict p = points + i * dim ;
67102 double min_ratio = p [0 ] * w [0 ];
103+ if (likely (min_ratio <= max_s_w ))
104+ continue ;
68105 for (dimension_t k = 1 ; k < dim ; k ++ ) {
69106 double ratio = p [k ] * w [k ];
70107 min_ratio = MIN (min_ratio , ratio );
0 commit comments