@@ -336,33 +336,21 @@ class fork_join_scheduler {
336336#pragma warning(disable: 4267) // conversion from 'size_t' to *, possible loss of data
337337#endif
338338
/// Sequentially runs a growing prefix of the loop to probe how expensive one
/// iteration is.
///
/// Batches of 1, 2, 4, ... iterations (clamped to what is left of the range)
/// are executed in index order and each batch is timed with
/// std::chrono::high_resolution_clock. Probing stops after the first batch
/// that takes at least 1000 ns, or once the whole range has been executed.
///
/// @param start  first index of the range (inclusive)
/// @param end    one past the last index of the range
/// @param f      callable invoked as f(i); it is really executed for every
///               probed index, so the caller must not run those indices again
/// @return number of iterations already executed, i.e. f has been applied to
///         [start, start + result). Returns 0 for an empty or inverted range.
template <typename F>
size_t get_granularity(size_t start, size_t end, F f) {
  // Guard: without this an inverted range would underflow the size
  // computation below and call f on indices outside the range.
  if (end <= start) return 0;

  const size_t total = end - start;
  // Compare durations, not raw tick counts: the clock's tick period is
  // platform-defined, and narrowing the count to int can wrap for a slow f.
  const std::chrono::nanoseconds probe_budget(1000);

  size_t done = 0;
  size_t batch = 1;
  for (;;) {
    batch = std::min(batch, total - done);
    const auto t0 = std::chrono::high_resolution_clock::now();
    for (size_t i = 0; i < batch; ++i) f(start + done + i);
    const auto elapsed = std::chrono::high_resolution_clock::now() - t0;
    done += batch;
    if (elapsed >= probe_budget || done >= total) break;
    batch *= 2;  // double the batch so the probe finishes in O(log n) rounds
  }
  return done;
}
355-
356339 template <typename F>
357340 void parfor (size_t start, size_t end, F f, size_t granularity = 0 ,
358341 bool conservative = false ) {
359342 if (end <= start) return ;
343+ size_t n = end - start;
360344 if (granularity == 0 ) {
361- size_t done = get_granularity (start, end, f);
362- granularity = std::max (done, (end - start) / (128 * sched->num_threads ));
363- parfor_ (start + done, end, f, granularity, conservative);
364- } else
365- parfor_ (start, end, f, granularity, conservative);
345+ // Aim for ~4 chunks per thread to balance load without excessive splitting.
346+ granularity = std::max<size_t >(1 , n / (4 * sched->num_threads ));
347+ }
348+ // Sequential fast-path: skip task machinery for small ranges
349+ if (n <= granularity) {
350+ for (size_t i = start; i < end; i++) f (i);
351+ return ;
352+ }
353+ parfor_ (start, end, f, granularity, conservative);
366354 }
367355
368356 private:
0 commit comments