Skip to content

Commit 83fd8ab

Browse files
committed
rewrite fitness mechanism
1 parent 386c23a commit 83fd8ab

File tree

8 files changed

+122
-93
lines changed

8 files changed

+122
-93
lines changed

Include/internal/pycore_interp_structs.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -451,7 +451,6 @@ typedef struct _PyOptimizationConfig {
451451

452452
// Trace fitness thresholds
453453
uint16_t fitness_initial;
454-
uint16_t fitness_initial_side;
455454

456455
// Optimization flags
457456
bool specialization_enabled;

Include/internal/pycore_optimizer.h

Lines changed: 44 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -15,22 +15,50 @@ extern "C" {
1515
#include "pycore_optimizer_types.h"
1616
#include <stdbool.h>
1717

18-
/* Default fitness configuration values for trace quality control.
19-
* FITNESS_INITIAL and FITNESS_INITIAL_SIDE can be overridden via
20-
* PYTHON_JIT_FITNESS_INITIAL and PYTHON_JIT_FITNESS_INITIAL_SIDE */
21-
#define FITNESS_PER_INSTRUCTION 2
22-
#define FITNESS_BRANCH_BASE 5
23-
#define FITNESS_INITIAL (UOP_MAX_TRACE_LENGTH * FITNESS_PER_INSTRUCTION)
24-
#define FITNESS_INITIAL_SIDE (FITNESS_INITIAL * 3 / 5)
25-
#define FITNESS_BACKWARD_EDGE (FITNESS_INITIAL / 10)
26-
27-
/* Exit quality constants for fitness-based trace termination.
28-
* Higher values mean better places to stop the trace. */
29-
30-
#define EXIT_QUALITY_DEFAULT 200
31-
#define EXIT_QUALITY_CLOSE_LOOP (4 * EXIT_QUALITY_DEFAULT)
32-
#define EXIT_QUALITY_ENTER_EXECUTOR (2 * EXIT_QUALITY_DEFAULT + 100)
33-
#define EXIT_QUALITY_SPECIALIZABLE (EXIT_QUALITY_DEFAULT / 4)
18+
/* Fitness controls how long a trace can grow.
19+
 * Starts at FITNESS_INITIAL, then decreases with per-bytecode buffer usage
20+
* plus branch/frame heuristics. The trace stops when fitness drops below the
21+
* current exit_quality.
22+
*
23+
* Design targets for the constants below:
24+
* 1. Reaching the abstract frame-depth limit should drop fitness below
25+
* EXIT_QUALITY_SPECIALIZABLE.
26+
* 2. A backward edge should leave budget for roughly N_BACKWARD_SLACK more
27+
 *    bytecodes, assuming AVG_SLOTS_PER_INSTRUCTION buffer slots per bytecode.
28+
* 3. Roughly seven balanced branches should reduce fitness to
29+
* EXIT_QUALITY_DEFAULT before per-slot costs.
30+
* 4. A push followed by a matching return is net-zero on frame-specific
31+
* fitness, excluding per-slot costs.
32+
*/
33+
#define MAX_TARGET_LENGTH 400
34+
#define OPTIMIZER_EFFECTIVENESS 2
35+
#define FITNESS_INITIAL (MAX_TARGET_LENGTH * OPTIMIZER_EFFECTIVENESS)
36+
37+
/* Exit quality thresholds: trace stops when fitness < exit_quality.
38+
* Higher = trace is more willing to stop here. */
39+
#define EXIT_QUALITY_CLOSE_LOOP (FITNESS_INITIAL / 2)
40+
#define EXIT_QUALITY_ENTER_EXECUTOR (FITNESS_INITIAL * 3 / 8)
41+
#define EXIT_QUALITY_DEFAULT (FITNESS_INITIAL / 8)
42+
#define EXIT_QUALITY_SPECIALIZABLE (FITNESS_INITIAL / 80)
43+
44+
/* Estimated buffer slots per bytecode, used only to derive heuristics.
45+
* Runtime charging uses trace-buffer capacity consumed for each bytecode. */
46+
#define AVG_SLOTS_PER_INSTRUCTION 6
47+
48+
/* Heuristic backward-edge penalty: leave room for about
49+
* N_BACKWARD_SLACK more bytecodes before reaching EXIT_QUALITY_CLOSE_LOOP,
50+
* based on AVG_SLOTS_PER_INSTRUCTION. */
51+
#define N_BACKWARD_SLACK 50
52+
#define FITNESS_BACKWARD_EDGE (FITNESS_INITIAL - EXIT_QUALITY_CLOSE_LOOP \
53+
- N_BACKWARD_SLACK * AVG_SLOTS_PER_INSTRUCTION)
54+
55+
/* Backward edge penalty for JUMP_BACKWARD_NO_INTERRUPT (coroutines/yield-from).
56+
* Smaller than FITNESS_BACKWARD_EDGE since these loops are very short. */
57+
#define FITNESS_BACKWARD_EDGE_COROUTINE (FITNESS_BACKWARD_EDGE / 4)
58+
59+
/* Penalty for a perfectly balanced (50/50) branch.
60+
 * Seven such branches (ignoring per-slot costs) reduce fitness from FITNESS_INITIAL down to EXIT_QUALITY_DEFAULT. */
61+
#define FITNESS_BRANCH_BALANCED ((FITNESS_INITIAL - EXIT_QUALITY_DEFAULT) / 7)
3462

3563

3664
typedef struct _PyJitUopBuffer {

Lib/test/test_capi/test_opt.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1358,9 +1358,13 @@ def testfunc(n):
13581358
for _ in gen(n):
13591359
pass
13601360
testfunc(TIER2_THRESHOLD * 2)
1361+
# The generator may be inlined into testfunc's trace,
1362+
# so check whichever executor contains _YIELD_VALUE.
13611363
gen_ex = get_first_executor(gen)
1362-
self.assertIsNotNone(gen_ex)
1363-
uops = get_opnames(gen_ex)
1364+
testfunc_ex = get_first_executor(testfunc)
1365+
ex = gen_ex or testfunc_ex
1366+
self.assertIsNotNone(ex)
1367+
uops = get_opnames(ex)
13641368
self.assertNotIn("_MAKE_HEAP_SAFE", uops)
13651369
self.assertIn("_YIELD_VALUE", uops)
13661370

Modules/_testinternalcapi/test_cases.c.h

Lines changed: 7 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Python/bytecodes.c

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6397,7 +6397,13 @@ dummy_func(
63976397
tracer->prev_state.instr_frame = frame;
63986398
tracer->prev_state.instr_oparg = oparg;
63996399
tracer->prev_state.instr_stacklevel = PyStackRef_IsNone(frame->f_executable) ? 2 : STACK_LEVEL();
6400-
if (_PyOpcode_Caches[_PyOpcode_Deopt[opcode]]) {
6400+
if (_PyOpcode_Caches[_PyOpcode_Deopt[opcode]]
6401+
// Branch opcodes use the cache for branch history, not
6402+
// specialization counters. Don't reset it.
6403+
&& opcode != POP_JUMP_IF_FALSE
6404+
&& opcode != POP_JUMP_IF_TRUE
6405+
&& opcode != POP_JUMP_IF_NONE
6406+
&& opcode != POP_JUMP_IF_NOT_NONE) {
64016407
(&next_instr[1])->counter = trigger_backoff_counter();
64026408
}
64036409

Python/generated_cases.c.h

Lines changed: 7 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Python/optimizer.c

Lines changed: 50 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,10 @@
2828

2929
#define MAX_EXECUTORS_SIZE 256
3030

31+
_Static_assert(FITNESS_INITIAL < UOP_MAX_TRACE_LENGTH,
32+
"FITNESS_INITIAL must be less than UOP_MAX_TRACE_LENGTH so that "
33+
"per-slot fitness charging guarantees the buffer never overflows");
34+
3135
// Trace too short, no progress:
3236
// _START_EXECUTOR
3337
// _MAKE_WARM
@@ -596,19 +600,15 @@ add_to_trace(
596600
((uint32_t)((INSTR) - ((_Py_CODEUNIT *)(CODE)->co_code_adaptive)))
597601

598602

599-
/* Compute branch fitness penalty based on how likely the traced path is.
600-
* The penalty is small when the traced path is common, large when rare.
601-
* A branch that historically goes the other way gets a heavy penalty. */
603+
/* Branch penalty: 0 if fully biased, FITNESS_BRANCH_BALANCED if 50/50,
604+
* 2*FITNESS_BRANCH_BALANCED if fully against the traced direction. */
602605
static inline int
603606
compute_branch_penalty(uint16_t history, bool branch_taken)
604607
{
605608
int taken_count = _Py_popcount32((uint32_t)history);
606609
int on_trace_count = branch_taken ? taken_count : 16 - taken_count;
607610
int off_trace = 16 - on_trace_count;
608-
/* Linear scaling: off_trace ranges from 0 (fully biased our way)
609-
* to 16 (fully biased against us), so the penalty ranges from
610-
* FITNESS_BRANCH_BASE to FITNESS_BRANCH_BASE + 32. */
611-
return FITNESS_BRANCH_BASE + off_trace * 2;
611+
return off_trace * FITNESS_BRANCH_BALANCED / 8;
612612
}
613613

614614
/* Compute exit quality for the current trace position.
@@ -630,10 +630,11 @@ compute_exit_quality(_Py_CODEUNIT *target_instr, int opcode,
630630
return EXIT_QUALITY_DEFAULT;
631631
}
632632

633+
/* Frame penalty: sized (with the +1 below) so that (MAX_ABSTRACT_FRAME_DEPTH - 1) consecutive pushes exhaust the entire initial fitness budget, ending the trace before the abstract frame-depth limit is reached. */
633634
static inline int32_t
634-
compute_frame_penalty(const _PyOptimizationConfig *cfg)
635+
compute_frame_penalty(uint16_t fitness_initial)
635636
{
636-
return (int32_t)cfg->fitness_initial / 30 + 1;
637+
return (int32_t)fitness_initial / (MAX_ABSTRACT_FRAME_DEPTH - 1) + 1;
637638
}
638639

639640
static int
@@ -799,23 +800,26 @@ _PyJit_translate_single_bytecode_to_trace(
799800
return 1;
800801
}
801802

802-
// Fitness-based trace quality check (before reserving space for this instruction)
803+
// Stop the trace if fitness has dropped below the exit quality threshold.
803804
_PyJitTracerTranslatorState *ts = &tracer->translator_state;
804805
int32_t eq = compute_exit_quality(target_instr, opcode, tracer);
805806
DPRINTF(3, "Fitness check: %s(%d) fitness=%d, exit_quality=%d, depth=%d\n",
806807
_PyOpcode_OpName[opcode], oparg, ts->fitness, eq, ts->frame_depth);
807808

808-
// Check if fitness is depleted — should we stop the trace?
809809
if (ts->fitness < eq) {
810-
// This is a tracer heuristic rather than normal program control flow,
811-
// so leave operand1 clear and let the resulting side exit increase chain_depth.
810+
// Heuristic exit: leave operand1=0 so the side exit increments chain_depth.
812811
ADD_TO_TRACE(_EXIT_TRACE, 0, 0, target);
813812
OPT_STAT_INC(fitness_terminated_traces);
814813
DPRINTF(2, "Fitness terminated: %s(%d) fitness=%d < exit_quality=%d\n",
815814
_PyOpcode_OpName[opcode], oparg, ts->fitness, eq);
816815
goto done;
817816
}
818817

818+
// Snapshot the buffer before reserving tail slots. The later charge
819+
// includes both emitted uops and capacity reserved for exits/deopts/errors.
820+
_PyUOpInstruction *next_before = trace->next;
821+
_PyUOpInstruction *end_before = trace->end;
822+
819823
// One for possible _DEOPT, one because _CHECK_VALIDITY itself might _DEOPT
820824
trace->end -= 2;
821825

@@ -880,29 +884,13 @@ _PyJit_translate_single_bytecode_to_trace(
880884
tracer->translator_state.fitness -= FITNESS_BACKWARD_EDGE;
881885
DPRINTF(3, " backward edge penalty: -%d -> fitness=%d\n",
882886
FITNESS_BACKWARD_EDGE, tracer->translator_state.fitness);
883-
_Py_FALLTHROUGH;
887+
break;
884888
case JUMP_BACKWARD_NO_INTERRUPT:
885-
{
886-
if ((next_instr != tracer->initial_state.close_loop_instr) &&
887-
(next_instr != tracer->initial_state.start_instr) &&
888-
uop_buffer_length(&tracer->code_buffer) > CODE_SIZE_NO_PROGRESS &&
889-
// For side exits, we don't want to terminate them early.
890-
tracer->initial_state.exit == NULL &&
891-
// These are coroutines, and we want to unroll those usually.
892-
opcode != JUMP_BACKWARD_NO_INTERRUPT) {
893-
// We encountered a JUMP_BACKWARD but not to the top of our own loop.
894-
// We don't want to continue tracing as we might get stuck in the
895-
// inner loop. Instead, end the trace where the executor of the
896-
// inner loop might start and let the traces rejoin.
897-
OPT_STAT_INC(inner_loop);
898-
ADD_TO_TRACE(_EXIT_TRACE, 0, 0, target);
899-
uop_buffer_last(trace)->operand1 = true; // is_control_flow
900-
DPRINTF(2, "JUMP_BACKWARD not to top ends trace %p %p %p\n", next_instr,
901-
tracer->initial_state.close_loop_instr, tracer->initial_state.start_instr);
902-
goto done;
903-
}
889+
tracer->translator_state.fitness -= FITNESS_BACKWARD_EDGE_COROUTINE;
890+
DPRINTF(3, " coroutine backward edge penalty: -%d -> fitness=%d\n",
891+
FITNESS_BACKWARD_EDGE_COROUTINE,
892+
tracer->translator_state.fitness);
904893
break;
905-
}
906894

907895
case RESUME:
908896
case RESUME_CHECK:
@@ -1006,32 +994,31 @@ _PyJit_translate_single_bytecode_to_trace(
1006994
_PyJitTracerTranslatorState *ts_depth = &tracer->translator_state;
1007995
ts_depth->frame_depth++;
1008996
assert(ts_depth->frame_depth < MAX_ABSTRACT_FRAME_DEPTH);
1009-
int32_t frame_penalty = compute_frame_penalty(&tstate->interp->opt_config);
1010-
int32_t cost = frame_penalty * ts_depth->frame_depth;
1011-
ts_depth->fitness -= cost;
1012-
DPRINTF(3, " _PUSH_FRAME: depth=%d, penalty=-%d (per_frame=%d) -> fitness=%d\n",
1013-
ts_depth->frame_depth, cost, frame_penalty,
997+
int32_t frame_penalty = compute_frame_penalty(tstate->interp->opt_config.fitness_initial);
998+
ts_depth->fitness -= frame_penalty;
999+
DPRINTF(3, " _PUSH_FRAME: depth=%d, penalty=-%d -> fitness=%d\n",
1000+
ts_depth->frame_depth, frame_penalty,
10141001
ts_depth->fitness);
10151002
}
10161003
else if (uop == _RETURN_VALUE || uop == _RETURN_GENERATOR || uop == _YIELD_VALUE) {
10171004
_PyJitTracerTranslatorState *ts_depth = &tracer->translator_state;
1018-
int32_t frame_penalty = compute_frame_penalty(&tstate->interp->opt_config);
1005+
int32_t frame_penalty = compute_frame_penalty(tstate->interp->opt_config.fitness_initial);
10191006
if (ts_depth->frame_depth <= 0) {
1020-
// Underflow: returning from a frame we didn't enter
1021-
ts_depth->fitness -= frame_penalty * 2;
1007+
// Returning from a frame we didn't enter — penalize.
1008+
ts_depth->fitness -= frame_penalty;
10221009
DPRINTF(3, " %s: underflow penalty=-%d -> fitness=%d\n",
1023-
_PyOpcode_uop_name[uop], frame_penalty * 2,
1010+
_PyOpcode_uop_name[uop], frame_penalty,
10241011
ts_depth->fitness);
10251012
}
10261013
else {
1027-
// Reward returning: small inlined calls should be encouraged
1014+
// Symmetric with push: net-zero frame impact.
10281015
ts_depth->fitness += frame_penalty;
1016+
ts_depth->frame_depth--;
10291017
DPRINTF(3, " %s: return reward=+%d, depth=%d -> fitness=%d\n",
10301018
_PyOpcode_uop_name[uop], frame_penalty,
1031-
ts_depth->frame_depth - 1,
1019+
ts_depth->frame_depth,
10321020
ts_depth->fitness);
10331021
}
1034-
ts_depth->frame_depth = ts_depth->frame_depth <= 0 ? 0 : ts_depth->frame_depth - 1;
10351022
}
10361023
else if (_PyUop_Flags[uop] & HAS_RECORDS_VALUE_FLAG) {
10371024
PyObject *recorded_value = tracer->prev_state.recorded_value;
@@ -1074,12 +1061,17 @@ _PyJit_translate_single_bytecode_to_trace(
10741061
ADD_TO_TRACE(_JUMP_TO_TOP, 0, 0, 0);
10751062
goto done;
10761063
}
1077-
// Update fitness AFTER translation, BEFORE returning to continue tracing.
1078-
// This ensures the next iteration's fitness check reflects the cost of
1079-
// all instructions translated so far.
1080-
tracer->translator_state.fitness -= FITNESS_PER_INSTRUCTION;
1081-
DPRINTF(3, " per-insn cost: -%d -> fitness=%d\n",
1082-
FITNESS_PER_INSTRUCTION, tracer->translator_state.fitness);
1064+
// Charge fitness by trace-buffer capacity consumed for this bytecode,
1065+
// including both emitted uops and tail reservations.
1066+
{
1067+
int32_t slots_fwd = (int32_t)(trace->next - next_before);
1068+
int32_t slots_rev = (int32_t)(end_before - trace->end);
1069+
int32_t slots_used = slots_fwd + slots_rev;
1070+
tracer->translator_state.fitness -= slots_used;
1071+
DPRINTF(3, " per-insn cost: -%d (fwd=%d, rev=%d) -> fitness=%d\n",
1072+
slots_used, slots_fwd, slots_rev,
1073+
tracer->translator_state.fitness);
1074+
}
10831075
DPRINTF(2, "Trace continuing (fitness=%d)\n", tracer->translator_state.fitness);
10841076
return 1;
10851077
done:
@@ -1163,16 +1155,15 @@ _PyJit_TryInitializeTracing(
11631155
assert(curr_instr->op.code == JUMP_BACKWARD_JIT || curr_instr->op.code == RESUME_CHECK_JIT || (exit != NULL));
11641156
tracer->initial_state.jump_backward_instr = curr_instr;
11651157

1166-
// Initialize fitness tracking state
1158+
// Reduce side-trace fitness as chain depth grows, but clamp the reduction
1159+
// after depth 4 so deeply chained exits still have at least half budget.
11671160
const _PyOptimizationConfig *cfg = &tstate->interp->opt_config;
11681161
_PyJitTracerTranslatorState *ts = &tracer->translator_state;
1169-
bool is_side_trace = (exit != NULL);
1170-
ts->fitness = is_side_trace
1171-
? (int32_t)cfg->fitness_initial_side
1172-
: (int32_t)cfg->fitness_initial;
1162+
int effective_depth = Py_MIN(chain_depth, 4);
1163+
ts->fitness = (int32_t)((8 - effective_depth) * cfg->fitness_initial / 8);
11731164
ts->frame_depth = 0;
1174-
DPRINTF(3, "Fitness init: %s trace, fitness=%d\n",
1175-
is_side_trace ? "side" : "root", ts->fitness);
1165+
DPRINTF(3, "Fitness init: chain_depth=%d, fitness=%d\n",
1166+
chain_depth, ts->fitness);
11761167

11771168
tracer->is_tracing = true;
11781169
return 1;

Python/pystate.c

Lines changed: 1 addition & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -638,18 +638,7 @@ init_interpreter(PyInterpreterState *interp,
638638
// Trace fitness configuration
639639
init_policy(&interp->opt_config.fitness_initial,
640640
"PYTHON_JIT_FITNESS_INITIAL",
641-
FITNESS_INITIAL, 100, 10000);
642-
init_policy(&interp->opt_config.fitness_initial_side,
643-
"PYTHON_JIT_FITNESS_INITIAL_SIDE",
644-
FITNESS_INITIAL_SIDE, 50, 5000);
645-
/* The tracer starts at start_instr, so initial fitness must not be below
646-
* the close-loop exit quality or tracing will terminate immediately. */
647-
if (interp->opt_config.fitness_initial < EXIT_QUALITY_CLOSE_LOOP) {
648-
interp->opt_config.fitness_initial = EXIT_QUALITY_CLOSE_LOOP;
649-
}
650-
if (interp->opt_config.fitness_initial_side < EXIT_QUALITY_CLOSE_LOOP) {
651-
interp->opt_config.fitness_initial_side = EXIT_QUALITY_CLOSE_LOOP;
652-
}
641+
FITNESS_INITIAL, EXIT_QUALITY_CLOSE_LOOP, UOP_MAX_TRACE_LENGTH - 1);
653642

654643
interp->opt_config.specialization_enabled = !is_env_enabled("PYTHON_SPECIALIZATION_OFF");
655644
interp->opt_config.uops_optimize_enabled = !is_env_disabled("PYTHON_UOPS_OPTIMIZE");

0 commit comments

Comments
 (0)