Skip to content

Commit 1bfa176

Browse files
committed
add fitness && exit quality mechanism
1 parent 8e9d21c commit 1bfa176

File tree

6 files changed

+225
-6
lines changed

6 files changed

+225
-6
lines changed

Include/cpython/pystats.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,8 @@ typedef struct _optimization_stats {
144144
uint64_t unknown_callee;
145145
uint64_t trace_immediately_deopts;
146146
uint64_t executors_invalidated;
147+
uint64_t fitness_terminated_traces;
148+
uint64_t best_exit_fallback;
147149
UOpStats opcode[PYSTATS_MAX_UOP_ID + 1];
148150
uint64_t unsupported_opcode[256];
149151
uint64_t trace_length_hist[_Py_UOP_HIST_SIZE];

Include/internal/pycore_interp_structs.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -449,6 +449,15 @@ typedef struct _PyOptimizationConfig {
449449
uint16_t side_exit_initial_value;
450450
uint16_t side_exit_initial_backoff;
451451

452+
// Trace fitness thresholds
453+
uint16_t fitness_initial;
454+
uint16_t fitness_initial_side;
455+
uint16_t fitness_per_instruction;
456+
uint16_t fitness_branch_biased;
457+
uint16_t fitness_branch_unbiased;
458+
uint16_t fitness_backward_edge;
459+
uint16_t fitness_frame_entry;
460+
452461
// Optimization flags
453462
bool specialization_enabled;
454463
bool uops_optimize_enabled;

Include/internal/pycore_optimizer.h

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,16 @@ extern "C" {
1515
#include "pycore_optimizer_types.h"
1616
#include <stdbool.h>
1717

18+
/* Default fitness configuration values for trace quality control.
19+
* These can be overridden via PYTHON_JIT_FITNESS_* environment variables. */
20+
#define FITNESS_INITIAL 1000
21+
#define FITNESS_INITIAL_SIDE 800
22+
#define FITNESS_PER_INSTRUCTION 2
23+
#define FITNESS_BRANCH_BIASED 5
24+
#define FITNESS_BRANCH_UNBIASED 25
25+
#define FITNESS_BACKWARD_EDGE 80
26+
#define FITNESS_FRAME_ENTRY 10
27+
1828

1929
typedef struct _PyJitUopBuffer {
2030
_PyUOpInstruction *start;
@@ -101,7 +111,11 @@ typedef struct _PyJitTracerPreviousState {
101111
} _PyJitTracerPreviousState;
102112

103113
typedef struct _PyJitTracerTranslatorState {
104-
int jump_backward_seen;
114+
int32_t fitness; // Current trace fitness, starts high, decrements
115+
int32_t best_exit_quality; // Best exit quality seen so far
116+
int best_exit_buffer_pos; // Position in code_buffer of best exit (-1=none)
117+
uint32_t best_exit_target; // Bytecode target of best exit point
118+
int frame_depth; // Current inline depth (0 = root frame)
105119
} _PyJitTracerTranslatorState;
106120

107121
typedef struct _PyJitTracerState {

Python/optimizer.c

Lines changed: 174 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -549,8 +549,11 @@ dynamic_exit_uop[MAX_UOP_ID + 1] = {
549549
};
550550

551551

552-
#define CONFIDENCE_RANGE 1000
553-
#define CONFIDENCE_CUTOFF 333
552+
/* Exit quality constants for fitness-based trace termination.
553+
* Higher values mean better places to stop the trace. */
554+
#define EXIT_QUALITY_ENTER_EXECUTOR 500 // An executor already exists here
555+
#define EXIT_QUALITY_DEFAULT 200 // Ordinary bytecode position
556+
#define EXIT_QUALITY_SPECIALIZABLE 50 // Specializable instruction — avoid stopping here
554557

555558
#ifdef Py_DEBUG
556559
#define DPRINTF(level, ...) \
@@ -598,6 +601,86 @@ add_to_trace(
598601
((uint32_t)((INSTR) - ((_Py_CODEUNIT *)(CODE)->co_code_adaptive)))
599602

600603

604+
/* Distance of the 16-bit branch history register from a perfect 50/50
 * split.  0 means the branch is completely unpredictable; 8 means it
 * always resolved the same way (fully biased). */
static inline int
compute_branch_bias(uint16_t history)
{
    int taken = _Py_popcount32((uint32_t)history);
    if (taken < 8) {
        return 8 - taken;
    }
    return taken - 8;
}
612+
613+
/* Compute exit quality for the current trace position.
614+
* Higher values mean it's a better place to stop the trace. */
615+
static inline int32_t
616+
compute_exit_quality(_Py_CODEUNIT *target_instr, int opcode)
617+
{
618+
if (target_instr->op.code == ENTER_EXECUTOR) {
619+
return EXIT_QUALITY_ENTER_EXECUTOR;
620+
}
621+
if (_PyOpcode_Caches[_PyOpcode_Deopt[opcode]] > 0) {
622+
return EXIT_QUALITY_SPECIALIZABLE;
623+
}
624+
return EXIT_QUALITY_DEFAULT;
625+
}
626+
627+
/* Try to truncate the trace to the best recorded exit point.
628+
* Returns 1 if successful, 0 if no valid best exit exists.
629+
* Enforces progress constraints: the fallback position must satisfy
630+
* the minimum trace length requirements. */
631+
static inline int
632+
try_best_exit_fallback(
633+
_PyJitUopBuffer *trace,
634+
_PyJitTracerTranslatorState *ts,
635+
bool progress_needed)
636+
{
637+
int best_pos = ts->best_exit_buffer_pos;
638+
if (best_pos <= 0) {
639+
return 0;
640+
} else if (progress_needed && best_pos <= CODE_SIZE_NO_PROGRESS) {
641+
return 0;
642+
} else if (!progress_needed && best_pos <= CODE_SIZE_EMPTY) {
643+
return 0;
644+
}
645+
trace->next = trace->start + best_pos;
646+
/* Caller must add terminator (_EXIT_TRACE) after this */
647+
return 1;
648+
}
649+
650+
/* Update trace fitness after translating one bytecode instruction. */
651+
static inline void
652+
update_trace_fitness(
653+
_PyJitTracerTranslatorState *ts,
654+
int opcode,
655+
_Py_CODEUNIT *target_instr,
656+
const _PyOptimizationConfig *cfg)
657+
{
658+
ts->fitness -= cfg->fitness_per_instruction;
659+
660+
switch (opcode) {
661+
case POP_JUMP_IF_FALSE:
662+
case POP_JUMP_IF_TRUE:
663+
case POP_JUMP_IF_NONE:
664+
case POP_JUMP_IF_NOT_NONE: {
665+
int bias = compute_branch_bias(target_instr[1].cache);
666+
/* Linear interpolation: bias 0 → unbiased penalty, bias 8 → biased penalty */
667+
int penalty = cfg->fitness_branch_unbiased
668+
- (bias * (cfg->fitness_branch_unbiased - cfg->fitness_branch_biased)) / 8;
669+
ts->fitness -= penalty;
670+
break;
671+
}
672+
case JUMP_BACKWARD:
673+
case JUMP_BACKWARD_JIT:
674+
case JUMP_BACKWARD_NO_JIT:
675+
ts->fitness -= cfg->fitness_backward_edge;
676+
break;
677+
/* JUMP_BACKWARD_NO_INTERRUPT: exempt from backward edge penalty (coroutines) */
678+
default:
679+
break;
680+
}
681+
}
682+
683+
601684
static int
602685
is_terminator(const _PyUOpInstruction *uop)
603686
{
@@ -730,17 +813,46 @@ _PyJit_translate_single_bytecode_to_trace(
730813
goto unsupported;
731814
}
732815

816+
// Track frame depth changes for fitness (only for supported frame transitions)
817+
if (frame != tracer->prev_state.instr_frame) {
818+
_PyJitTracerTranslatorState *ts_depth = &tracer->translator_state;
819+
if (frame->previous == tracer->prev_state.instr_frame) {
820+
// Entered a deeper frame (function call inlined)
821+
ts_depth->frame_depth++;
822+
// Penalty scales with depth: shallow inlining is cheap,
823+
// deep inlining gets progressively more expensive.
824+
int32_t penalty = (int32_t)tstate->interp->opt_config.fitness_frame_entry
825+
* ts_depth->frame_depth;
826+
ts_depth->fitness -= penalty;
827+
} else if (ts_depth->frame_depth > 0) {
828+
// Returned to a shallower frame
829+
ts_depth->frame_depth--;
830+
}
831+
}
832+
733833
if (oparg > 0xFFFF) {
734834
DPRINTF(2, "Unsupported: oparg too large\n");
735835
unsupported:
736836
{
737-
// Rewind to previous instruction and replace with _EXIT_TRACE.
837+
// If we have a high-quality best_exit (enter_executor, etc.),
838+
// prefer it over rewinding to last _SET_IP — this covers the
839+
// main unsupported path, not just the edge case.
840+
_PyJitTracerTranslatorState *ts_unsup = &tracer->translator_state;
841+
if (ts_unsup->best_exit_quality > EXIT_QUALITY_DEFAULT &&
842+
try_best_exit_fallback(trace, ts_unsup, progress_needed)) {
843+
ADD_TO_TRACE(_EXIT_TRACE, 0, 0, ts_unsup->best_exit_target);
844+
uop_buffer_last(trace)->operand1 = true; // is_control_flow
845+
OPT_STAT_INC(best_exit_fallback);
846+
DPRINTF(2, "Best-exit fallback at unsupported (pos=%d, quality=%d)\n",
847+
ts_unsup->best_exit_buffer_pos, ts_unsup->best_exit_quality);
848+
goto done;
849+
}
850+
// Fall back: rewind to last _SET_IP and replace with _DEOPT.
738851
_PyUOpInstruction *curr = uop_buffer_last(trace);
739852
while (curr->opcode != _SET_IP && uop_buffer_length(trace) > 2) {
740853
trace->next--;
741854
curr = uop_buffer_last(trace);
742855
}
743-
assert(curr->opcode == _SET_IP || uop_buffer_length(trace) == 2);
744856
if (curr->opcode == _SET_IP) {
745857
int32_t old_target = (int32_t)uop_get_target(curr);
746858
curr->opcode = _DEOPT;
@@ -763,6 +875,40 @@ _PyJit_translate_single_bytecode_to_trace(
763875
return 1;
764876
}
765877

878+
// Fitness-based trace quality check (before reserving space for this instruction)
879+
{
880+
_PyJitTracerTranslatorState *ts = &tracer->translator_state;
881+
int32_t eq = compute_exit_quality(target_instr, opcode);
882+
883+
// Record best exit candidate.
884+
// Only record after minimum progress to avoid truncating to near-empty traces.
885+
if (eq > ts->best_exit_quality &&
886+
uop_buffer_length(trace) > CODE_SIZE_NO_PROGRESS) {
887+
ts->best_exit_quality = eq;
888+
ts->best_exit_buffer_pos = uop_buffer_length(trace);
889+
ts->best_exit_target = target;
890+
}
891+
892+
// Check if fitness is depleted — should we stop the trace?
893+
if (ts->fitness < eq &&
894+
!(progress_needed && uop_buffer_length(trace) < CODE_SIZE_NO_PROGRESS)) {
895+
// Prefer stopping at the best recorded exit point
896+
if (try_best_exit_fallback(trace, ts, progress_needed)) {
897+
ADD_TO_TRACE(_EXIT_TRACE, 0, 0, ts->best_exit_target);
898+
uop_buffer_last(trace)->operand1 = true; // is_control_flow
899+
}
900+
else {
901+
// No valid best exit — stop at current position
902+
ADD_TO_TRACE(_EXIT_TRACE, 0, 0, target);
903+
uop_buffer_last(trace)->operand1 = true; // is_control_flow
904+
}
905+
OPT_STAT_INC(fitness_terminated_traces);
906+
DPRINTF(2, "Fitness terminated: fitness=%d < exit_quality=%d\n",
907+
ts->fitness, eq);
908+
goto done;
909+
}
910+
}
911+
766912
// One for possible _DEOPT, one because _CHECK_VALIDITY itself might _DEOPT
767913
trace->end -= 2;
768914

@@ -793,6 +939,12 @@ _PyJit_translate_single_bytecode_to_trace(
793939
DPRINTF(2, "No room for expansions and guards (need %d, got %d)\n",
794940
space_needed, uop_buffer_remaining_space(trace));
795941
OPT_STAT_INC(trace_too_long);
942+
// Try best-exit fallback before giving up
943+
if (try_best_exit_fallback(trace, &tracer->translator_state, progress_needed)) {
944+
ADD_TO_TRACE(_EXIT_TRACE, 0, 0, tracer->translator_state.best_exit_target);
945+
uop_buffer_last(trace)->operand1 = true; // is_control_flow
946+
OPT_STAT_INC(best_exit_fallback);
947+
}
796948
goto done;
797949
}
798950

@@ -986,7 +1138,12 @@ _PyJit_translate_single_bytecode_to_trace(
9861138
ADD_TO_TRACE(_JUMP_TO_TOP, 0, 0, 0);
9871139
goto done;
9881140
}
989-
DPRINTF(2, "Trace continuing\n");
1141+
// Update fitness AFTER translation, BEFORE returning to continue tracing.
1142+
// This ensures the next iteration's fitness check reflects the cost of
1143+
// all instructions translated so far.
1144+
update_trace_fitness(&tracer->translator_state, opcode, target_instr,
1145+
&tstate->interp->opt_config);
1146+
DPRINTF(2, "Trace continuing (fitness=%d)\n", tracer->translator_state.fitness);
9901147
return 1;
9911148
done:
9921149
DPRINTF(2, "Trace done\n");
@@ -1069,6 +1226,18 @@ _PyJit_TryInitializeTracing(
10691226
assert(curr_instr->op.code == JUMP_BACKWARD_JIT || curr_instr->op.code == RESUME_CHECK_JIT || (exit != NULL));
10701227
tracer->initial_state.jump_backward_instr = curr_instr;
10711228

1229+
// Initialize fitness tracking state
1230+
const _PyOptimizationConfig *cfg = &tstate->interp->opt_config;
1231+
_PyJitTracerTranslatorState *ts = &tracer->translator_state;
1232+
bool is_side_trace = (exit != NULL);
1233+
ts->fitness = is_side_trace
1234+
? (int32_t)cfg->fitness_initial_side
1235+
: (int32_t)cfg->fitness_initial;
1236+
ts->best_exit_quality = 0;
1237+
ts->best_exit_buffer_pos = -1;
1238+
ts->best_exit_target = 0;
1239+
ts->frame_depth = 0;
1240+
10721241
tracer->is_tracing = true;
10731242
return 1;
10741243
}

Python/pystate.c

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -635,6 +635,29 @@ init_interpreter(PyInterpreterState *interp,
635635
"PYTHON_JIT_SIDE_EXIT_INITIAL_BACKOFF",
636636
SIDE_EXIT_INITIAL_BACKOFF, 0, MAX_BACKOFF);
637637

638+
// Trace fitness configuration
639+
init_policy(&interp->opt_config.fitness_initial,
640+
"PYTHON_JIT_FITNESS_INITIAL",
641+
FITNESS_INITIAL, 100, 10000);
642+
init_policy(&interp->opt_config.fitness_initial_side,
643+
"PYTHON_JIT_FITNESS_INITIAL_SIDE",
644+
FITNESS_INITIAL_SIDE, 50, 5000);
645+
init_policy(&interp->opt_config.fitness_per_instruction,
646+
"PYTHON_JIT_FITNESS_PER_INSTRUCTION",
647+
FITNESS_PER_INSTRUCTION, 0, 100);
648+
init_policy(&interp->opt_config.fitness_branch_biased,
649+
"PYTHON_JIT_FITNESS_BRANCH_BIASED",
650+
FITNESS_BRANCH_BIASED, 0, 500);
651+
init_policy(&interp->opt_config.fitness_branch_unbiased,
652+
"PYTHON_JIT_FITNESS_BRANCH_UNBIASED",
653+
FITNESS_BRANCH_UNBIASED, 0, 500);
654+
init_policy(&interp->opt_config.fitness_backward_edge,
655+
"PYTHON_JIT_FITNESS_BACKWARD_EDGE",
656+
FITNESS_BACKWARD_EDGE, 0, 1000);
657+
init_policy(&interp->opt_config.fitness_frame_entry,
658+
"PYTHON_JIT_FITNESS_FRAME_ENTRY",
659+
FITNESS_FRAME_ENTRY, 0, 1000);
660+
638661
interp->opt_config.specialization_enabled = !is_env_enabled("PYTHON_SPECIALIZATION_OFF");
639662
interp->opt_config.uops_optimize_enabled = !is_env_disabled("PYTHON_UOPS_OPTIMIZE");
640663
if (interp != &runtime->_main_interpreter) {

Python/pystats.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -274,6 +274,8 @@ print_optimization_stats(FILE *out, OptimizationStats *stats)
274274
fprintf(out, "Optimization low confidence: %" PRIu64 "\n", stats->low_confidence);
275275
fprintf(out, "Optimization unknown callee: %" PRIu64 "\n", stats->unknown_callee);
276276
fprintf(out, "Executors invalidated: %" PRIu64 "\n", stats->executors_invalidated);
277+
fprintf(out, "Optimization fitness terminated: %" PRIu64 "\n", stats->fitness_terminated_traces);
278+
fprintf(out, "Optimization best exit fallback: %" PRIu64 "\n", stats->best_exit_fallback);
277279

278280
print_histogram(out, "Trace length", stats->trace_length_hist);
279281
print_histogram(out, "Trace run length", stats->trace_run_length_hist);

0 commit comments

Comments
 (0)