Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions Include/cpython/pystats.h
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,8 @@ typedef struct _optimization_stats {
uint64_t unknown_callee;
uint64_t trace_immediately_deopts;
uint64_t executors_invalidated;
uint64_t fitness_terminated_traces;
uint64_t best_exit_fallback;
UOpStats opcode[PYSTATS_MAX_UOP_ID + 1];
uint64_t unsupported_opcode[256];
uint64_t trace_length_hist[_Py_UOP_HIST_SIZE];
Expand Down
14 changes: 14 additions & 0 deletions Include/internal/pycore_interp_structs.h
Original file line number Diff line number Diff line change
Expand Up @@ -449,6 +449,20 @@ typedef struct _PyOptimizationConfig {
uint16_t side_exit_initial_value;
uint16_t side_exit_initial_backoff;

// Trace fitness thresholds
Comment thread
cocolato marked this conversation as resolved.
uint16_t fitness_initial;
uint16_t fitness_initial_side;
uint16_t fitness_per_instruction;
uint16_t fitness_branch_biased;
uint16_t fitness_branch_unbiased;
uint16_t fitness_backward_edge;
uint16_t fitness_frame_entry;

// Exit quality thresholds for fitness-based trace termination
uint16_t exit_quality_enter_executor;
uint16_t exit_quality_default;
uint16_t exit_quality_specializable;

// Optimization flags
bool specialization_enabled;
bool uops_optimize_enabled;
Expand Down
23 changes: 22 additions & 1 deletion Include/internal/pycore_optimizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,23 @@ extern "C" {
#include "pycore_optimizer_types.h"
#include <stdbool.h>

/* Default fitness configuration values for trace quality control.
* These can be overridden via PYTHON_JIT_FITNESS_* environment variables. */
#define FITNESS_INITIAL 1000
Comment thread
cocolato marked this conversation as resolved.
Outdated
/* Starting fitness when tracing begins from an exit (a side trace). */
#define FITNESS_INITIAL_SIDE 800
/* Fitness subtracted after every translated bytecode instruction. */
#define FITNESS_PER_INSTRUCTION 2
/* Extra fitness cost of a conditional jump whose branch history is fully
 * one-sided (bias 8); costs for other biases are interpolated linearly
 * between this and FITNESS_BRANCH_UNBIASED. */
#define FITNESS_BRANCH_BIASED 5
/* Extra fitness cost of a conditional jump whose branch history is an
 * even split (bias 0). */
#define FITNESS_BRANCH_UNBIASED 25
/* Extra fitness cost of a backward jump. */
#define FITNESS_BACKWARD_EDGE 80
/* Fitness cost of entering a callee frame; the tracer multiplies it by the
 * new frame depth, so deeper inlining costs progressively more. */
#define FITNESS_FRAME_ENTRY 10

/* Default exit quality constants for fitness-based trace termination.
 * Higher values mean better places to stop the trace.
 * These can be overridden via PYTHON_JIT_EXIT_QUALITY_* environment variables. */
/* Quality when the next instruction is ENTER_EXECUTOR. */
#define EXIT_QUALITY_ENTER_EXECUTOR 500
/* Quality of a position that matches neither of the other two cases. */
#define EXIT_QUALITY_DEFAULT 200
/* Quality when the _PyOpcode_Caches entry for the deoptimized opcode is
 * non-zero. */
#define EXIT_QUALITY_SPECIALIZABLE 50
Comment thread
cocolato marked this conversation as resolved.
Outdated


typedef struct _PyJitUopBuffer {
_PyUOpInstruction *start;
Expand Down Expand Up @@ -101,7 +118,11 @@ typedef struct _PyJitTracerPreviousState {
} _PyJitTracerPreviousState;

/* Mutable state carried from one bytecode to the next while a trace is
 * being translated.  The fitness, best-exit and frame-depth fields are
 * reset in _PyJit_TryInitializeTracing when tracing starts. */
typedef struct _PyJitTracerTranslatorState {
int jump_backward_seen;
int32_t fitness; // Remaining trace budget: starts at the configured initial value and is reduced as bytecodes are translated
int32_t best_exit_quality; // Highest exit quality recorded so far (0 until one is recorded)
int best_exit_buffer_pos; // uop buffer length when the best exit was recorded (-1 = none)
uint32_t best_exit_target; // Bytecode target to use for _EXIT_TRACE at the best exit point
int frame_depth; // Current inline depth (0 = root frame); bumped on callee entry, dropped on other frame changes
} _PyJitTracerTranslatorState;

typedef struct _PyJitTracerState {
Expand Down
171 changes: 166 additions & 5 deletions Python/optimizer.c
Original file line number Diff line number Diff line change
Expand Up @@ -549,8 +549,6 @@ dynamic_exit_uop[MAX_UOP_ID + 1] = {
};


#define CONFIDENCE_RANGE 1000
#define CONFIDENCE_CUTOFF 333

#ifdef Py_DEBUG
#define DPRINTF(level, ...) \
Expand Down Expand Up @@ -598,6 +596,86 @@ add_to_trace(
((uint32_t)((INSTR) - ((_Py_CODEUNIT *)(CODE)->co_code_adaptive)))


/* Measure how one-sided a branch has been, given its 16-bit history register.
 * The result is the distance between the number of set bits and 8:
 * 0 means an even 50/50 split (unpredictable), 8 means all sixteen bits
 * agree (fully biased). */
static inline int
compute_branch_bias(uint16_t history)
{
    int set_bits = _Py_popcount32((uint32_t)history);
    int skew = set_bits - 8;
    return (skew < 0) ? -skew : skew;
}

/* Compute exit quality for the current trace position.
* Higher values mean it's a better place to stop the trace. */
static inline int32_t
compute_exit_quality(_Py_CODEUNIT *target_instr, int opcode,
Comment thread
cocolato marked this conversation as resolved.
const _PyOptimizationConfig *cfg)
{
if (target_instr->op.code == ENTER_EXECUTOR) {
return (int32_t)cfg->exit_quality_enter_executor;
}
if (_PyOpcode_Caches[_PyOpcode_Deopt[opcode]] > 0) {
return (int32_t)cfg->exit_quality_specializable;
}
return (int32_t)cfg->exit_quality_default;
}

/* Try to truncate the trace to the best recorded exit point.
* Returns 1 if successful, 0 if no valid best exit exists.
* Enforces progress constraints: the fallback position must satisfy
* the minimum trace length requirements. */
static inline int
Comment thread
cocolato marked this conversation as resolved.
Outdated
try_best_exit_fallback(
_PyJitUopBuffer *trace,
_PyJitTracerTranslatorState *ts,
bool progress_needed)
{
int best_pos = ts->best_exit_buffer_pos;
if (best_pos <= 0) {
return 0;
} else if (progress_needed && best_pos <= CODE_SIZE_NO_PROGRESS) {
return 0;
} else if (!progress_needed && best_pos <= CODE_SIZE_EMPTY) {
return 0;
}
trace->next = trace->start + best_pos;
/* Caller must add terminator (_EXIT_TRACE) after this */
return 1;
}

/* Update trace fitness after translating one bytecode instruction. */
static inline void
update_trace_fitness(
Comment thread
cocolato marked this conversation as resolved.
Outdated
_PyJitTracerTranslatorState *ts,
int opcode,
_Py_CODEUNIT *target_instr,
const _PyOptimizationConfig *cfg)
{
ts->fitness -= cfg->fitness_per_instruction;

switch (opcode) {
case POP_JUMP_IF_FALSE:
case POP_JUMP_IF_TRUE:
case POP_JUMP_IF_NONE:
case POP_JUMP_IF_NOT_NONE: {
int bias = compute_branch_bias(target_instr[1].cache);
/* Linear interpolation: bias 0 → unbiased penalty, bias 8 → biased penalty */
int penalty = cfg->fitness_branch_unbiased
- (bias * (cfg->fitness_branch_unbiased - cfg->fitness_branch_biased)) / 8;
ts->fitness -= penalty;
break;
}
case JUMP_BACKWARD:
case JUMP_BACKWARD_JIT:
case JUMP_BACKWARD_NO_JIT:
ts->fitness -= cfg->fitness_backward_edge;
break;
default:
break;
}
}
Comment on lines +633 to +637
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The newest result with lower frame penalty:

+----------------------+----------+-----------------------+
| Benchmark            | baseline | fitness               |
+======================+==========+=======================+
| raytrace             | 262 ms   | 212 ms: 1.24x faster  |
+----------------------+----------+-----------------------+
| pickle_pure_python   | 277 us   | 254 us: 1.09x faster  |
+----------------------+----------+-----------------------+
| go                   | 83.7 ms  | 78.8 ms: 1.06x faster |
+----------------------+----------+-----------------------+
| xml_etree_iterparse  | 74.1 ms  | 69.9 ms: 1.06x faster |
+----------------------+----------+-----------------------+
| xml_etree_process    | 50.4 ms  | 48.3 ms: 1.04x faster |
+----------------------+----------+-----------------------+
| xml_etree_generate   | 77.3 ms  | 74.1 ms: 1.04x faster |
+----------------------+----------+-----------------------+
| xml_etree_parse      | 122 ms   | 119 ms: 1.03x faster  |
+----------------------+----------+-----------------------+
| regex_compile        | 103 ms   | 99.7 ms: 1.03x faster |
+----------------------+----------+-----------------------+
| deltablue            | 2.19 ms  | 2.14 ms: 1.03x faster |
+----------------------+----------+-----------------------+
| unpickle_pure_python | 171 us   | 167 us: 1.02x faster  |
+----------------------+----------+-----------------------+
| regex_effbot         | 2.16 ms  | 2.12 ms: 1.02x faster |
+----------------------+----------+-----------------------+
| fannkuch             | 253 ms   | 249 ms: 1.02x faster  |
+----------------------+----------+-----------------------+
| json_loads           | 19.3 us  | 19.1 us: 1.01x faster |
+----------------------+----------+-----------------------+
| json_dumps           | 7.60 ms  | 7.66 ms: 1.01x slower |
+----------------------+----------+-----------------------+
| pidigits             | 136 ms   | 138 ms: 1.01x slower  |
+----------------------+----------+-----------------------+
| pyflate              | 267 ms   | 273 ms: 1.02x slower  |
+----------------------+----------+-----------------------+
| float                | 45.2 ms  | 46.2 ms: 1.02x slower |
+----------------------+----------+-----------------------+
| richards             | 16.5 ms  | 17.3 ms: 1.05x slower |
+----------------------+----------+-----------------------+
| Geometric mean       | (ref)    | 1.03x faster          |
+----------------------+----------+-----------------------+



static int
is_terminator(const _PyUOpInstruction *uop)
{
Expand Down Expand Up @@ -730,17 +808,44 @@ _PyJit_translate_single_bytecode_to_trace(
goto unsupported;
}

// Track frame depth changes for fitness (only for supported frame transitions)
if (frame != tracer->prev_state.instr_frame) {
_PyJitTracerTranslatorState *ts_depth = &tracer->translator_state;
if (frame->previous == tracer->prev_state.instr_frame) {
ts_depth->frame_depth++;
// Penalty scales with depth: shallow inlining is cheap,
// deep inlining gets progressively more expensive.
int32_t penalty = (int32_t)tstate->interp->opt_config.fitness_frame_entry
* ts_depth->frame_depth;
ts_depth->fitness -= penalty;
} else if (ts_depth->frame_depth > 0) {
ts_depth->frame_depth--;
}
}
Comment thread
cocolato marked this conversation as resolved.
Outdated

if (oparg > 0xFFFF) {
DPRINTF(2, "Unsupported: oparg too large\n");
unsupported:
{
// Rewind to previous instruction and replace with _EXIT_TRACE.
// If we have a high-quality best_exit (enter_executor, etc.),
Comment thread
cocolato marked this conversation as resolved.
Outdated
// prefer it over rewinding to last _SET_IP — this covers the
// main unsupported path, not just the edge case.
_PyJitTracerTranslatorState *ts_unsup = &tracer->translator_state;
if (ts_unsup->best_exit_quality > (int32_t)tstate->interp->opt_config.exit_quality_default &&
try_best_exit_fallback(trace, ts_unsup, progress_needed)) {
ADD_TO_TRACE(_EXIT_TRACE, 0, 0, ts_unsup->best_exit_target);
uop_buffer_last(trace)->operand1 = true; // is_control_flow
OPT_STAT_INC(best_exit_fallback);
DPRINTF(2, "Best-exit fallback at unsupported (pos=%d, quality=%d)\n",
ts_unsup->best_exit_buffer_pos, ts_unsup->best_exit_quality);
goto done;
}
// Fall back: rewind to last _SET_IP and replace with _DEOPT.
_PyUOpInstruction *curr = uop_buffer_last(trace);
while (curr->opcode != _SET_IP && uop_buffer_length(trace) > 2) {
trace->next--;
curr = uop_buffer_last(trace);
}
assert(curr->opcode == _SET_IP || uop_buffer_length(trace) == 2);
if (curr->opcode == _SET_IP) {
int32_t old_target = (int32_t)uop_get_target(curr);
curr->opcode = _DEOPT;
Expand All @@ -763,6 +868,39 @@ _PyJit_translate_single_bytecode_to_trace(
return 1;
}

// Fitness-based trace quality check (before reserving space for this instruction)
_PyJitTracerTranslatorState *ts = &tracer->translator_state;
int32_t eq = compute_exit_quality(target_instr, opcode,
&tstate->interp->opt_config);

// Record best exit candidate.
// Only record after minimum progress to avoid truncating to near-empty traces.
if (eq > ts->best_exit_quality &&
uop_buffer_length(trace) > CODE_SIZE_NO_PROGRESS) {
ts->best_exit_quality = eq;
ts->best_exit_buffer_pos = uop_buffer_length(trace);
ts->best_exit_target = target;
}

// Check if fitness is depleted — should we stop the trace?
if (ts->fitness < eq &&
!(progress_needed && uop_buffer_length(trace) < CODE_SIZE_NO_PROGRESS)) {
Comment thread
cocolato marked this conversation as resolved.
Outdated
// Prefer stopping at the best recorded exit point
Comment thread
cocolato marked this conversation as resolved.
Outdated
if (try_best_exit_fallback(trace, ts, progress_needed)) {
ADD_TO_TRACE(_EXIT_TRACE, 0, 0, ts->best_exit_target);
uop_buffer_last(trace)->operand1 = true; // is_control_flow
}
else {
// No valid best exit — stop at current position
ADD_TO_TRACE(_EXIT_TRACE, 0, 0, target);
uop_buffer_last(trace)->operand1 = true; // is_control_flow
Comment thread
cocolato marked this conversation as resolved.
Outdated
}
OPT_STAT_INC(fitness_terminated_traces);
DPRINTF(2, "Fitness terminated: fitness=%d < exit_quality=%d\n",
ts->fitness, eq);
goto done;
}

// One for possible _DEOPT, one because _CHECK_VALIDITY itself might _DEOPT
trace->end -= 2;

Expand Down Expand Up @@ -793,6 +931,12 @@ _PyJit_translate_single_bytecode_to_trace(
DPRINTF(2, "No room for expansions and guards (need %d, got %d)\n",
space_needed, uop_buffer_remaining_space(trace));
OPT_STAT_INC(trace_too_long);
// Try best-exit fallback before giving up
if (try_best_exit_fallback(trace, &tracer->translator_state, progress_needed)) {
ADD_TO_TRACE(_EXIT_TRACE, 0, 0, tracer->translator_state.best_exit_target);
uop_buffer_last(trace)->operand1 = true; // is_control_flow
OPT_STAT_INC(best_exit_fallback);
}
goto done;
}

Expand Down Expand Up @@ -986,7 +1130,12 @@ _PyJit_translate_single_bytecode_to_trace(
ADD_TO_TRACE(_JUMP_TO_TOP, 0, 0, 0);
goto done;
}
DPRINTF(2, "Trace continuing\n");
// Update fitness AFTER translation, BEFORE returning to continue tracing.
// This ensures the next iteration's fitness check reflects the cost of
// all instructions translated so far.
update_trace_fitness(&tracer->translator_state, opcode, target_instr,
&tstate->interp->opt_config);
DPRINTF(2, "Trace continuing (fitness=%d)\n", tracer->translator_state.fitness);
return 1;
done:
DPRINTF(2, "Trace done\n");
Expand Down Expand Up @@ -1069,6 +1218,18 @@ _PyJit_TryInitializeTracing(
assert(curr_instr->op.code == JUMP_BACKWARD_JIT || curr_instr->op.code == RESUME_CHECK_JIT || (exit != NULL));
tracer->initial_state.jump_backward_instr = curr_instr;

// Initialize fitness tracking state
const _PyOptimizationConfig *cfg = &tstate->interp->opt_config;
_PyJitTracerTranslatorState *ts = &tracer->translator_state;
bool is_side_trace = (exit != NULL);
ts->fitness = is_side_trace
? (int32_t)cfg->fitness_initial_side
: (int32_t)cfg->fitness_initial;
ts->best_exit_quality = 0;
ts->best_exit_buffer_pos = -1;
ts->best_exit_target = 0;
ts->frame_depth = 0;

tracer->is_tracing = true;
return 1;
}
Expand Down
34 changes: 34 additions & 0 deletions Python/pystate.c
Original file line number Diff line number Diff line change
Expand Up @@ -635,6 +635,40 @@ init_interpreter(PyInterpreterState *interp,
"PYTHON_JIT_SIDE_EXIT_INITIAL_BACKOFF",
SIDE_EXIT_INITIAL_BACKOFF, 0, MAX_BACKOFF);

// Trace fitness configuration
init_policy(&interp->opt_config.fitness_initial,
"PYTHON_JIT_FITNESS_INITIAL",
FITNESS_INITIAL, 100, 10000);
init_policy(&interp->opt_config.fitness_initial_side,
"PYTHON_JIT_FITNESS_INITIAL_SIDE",
FITNESS_INITIAL_SIDE, 50, 5000);
init_policy(&interp->opt_config.fitness_per_instruction,
"PYTHON_JIT_FITNESS_PER_INSTRUCTION",
FITNESS_PER_INSTRUCTION, 0, 100);
init_policy(&interp->opt_config.fitness_branch_biased,
"PYTHON_JIT_FITNESS_BRANCH_BIASED",
FITNESS_BRANCH_BIASED, 0, 500);
init_policy(&interp->opt_config.fitness_branch_unbiased,
"PYTHON_JIT_FITNESS_BRANCH_UNBIASED",
FITNESS_BRANCH_UNBIASED, 0, 500);
init_policy(&interp->opt_config.fitness_backward_edge,
"PYTHON_JIT_FITNESS_BACKWARD_EDGE",
FITNESS_BACKWARD_EDGE, 0, 1000);
init_policy(&interp->opt_config.fitness_frame_entry,
"PYTHON_JIT_FITNESS_FRAME_ENTRY",
FITNESS_FRAME_ENTRY, 0, 1000);

// Exit quality thresholds
init_policy(&interp->opt_config.exit_quality_enter_executor,
"PYTHON_JIT_EXIT_QUALITY_ENTER_EXECUTOR",
EXIT_QUALITY_ENTER_EXECUTOR, 0, 10000);
init_policy(&interp->opt_config.exit_quality_default,
"PYTHON_JIT_EXIT_QUALITY_DEFAULT",
EXIT_QUALITY_DEFAULT, 0, 10000);
init_policy(&interp->opt_config.exit_quality_specializable,
"PYTHON_JIT_EXIT_QUALITY_SPECIALIZABLE",
EXIT_QUALITY_SPECIALIZABLE, 0, 10000);

interp->opt_config.specialization_enabled = !is_env_enabled("PYTHON_SPECIALIZATION_OFF");
interp->opt_config.uops_optimize_enabled = !is_env_disabled("PYTHON_UOPS_OPTIMIZE");
if (interp != &runtime->_main_interpreter) {
Expand Down
2 changes: 2 additions & 0 deletions Python/pystats.c
Original file line number Diff line number Diff line change
Expand Up @@ -274,6 +274,8 @@ print_optimization_stats(FILE *out, OptimizationStats *stats)
fprintf(out, "Optimization low confidence: %" PRIu64 "\n", stats->low_confidence);
fprintf(out, "Optimization unknown callee: %" PRIu64 "\n", stats->unknown_callee);
fprintf(out, "Executors invalidated: %" PRIu64 "\n", stats->executors_invalidated);
fprintf(out, "Optimization fitness terminated: %" PRIu64 "\n", stats->fitness_terminated_traces);
fprintf(out, "Optimization best exit fallback: %" PRIu64 "\n", stats->best_exit_fallback);

print_histogram(out, "Trace length", stats->trace_length_hist);
print_histogram(out, "Trace run length", stats->trace_run_length_hist);
Expand Down
Loading