Skip to content

Commit b768a7c

Browse files
Rewrite side exit jumps in JIT
1 parent b3a3843 commit b768a7c

9 files changed

Lines changed: 128 additions & 7 deletions

File tree

Include/internal/pycore_jit.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@ typedef _Py_CODEUNIT *(*jit_func)(_PyInterpreterFrame *frame, _PyStackRef *stack
2020
int _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction *trace, size_t length);
2121
void _PyJIT_Free(_PyExecutorObject *executor);
2222

23+
// Rewrite the side-exit jumps recorded in `exit_p` (which live inside
// `trunk_executor`'s JIT code) so they jump to `side_exit`'s JIT code.
// Returns 0 on success and -1 if the code's memory protection could not be
// changed.
// This is a function, so it is declared with PyAPI_FUNC; PyAPI_DATA is meant
// for data symbols.
PyAPI_FUNC(int) _PyJit_PatchSideExit(_PyExecutorObject *trunk_executor, _PyExitData *exit_p, _PyExecutorObject *side_exit);
24+
2325
#endif // _Py_JIT
2426

2527
#ifdef __cplusplus

Include/internal/pycore_optimizer.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,12 @@ typedef struct _PyExitData {
4747
uint16_t index;
4848
_Py_BackoffCounter temperature;
4949
struct _PyExecutorObject *executor;
50+
// Locations in the JIT code that contain
51+
// the side exits pointing to this exit.
52+
// This allows us to rewrite them to more efficient direct jumps
53+
// to side traces.
54+
uintptr_t exiting_uop_side_exit_locations[UOP_MAX_SIDE_EXITS_PER_UOP];
55+
int num_side_locations_used;
5056
} _PyExitData;
5157

5258
typedef struct _PyExecutorObject {

Include/internal/pycore_uop_ids.h

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Python/bytecodes.c

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5350,11 +5350,8 @@ dummy_func(
53505350
#ifndef _Py_JIT
53515351
assert(current_executor == (_PyExecutorObject*)executor);
53525352
#endif
5353-
assert(tstate->jit_exit == NULL || tstate->jit_exit->executor == current_executor);
53545353
tstate->current_executor = (PyObject *)executor;
53555354
if (!current_executor->vm_data.valid) {
5356-
assert(tstate->jit_exit->executor == current_executor);
5357-
assert(tstate->current_executor == executor);
53585355
_PyExecutor_ClearExit(tstate->jit_exit);
53595356
DEOPT_IF(true);
53605357
}
@@ -5425,6 +5422,13 @@ dummy_func(
54255422
exit->temperature = restart_backoff_counter(temperature);
54265423
GOTO_TIER_ONE(optimized < 0 ? NULL : target);
54275424
}
5425+
#ifdef _Py_JIT
5426+
int err = _PyJit_PatchSideExit(previous_executor, exit, executor);
5427+
if (err < 0) {
5428+
exit->temperature = restart_backoff_counter(temperature);
5429+
GOTO_TIER_ONE(NULL);
5430+
}
5431+
#endif
54285432
exit->temperature = initial_temperature_backoff_counter();
54295433
}
54305434
assert(tstate->jit_exit == exit);

Python/executor_cases.c.h

Lines changed: 9 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Python/jit.c

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
#include "pycore_template.h"
2626
#include "pycore_tuple.h"
2727
#include "pycore_unicodeobject.h"
28+
#include "pycore_uop_metadata.h"
2829

2930
#include "pycore_jit.h"
3031

@@ -124,6 +125,26 @@ mark_executable(unsigned char *memory, size_t size)
124125
return 0;
125126
}
126127

128+
// Switch a region of JIT memory to read+write protection so that code which
// has already been emitted there can be patched in place.
//
// `size` must be a multiple of the page size (checked with an assert); a
// zero-sized region is accepted and left untouched.
//
// Returns 0 on success. On failure, calls jit_error() and returns -1.
static int
mark_read_writeable(unsigned char *memory, size_t size)
{
    if (size == 0) {
        return 0;
    }
    assert(size % get_page_size() == 0);
#ifdef MS_WINDOWS
    // VirtualProtect() reports the previous protection through a PDWORD, so
    // the out-parameter has to be a DWORD (passing an int * is an
    // incompatible pointer type).
    DWORD old;
    int failed = !VirtualProtect(memory, size, PAGE_READWRITE, &old);
#else
    // NOTE(review): if this memory is ever mapped with MAP_JIT, mprotect()
    // may not be the right way to toggle write access -- confirm against the
    // allocation path.
    int failed = mprotect(memory, size, PROT_WRITE | PROT_READ);
#endif
    if (failed) {
        jit_error("unable to protect executable memory");
        return -1;
    }
    return 0;
}
147+
127148
// JIT compiler stuff: /////////////////////////////////////////////////////////
128149

129150
#define SYMBOL_MASK_WORDS 4
@@ -138,6 +159,7 @@ typedef struct {
138159

139160
// State threaded through the patch functions while one trace is compiled.
// `executor` is set by _PyJIT_Compile() to the executor being compiled, so
// that generated patch code can reach its trace (used when registering
// side-exit locations). `instruction_starts` has one slot per possible trace
// instruction (UOP_MAX_TRACE_LENGTH entries).
typedef struct {
140161
trampoline_state trampolines;
162+
_PyExecutorObject *executor;
141163
uintptr_t instruction_starts[UOP_MAX_TRACE_LENGTH];
142164
} jit_state;
143165

@@ -443,6 +465,22 @@ patch_x86_64_32rx(unsigned char *location, uint64_t value)
443465
patch_32r(location, value);
444466
}
445467

468+
static void
469+
register_side_exit(const _PyUOpInstruction *instruction, unsigned char *loc, const _PyUOpInstruction *exit_trace)
470+
{
471+
if (!(_PyUop_Flags[instruction->opcode] & HAS_EXIT_FLAG)) {
472+
return;
473+
}
474+
assert(exit_trace->opcode == _EXIT_TRACE);
475+
_PyExitData *exit = (_PyExitData *)exit_trace->operand0;
476+
if (exit->num_side_locations_used >= UOP_MAX_SIDE_EXITS_PER_UOP) {
477+
return;
478+
}
479+
exit->exiting_uop_side_exit_locations[exit->num_side_locations_used] = (uintptr_t)loc;
480+
exit->num_side_locations_used++;
481+
}
482+
483+
446484
void patch_aarch64_trampoline(unsigned char *location, int ordinal, jit_state *state);
447485

448486
#include "jit_stencils.h"
@@ -519,6 +557,7 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction trace[], siz
519557
size_t code_size = 0;
520558
size_t data_size = 0;
521559
jit_state state = {0};
560+
state.executor = executor;
522561
for (size_t i = 0; i < length; i++) {
523562
const _PyUOpInstruction *instruction = &trace[i];
524563
group = &stencil_groups[instruction->opcode];
@@ -664,4 +703,47 @@ _PyJIT_Free(_PyExecutorObject *executor)
664703
}
665704
}
666705

706+
#ifdef __x86_64__
// Point the 32-bit PC-relative field at `location` to `value`.
// If the resulting displacement does not fit in a signed 32-bit integer the
// field is left untouched, so the original jump stays in place.
static void
patch_side_exit(unsigned char *location, uint64_t value)
{
    // The target is biased by -4 before patching (presumably the usual
    // addend of a rel32 jump field -- confirm against the stencil's
    // relocation).
    const uint64_t target = value - 0x4;
    const int64_t displacement = (int64_t)(target - (uintptr_t)location);
    const int fits_in_rel32 =
        displacement >= -(1LL << 31) && displacement < (1LL << 31);
    if (fits_in_rel32) {
        patch_32r(location, target);
    }
}
#else
// TODO AArch64
// Side-exit patching is not implemented for this target: deliberately a
// no-op.
static void
patch_side_exit(unsigned char *location, uint64_t value)
{
    (void)value;
    (void)location;
}
#endif
727+
728+
int
729+
_PyJit_PatchSideExit(_PyExecutorObject *trunk_executor, _PyExitData *exit_p, _PyExecutorObject *side_exit)
730+
{
731+
if (exit_p->num_side_locations_used > 0) {
732+
if (mark_read_writeable(trunk_executor->jit_code, trunk_executor->jit_size) < 0) {
733+
return -1;
734+
}
735+
uintptr_t new_target = (uintptr_t)side_exit->jit_code;
736+
for (int i = 0; i < exit_p->num_side_locations_used; i++) {
737+
uintptr_t loc = exit_p->exiting_uop_side_exit_locations[i];
738+
assert(loc >= (uintptr_t)trunk_executor->jit_code);
739+
assert(loc < (uintptr_t)((unsigned char *)trunk_executor->jit_code + trunk_executor->jit_size));
740+
patch_side_exit((unsigned char *)loc, new_target);
741+
}
742+
if (mark_executable(trunk_executor->jit_code, trunk_executor->jit_size) < 0) {
743+
return -1;
744+
}
745+
}
746+
return 0;
747+
}
748+
667749
#endif // _Py_JIT

Python/optimizer.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1209,6 +1209,7 @@ make_executor_from_uops(_PyUOpInstruction *buffer, int length, const _PyBloomFil
12091209
executor->exits[i].index = i;
12101210
executor->exits[i].temperature = initial_temperature_backoff_counter();
12111211
executor->exits[i].executor = cold;
1212+
executor->exits[i].num_side_locations_used = 0;
12121213
}
12131214
int next_exit = exit_count-1;
12141215
_PyUOpInstruction *dest = (_PyUOpInstruction *)&executor->trace[length];

Tools/cases_generator/uop_id_generator.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,20 @@
2121
DEFAULT_OUTPUT = ROOT / "Include/internal/pycore_uop_ids.h"
2222

2323

24+
def generate_max_side_exit_count(analysis: Analysis, out: CWriter) -> None:
    """Emit the ``UOP_MAX_SIDE_EXITS_PER_UOP`` define.

    The value is the largest number of ``EXIT_IF`` tokens found in the body
    of any viable, non-tier-1 uop, and is never smaller than 1.
    """
    # Note: counting EXIT_IF tokens may give fewer than the actual max
    # in the generated asm. However, it's a good enough estimate.
    exit_if_counts = [
        sum(tok.text == "EXIT_IF" for tok in uop.body.tokens())
        for uop in analysis.uops.values()
        if uop.is_viable() and uop.properties.tier != 1
    ]
    # The leading 1 keeps the floor of one slot even when no uop has exits.
    max_side_exit_count = max([1, *exit_if_counts])
    out.emit(f"#define UOP_MAX_SIDE_EXITS_PER_UOP {max_side_exit_count}\n")
36+
37+
2438
def generate_uop_ids(
2539
filenames: list[str], analysis: Analysis, outfile: TextIO, distinct_namespace: bool
2640
) -> None:
@@ -49,6 +63,7 @@ def generate_uop_ids(
4963
next_id += 1
5064

5165
out.emit(f"#define MAX_UOP_ID {next_id-1}\n")
66+
generate_max_side_exit_count(analysis, out)
5267

5368

5469
arg_parser = argparse.ArgumentParser(

Tools/jit/_stencils.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,10 @@ def fold(self, other: typing.Self, body: bytearray) -> typing.Self | None:
170170
def as_c(self, where: str) -> str:
171171
"""Dump this hole as a call to a patch_* function."""
172172
location = f"{where} + {self.offset:#x}"
173+
register_jump_target = ""
174+
if self.value == HoleValue.JUMP_TARGET and self.kind == "R_X86_64_PLT32":
175+
assert self.func == "patch_32r"
176+
register_jump_target = f"register_side_exit(instruction, {location}, &state->executor->trace[instruction->jump_target]);\n "
173177
value = _HOLE_EXPRS[self.value]
174178
if self.symbol:
175179
if value:
@@ -181,7 +185,7 @@ def as_c(self, where: str) -> str:
181185
value += f"{_signed(self.addend):#x}"
182186
if self.need_state:
183187
return f"{self.func}({location}, {value}, state);"
184-
return f"{self.func}({location}, {value});"
188+
return f"{register_jump_target}{self.func}({location}, {value});"
185189

186190

187191
@dataclasses.dataclass

0 commit comments

Comments
 (0)