Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
1bfa176
add fitness && exit quality mechanism
cocolato Mar 31, 2026
2f9438a
Rewrite the code structure
cocolato Apr 1, 2026
709c0a1
address review
cocolato Apr 1, 2026
ef6ac24
address many reviews
cocolato Apr 2, 2026
21f7122
Merge branch 'main' into jit-tracer-fitness
cocolato Apr 2, 2026
b99fe61
optimize some constants
cocolato Apr 2, 2026
d09afb5
fix comment
cocolato Apr 2, 2026
c9957c3
fix constant
cocolato Apr 2, 2026
9447546
reduce frame penalty
cocolato Apr 3, 2026
7d3e4c4
add debug log
cocolato Apr 3, 2026
2c1b5e0
address review
cocolato Apr 3, 2026
2409b2f
address review
cocolato Apr 3, 2026
88a91dc
Merge branch 'python:main' into jit-tracer-fitness
cocolato Apr 4, 2026
4e12f04
Merge branch 'main' into jit-tracer-fitness
cocolato Apr 6, 2026
4bd251e
fine tune parameters
cocolato Apr 6, 2026
1d93208
remove some special cases
cocolato Apr 6, 2026
386c23a
Merge branch 'main' into jit-tracer-fitness
cocolato Apr 10, 2026
83fd8ab
rewrite fitness mechanism
cocolato Apr 10, 2026
c900563
remove static assert
cocolato Apr 10, 2026
97d8be4
Merge branch 'main' into jit-tracer-fitness
cocolato Apr 12, 2026
559b164
Merge branch 'main' into jit-tracer-fitness
cocolato Apr 14, 2026
7a5e1fe
address partial review
cocolato Apr 14, 2026
9324df0
restore slots_rev
cocolato Apr 14, 2026
e69443b
address review
cocolato Apr 14, 2026
751a1d9
Raise MAX_TARGET_LENGTH to 800, compute branch after slots, ignore ENT…
Fidget-Spinner Apr 14, 2026
896e4fe
reduce MAX_TARGET_LENGTH
Fidget-Spinner Apr 14, 2026
1364159
fix tests
Fidget-Spinner Apr 14, 2026
9fbec75
fix a bug
Fidget-Spinner Apr 14, 2026
76b9c9e
magic numbers
Fidget-Spinner Apr 14, 2026
d565f41
lint
Fidget-Spinner Apr 14, 2026
598d332
reduce the trace length to less than half
Fidget-Spinner Apr 14, 2026
64f3468
Address review
Fidget-Spinner Apr 15, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Include/cpython/pystats.h
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,7 @@ typedef struct _optimization_stats {
uint64_t unknown_callee;
uint64_t trace_immediately_deopts;
uint64_t executors_invalidated;
uint64_t fitness_terminated_traces;
UOpStats opcode[PYSTATS_MAX_UOP_ID + 1];
uint64_t unsupported_opcode[256];
uint64_t trace_length_hist[_Py_UOP_HIST_SIZE];
Expand Down
3 changes: 3 additions & 0 deletions Include/internal/pycore_interp_structs.h
Original file line number Diff line number Diff line change
Expand Up @@ -449,6 +449,9 @@ typedef struct _PyOptimizationConfig {
uint16_t side_exit_initial_value;
uint16_t side_exit_initial_backoff;

// Trace fitness thresholds
uint16_t fitness_initial;

// Optimization flags
bool specialization_enabled;
bool uops_optimize_enabled;
Expand Down
46 changes: 45 additions & 1 deletion Include/internal/pycore_optimizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,49 @@ extern "C" {
#include "pycore_optimizer_types.h"
#include <stdbool.h>

/* Fitness controls how long a trace can grow.
* Starts at FITNESS_INITIAL, then decreases from per-bytecode buffer usage
* plus branch/frame heuristics. The trace stops when fitness drops below the
* current exit_quality.
*
* Design targets for the constants below:
* 1. Reaching the abstract frame-depth limit should drop fitness below
* EXIT_QUALITY_SPECIALIZABLE.
* 2. A backward edge should leave budget for roughly N_BACKWARD_SLACK more
* bytecodes, assuming AVG_SLOTS_PER_INSTRUCTION.
* 3. Roughly seven balanced branches should reduce fitness to
* EXIT_QUALITY_DEFAULT after per-slot costs.
* 4. A push followed by a matching return is net-zero on frame-specific
* fitness, excluding per-slot costs.
*/
#define MAX_TARGET_LENGTH (UOP_MAX_TRACE_LENGTH / 5 * 2)
#define OPTIMIZER_EFFECTIVENESS 2
#define FITNESS_INITIAL (MAX_TARGET_LENGTH * OPTIMIZER_EFFECTIVENESS)

/* Exit quality thresholds: trace stops when fitness < exit_quality.
* Higher = trace is more willing to stop here. */
#define EXIT_QUALITY_CLOSE_LOOP (FITNESS_INITIAL - AVG_SLOTS_PER_INSTRUCTION*4)
#define EXIT_QUALITY_ENTER_EXECUTOR (FITNESS_INITIAL * 3 / 8)
#define EXIT_QUALITY_DEFAULT (FITNESS_INITIAL / 8)
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd increase this to make sure that the fitness cannot drop from above EXIT_QUALITY_DEFAULT to below EXIT_QUALITY_SPECIALIZABLE in a single uop.

#define EXIT_QUALITY_SPECIALIZABLE (FITNESS_INITIAL / 80)

/* Estimated buffer slots per bytecode, used only to derive heuristics.
* Runtime charging uses trace-buffer capacity consumed for each bytecode. */
#define AVG_SLOTS_PER_INSTRUCTION 6

/* Heuristic backward-edge exit quality: leave room for about 1 unroll and
* N_BACKWARD_SLACK more bytecodes before reaching EXIT_QUALITY_CLOSE_LOOP,
* based on AVG_SLOTS_PER_INSTRUCTION. */
#define N_BACKWARD_SLACK 10
#define EXIT_QUALITY_BACKWARD_EDGE (EXIT_QUALITY_CLOSE_LOOP / 2 - N_BACKWARD_SLACK * AVG_SLOTS_PER_INSTRUCTION)
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

NOTE:

The problem here is that when tracing loops, we are treating the start of the loop as the closing point, but we want to stop at the end of the loop otherwise.
We probably need to make the back edge quality calculation a bit more complex.

  • if the jump is to the loop closing point: exit_quality = 0 (to ensure loop is closed)
  • otherwise: exit_quality = high ~(FITNESS - 10 * AVG_SLOTS_PER_INSTRUCTION)

(this can be fixed in a separate PR if would complicate this PR too much)


/* Penalty for a perfectly balanced (50/50) branch.
* 7 such branches (after per-slot cost) exhaust fitness to EXIT_QUALITY_DEFAULT.
* The calculation assumes the branches are spread out roughly equally throughout the trace.
*/
#define FITNESS_BRANCH_BALANCED ((FITNESS_INITIAL - EXIT_QUALITY_DEFAULT - \
(MAX_TARGET_LENGTH / 7 * AVG_SLOTS_PER_INSTRUCTION)) / (7))


typedef struct _PyJitUopBuffer {
_PyUOpInstruction *start;
Expand Down Expand Up @@ -101,7 +144,8 @@ typedef struct _PyJitTracerPreviousState {
} _PyJitTracerPreviousState;

typedef struct _PyJitTracerTranslatorState {
int jump_backward_seen;
int32_t fitness; // Current trace fitness, starts high, decrements
int frame_depth; // Current inline depth (0 = root frame)
} _PyJitTracerTranslatorState;

typedef struct _PyJitTracerState {
Expand Down
8 changes: 6 additions & 2 deletions Lib/test/test_capi/test_opt.py
Original file line number Diff line number Diff line change
Expand Up @@ -1358,9 +1358,13 @@ def testfunc(n):
for _ in gen(n):
pass
testfunc(TIER2_THRESHOLD * 2)
# The generator may be inlined into testfunc's trace,
# so check whichever executor contains _YIELD_VALUE.
gen_ex = get_first_executor(gen)
self.assertIsNotNone(gen_ex)
uops = get_opnames(gen_ex)
testfunc_ex = get_first_executor(testfunc)
ex = gen_ex or testfunc_ex
self.assertIsNotNone(ex)
uops = get_opnames(ex)
self.assertNotIn("_MAKE_HEAP_SAFE", uops)
self.assertIn("_YIELD_VALUE", uops)

Expand Down
5 changes: 4 additions & 1 deletion Modules/_testinternalcapi/test_cases.c.h

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 4 additions & 1 deletion Python/bytecodes.c
Original file line number Diff line number Diff line change
Expand Up @@ -6359,7 +6359,10 @@ dummy_func(
tracer->prev_state.instr_frame = frame;
tracer->prev_state.instr_oparg = oparg;
tracer->prev_state.instr_stacklevel = PyStackRef_IsNone(frame->f_executable) ? 2 : STACK_LEVEL();
if (_PyOpcode_Caches[_PyOpcode_Deopt[opcode]]) {
if (_PyOpcode_Caches[_PyOpcode_Deopt[opcode]]
// Branch opcodes use the cache for branch history, not
// specialization counters. Don't reset it.
&& !IS_CONDITIONAL_JUMP_OPCODE(opcode)) {
(&next_instr[1])->counter = trigger_backoff_counter();
}

Expand Down
5 changes: 4 additions & 1 deletion Python/generated_cases.c.h

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

163 changes: 127 additions & 36 deletions Python/optimizer.c
Original file line number Diff line number Diff line change
Expand Up @@ -549,8 +549,6 @@ dynamic_exit_uop[MAX_UOP_ID + 1] = {
};


#define CONFIDENCE_RANGE 1000
#define CONFIDENCE_CUTOFF 333

#ifdef Py_DEBUG
#define DPRINTF(level, ...) \
Expand Down Expand Up @@ -598,6 +596,48 @@ add_to_trace(
((uint32_t)((INSTR) - ((_Py_CODEUNIT *)(CODE)->co_code_adaptive)))


/* Branch penalty, scaled by how biased the 16-bit branch history is:
 * 0 when every recorded outcome followed the traced direction,
 * FITNESS_BRANCH_BALANCED at 50/50, and 2*FITNESS_BRANCH_BALANCED when
 * every recorded outcome went against the traced direction. */
static inline int
compute_branch_penalty(uint16_t history)
{
    int taken_bits = _Py_popcount32((uint32_t)history);
    // Bit 0 holds the most recent outcome, i.e. the direction this trace took.
    int on_trace = (history & 1) ? taken_bits : 16 - taken_bits;
    // Penalty grows linearly with the number of off-trace outcomes.
    return (16 - on_trace) * FITNESS_BRANCH_BALANCED / 8;
}

/* Compute exit quality for the current trace position.
 * Higher values mean better places to stop the trace.
 * Checks are ordered by priority: closing the traced loop beats joining
 * another executor, which beats a generic backward edge, which beats a
 * still-specializable instruction. */
static inline int32_t
compute_exit_quality(_Py_CODEUNIT *target_instr, int opcode,
                     const _PyJitTracerState *tracer)
{
    // Best case: we are back at the instruction that closes our own loop.
    if (target_instr == tracer->initial_state.close_loop_instr) {
        return EXIT_QUALITY_CLOSE_LOOP;
    }
    // Landing on an existing executor lets this trace rejoin it cheaply.
    if (target_instr->op.code == ENTER_EXECUTOR &&
        !_PyJit_EnterExecutorShouldStopTracing(opcode)) {
        return EXIT_QUALITY_ENTER_EXECUTOR;
    }
    switch (opcode) {
        case JUMP_BACKWARD_JIT:
        case JUMP_BACKWARD:
        case JUMP_BACKWARD_NO_INTERRUPT:
            // Backward edge of some (possibly foreign) loop.
            return EXIT_QUALITY_BACKWARD_EDGE;
        default:
            break;
    }
    // Instructions with inline caches may still specialize; a side exit
    // here gives specialization a chance to kick in.
    if (_PyOpcode_Caches[_PyOpcode_Deopt[opcode]] > 0) {
        return EXIT_QUALITY_SPECIALIZABLE;
    }
    return EXIT_QUALITY_DEFAULT;
}

/* Frame penalty: (MAX_ABSTRACT_FRAME_DEPTH-1) pushes exhaust fitness.
 * The budget is divided evenly across the deepest legal inline depth; the
 * trailing +1 guarantees the limit is reached even when the division is
 * exact or truncates. */
static inline int32_t
compute_frame_penalty(uint16_t fitness_initial)
{
    int32_t per_push = (int32_t)fitness_initial / (MAX_ABSTRACT_FRAME_DEPTH - 1);
    return per_push + 1;
}

static int
is_terminator(const _PyUOpInstruction *uop)
{
Expand Down Expand Up @@ -734,13 +774,11 @@ _PyJit_translate_single_bytecode_to_trace(
DPRINTF(2, "Unsupported: oparg too large\n");
unsupported:
{
// Rewind to previous instruction and replace with _EXIT_TRACE.
_PyUOpInstruction *curr = uop_buffer_last(trace);
while (curr->opcode != _SET_IP && uop_buffer_length(trace) > 2) {
trace->next--;
curr = uop_buffer_last(trace);
}
assert(curr->opcode == _SET_IP || uop_buffer_length(trace) == 2);
if (curr->opcode == _SET_IP) {
int32_t old_target = (int32_t)uop_get_target(curr);
curr->opcode = _DEOPT;
Expand All @@ -763,11 +801,29 @@ _PyJit_translate_single_bytecode_to_trace(
return 1;
}

// Stop the trace if fitness has dropped below the exit quality threshold.
_PyJitTracerTranslatorState *ts = &tracer->translator_state;
int32_t eq = compute_exit_quality(target_instr, opcode, tracer);
DPRINTF(3, "Fitness check: %s(%d) fitness=%d, exit_quality=%d, depth=%d\n",
_PyOpcode_OpName[opcode], oparg, ts->fitness, eq, ts->frame_depth);

if (ts->fitness < eq) {
// Heuristic exit: leave operand1=0 so the side exit increments chain_depth.
ADD_TO_TRACE(_EXIT_TRACE, 0, 0, target);
OPT_STAT_INC(fitness_terminated_traces);
DPRINTF(2, "Fitness terminated: %s(%d) fitness=%d < exit_quality=%d\n",
_PyOpcode_OpName[opcode], oparg, ts->fitness, eq);
goto done;
}

// Snapshot the buffer before reserving tail slots. The later charge
// includes both emitted uops and capacity reserved for exits/deopts/errors.
_PyUOpInstruction *next_before = trace->next;
_PyUOpInstruction *end_before = trace->end;

// One for possible _DEOPT, one because _CHECK_VALIDITY itself might _DEOPT
trace->end -= 2;

const struct opcode_macro_expansion *expansion = &_PyOpcode_macro_expansion[opcode];

assert(opcode != ENTER_EXECUTOR && opcode != EXTENDED_ARG);
assert(!_PyErr_Occurred(tstate));

Expand All @@ -788,13 +844,11 @@ _PyJit_translate_single_bytecode_to_trace(
// _GUARD_IP leads to an exit.
trace->end -= needs_guard_ip;

#if Py_DEBUG
const struct opcode_macro_expansion *expansion = &_PyOpcode_macro_expansion[opcode];
int space_needed = expansion->nuops + needs_guard_ip + 2 + (!OPCODE_HAS_NO_SAVE_IP(opcode));
if (uop_buffer_remaining_space(trace) < space_needed) {
DPRINTF(2, "No room for expansions and guards (need %d, got %d)\n",
space_needed, uop_buffer_remaining_space(trace));
OPT_STAT_INC(trace_too_long);
goto done;
}
assert(uop_buffer_remaining_space(trace) > space_needed);
#endif

ADD_TO_TRACE(_CHECK_VALIDITY, 0, 0, target);

Expand All @@ -816,36 +870,22 @@ _PyJit_translate_single_bytecode_to_trace(
assert(jump_happened ? (next_instr == computed_jump_instr) : (next_instr == computed_next_instr));
uint32_t uopcode = BRANCH_TO_GUARD[opcode - POP_JUMP_IF_FALSE][jump_happened];
ADD_TO_TRACE(uopcode, 0, 0, INSTR_IP(jump_happened ? computed_next_instr : computed_jump_instr, old_code));
int bp = compute_branch_penalty(target_instr[1].cache);
tracer->translator_state.fitness -= bp;
DPRINTF(3, " branch penalty: -%d (history=0x%04x, taken=%d) -> fitness=%d\n",
bp, target_instr[1].cache, jump_happened,
tracer->translator_state.fitness);

break;
}
case JUMP_BACKWARD_JIT:
// This is possible as the JIT might have re-activated after it was disabled
case JUMP_BACKWARD_NO_JIT:
case JUMP_BACKWARD:
ADD_TO_TRACE(_CHECK_PERIODIC, 0, 0, target);
_Py_FALLTHROUGH;
break;
case JUMP_BACKWARD_NO_INTERRUPT:
{
if ((next_instr != tracer->initial_state.close_loop_instr) &&
(next_instr != tracer->initial_state.start_instr) &&
uop_buffer_length(&tracer->code_buffer) > CODE_SIZE_NO_PROGRESS &&
// For side exits, we don't want to terminate them early.
tracer->initial_state.exit == NULL &&
// These are coroutines, and we want to unroll those usually.
opcode != JUMP_BACKWARD_NO_INTERRUPT) {
// We encountered a JUMP_BACKWARD but not to the top of our own loop.
// We don't want to continue tracing as we might get stuck in the
// inner loop. Instead, end the trace where the executor of the
// inner loop might start and let the traces rejoin.
OPT_STAT_INC(inner_loop);
ADD_TO_TRACE(_EXIT_TRACE, 0, 0, target);
uop_buffer_last(trace)->operand1 = true; // is_control_flow
DPRINTF(2, "JUMP_BACKWARD not to top ends trace %p %p %p\n", next_instr,
tracer->initial_state.close_loop_instr, tracer->initial_state.start_instr);
goto done;
}
break;
}

case RESUME:
case RESUME_CHECK:
Expand Down Expand Up @@ -945,6 +985,36 @@ _PyJit_translate_single_bytecode_to_trace(
assert(next->op.code == STORE_FAST);
operand = next->op.arg;
}
else if (uop == _PUSH_FRAME) {
_PyJitTracerTranslatorState *ts_depth = &tracer->translator_state;
ts_depth->frame_depth++;
assert(ts_depth->frame_depth < MAX_ABSTRACT_FRAME_DEPTH);
int32_t frame_penalty = compute_frame_penalty(tstate->interp->opt_config.fitness_initial);
ts_depth->fitness -= frame_penalty;
DPRINTF(3, " _PUSH_FRAME: depth=%d, penalty=-%d -> fitness=%d\n",
ts_depth->frame_depth, frame_penalty,
ts_depth->fitness);
}
else if (uop == _RETURN_VALUE || uop == _RETURN_GENERATOR || uop == _YIELD_VALUE) {
_PyJitTracerTranslatorState *ts_depth = &tracer->translator_state;
int32_t frame_penalty = compute_frame_penalty(tstate->interp->opt_config.fitness_initial);
if (ts_depth->frame_depth <= 0) {
// Returning from a frame we didn't enter — penalize.
ts_depth->fitness -= frame_penalty;
DPRINTF(3, " %s: underflow penalty=-%d -> fitness=%d\n",
_PyOpcode_uop_name[uop], frame_penalty,
ts_depth->fitness);
}
else {
// Symmetric with push: net-zero frame impact.
ts_depth->fitness += frame_penalty;
ts_depth->frame_depth--;
DPRINTF(3, " %s: return reward=+%d, depth=%d -> fitness=%d\n",
_PyOpcode_uop_name[uop], frame_penalty,
ts_depth->frame_depth,
ts_depth->fitness);
}
}
else if (_PyUop_Flags[uop] & HAS_RECORDS_VALUE_FLAG) {
PyObject *recorded_value = tracer->prev_state.recorded_value;
tracer->prev_state.recorded_value = NULL;
Expand Down Expand Up @@ -986,13 +1056,23 @@ _PyJit_translate_single_bytecode_to_trace(
ADD_TO_TRACE(_JUMP_TO_TOP, 0, 0, 0);
goto done;
}
DPRINTF(2, "Trace continuing\n");
// Charge fitness by trace-buffer capacity consumed for this bytecode,
// including both emitted uops and tail reservations.
{
int32_t slots_fwd = (int32_t)(trace->next - next_before);
int32_t slots_rev = (int32_t)(end_before - trace->end);
int32_t slots_used = slots_fwd + slots_rev;
tracer->translator_state.fitness -= slots_used;
DPRINTF(3, " per-insn cost: -%d (fwd=%d, rev=%d) -> fitness=%d\n",
slots_used, slots_fwd, slots_rev,
tracer->translator_state.fitness);
}
DPRINTF(2, "Trace continuing (fitness=%d)\n", tracer->translator_state.fitness);
return 1;
done:
DPRINTF(2, "Trace done\n");
if (!is_terminator(uop_buffer_last(trace))) {
ADD_TO_TRACE(_EXIT_TRACE, 0, 0, target);
uop_buffer_last(trace)->operand1 = true; // is_control_flow
}
return 0;
}
Expand Down Expand Up @@ -1069,6 +1149,13 @@ _PyJit_TryInitializeTracing(
assert(curr_instr->op.code == JUMP_BACKWARD_JIT || curr_instr->op.code == RESUME_CHECK_JIT || (exit != NULL));
tracer->initial_state.jump_backward_instr = curr_instr;

const _PyOptimizationConfig *cfg = &tstate->interp->opt_config;
_PyJitTracerTranslatorState *ts = &tracer->translator_state;
ts->fitness = cfg->fitness_initial;
ts->frame_depth = 0;
DPRINTF(3, "Fitness init: chain_depth=%d, fitness=%d\n",
chain_depth, ts->fitness);

tracer->is_tracing = true;
return 1;
}
Expand Down Expand Up @@ -2101,7 +2188,11 @@ _PyDumpExecutors(FILE *out)
fprintf(out, " node [colorscheme=greys9]\n");
PyInterpreterState *interp = PyInterpreterState_Get();
for (size_t i = 0; i < interp->executor_count; i++) {
executor_to_gv(interp->executor_ptrs[i], out);
_PyExecutorObject *exec = interp->executor_ptrs[i];
if (exec->vm_data.code == NULL) {
continue;
}
executor_to_gv(exec, out);
}
fprintf(out, "}\n\n");
return 0;
Expand Down
5 changes: 5 additions & 0 deletions Python/pystate.c
Original file line number Diff line number Diff line change
Expand Up @@ -635,6 +635,11 @@ init_interpreter(PyInterpreterState *interp,
"PYTHON_JIT_SIDE_EXIT_INITIAL_BACKOFF",
SIDE_EXIT_INITIAL_BACKOFF, 0, MAX_BACKOFF);

// Trace fitness configuration
init_policy(&interp->opt_config.fitness_initial,
"PYTHON_JIT_FITNESS_INITIAL",
FITNESS_INITIAL, EXIT_QUALITY_CLOSE_LOOP, UOP_MAX_TRACE_LENGTH - 1);

interp->opt_config.specialization_enabled = !is_env_enabled("PYTHON_SPECIALIZATION_OFF");
interp->opt_config.uops_optimize_enabled = !is_env_disabled("PYTHON_UOPS_OPTIMIZE");
if (interp != &runtime->_main_interpreter) {
Expand Down
Loading
Loading