Skip to content

Commit 958b7cf

Browse files
committed
fix(callgrind): seed shadow call stack from native stack at START
When CALLGRIND_START_INSTRUMENTATION fires mid-stack (typical for pytest-codspeed: Python reaches the macro several libpython frames deep), callgrind's shadow call stack pointer (csp) is 0 but the real stack is non-empty. Every subsequent client `ret` peels a frame callgrind never saw the matching `call` for, trips handleUnderflow, and leaks the returned-into fn as a top-level fn= block — polluting the flamegraph with phantom roots like _PyEval_EvalFrameDefault, PyObject_Vectorcall, etc. Reconstruct the shadow stack from VG_(get_StackTrace) on the OFF->ON transition. For each native frame: push a (jcc=0, skip-style) call_entry with the captured SP and ret_addr=caller_ip+1. For non-skipped caller frames, synthesize a zero-instruction BBCC tagged with that frame's cxt so obj-skip's `nonskipped` mechanism has a target to fold skipped-subtree costs into. Anonymous IPs (Python JIT regions, CRT glue) resolve via the existing `???` obj path in get_obj_node, so no special trimming is needed. Remove `static` from new_recursion and insert_bbcc_into_hash, renaming them to CLG_(new_recursion) and CLG_(insert_bbcc_into_hash), so callstack.c can reuse them.
1 parent 6fbd12b commit 958b7cf

5 files changed

Lines changed: 178 additions & 63 deletions

File tree

callgrind/bbcc.c

Lines changed: 50 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ void CLG_(zero_bbcc)(BBCC* bbcc)
8585

8686
CLG_ASSERT(bbcc->cxt != 0);
8787
CLG_DEBUG(1, " zero_bbcc: BB %#lx, Cxt %u "
88-
"(fn '%s', rec %u)\n",
88+
"(fn '%s', rec %u)\n",
8989
bb_addr(bbcc->bb),
9090
bbcc->cxt->base_number + bbcc->rec_index,
9191
bbcc->cxt->fn[0]->name,
@@ -113,7 +113,7 @@ void CLG_(forall_bbccs)(void (*func)(BBCC*))
113113
{
114114
BBCC *bbcc, *bbcc2;
115115
int i, j;
116-
116+
117117
for (i = 0; i < current_bbccs.size; i++) {
118118
if ((bbcc=current_bbccs.table[i]) == NULL) continue;
119119
while (bbcc) {
@@ -149,10 +149,10 @@ UInt bbcc_hash_idx(BB* bb, Context* cxt, UInt size)
149149

150150
return ((Addr)bb + (Addr)cxt) % size;
151151
}
152-
152+
153153

154154
/* Lookup for a BBCC in hash.
155-
*/
155+
*/
156156
static
157157
BBCC* lookup_bbcc(BB* bb, Context* cxt)
158158
{
@@ -177,9 +177,9 @@ BBCC* lookup_bbcc(BB* bb, Context* cxt)
177177
cxt != bbcc->cxt)) {
178178
bbcc = bbcc->next;
179179
}
180-
180+
181181
CLG_DEBUG(2," lookup_bbcc(BB %#lx, Cxt %u, fn '%s'): %p (tid %u)\n",
182-
bb_addr(bb), cxt->base_number, cxt->fn[0]->name,
182+
bb_addr(bb), cxt->base_number, cxt->fn[0]->name,
183183
bbcc, bbcc ? bbcc->tid : 0);
184184

185185
CLG_DEBUGIF(2)
@@ -200,13 +200,13 @@ static void resize_bbcc_hash(void)
200200
new_size = 2*current_bbccs.size+3;
201201
new_table = (BBCC**) CLG_MALLOC("cl.bbcc.rbh.1",
202202
new_size * sizeof(BBCC*));
203-
203+
204204
for (i = 0; i < new_size; i++)
205205
new_table[i] = NULL;
206-
206+
207207
for (i = 0; i < current_bbccs.size; i++) {
208208
if (current_bbccs.table[i] == NULL) continue;
209-
209+
210210
curr_BBCC = current_bbccs.table[i];
211211
while (NULL != curr_BBCC) {
212212
next_BBCC = curr_BBCC->next;
@@ -254,15 +254,15 @@ BBCC** new_recursion(int size)
254254

255255
return bbccs;
256256
}
257-
257+
258258

259259
/*
260260
* Allocate a new BBCC
261261
*
262262
* Uninitialized:
263263
* cxt, rec_index, rec_array, next_bbcc, next1, next2
264264
*/
265-
static __inline__
265+
static __inline__
266266
BBCC* new_bbcc(BB* bb)
267267
{
268268
BBCC* bbcc;
@@ -292,7 +292,7 @@ BBCC* new_bbcc(BB* bb)
292292
bbcc->lru_next_bbcc = 0;
293293
bbcc->lru_from_jcc = 0;
294294
bbcc->lru_to_jcc = 0;
295-
295+
296296
CLG_(stat).distinct_bbccs++;
297297

298298
CLG_DEBUG(3, " new_bbcc(BB %#lx): %p (now %d)\n",
@@ -317,7 +317,7 @@ static
317317
void insert_bbcc_into_hash(BBCC* bbcc)
318318
{
319319
UInt idx;
320-
320+
321321
CLG_ASSERT(bbcc->cxt != 0);
322322

323323
CLG_DEBUG(3,"+ insert_bbcc_into_hash(BB %#lx, fn '%s')\n",
@@ -389,10 +389,10 @@ static BBCC* clone_bbcc(BBCC* orig, Context* cxt, Int rec_index)
389389

390390
bbcc->rec_index = 0;
391391
bbcc->cxt = cxt;
392-
bbcc->rec_array = new_recursion(cxt->fn[0]->separate_recursions);
392+
bbcc->rec_array = CLG_(new_recursion)(cxt->fn[0]->separate_recursions);
393393
bbcc->rec_array[0] = bbcc;
394394

395-
insert_bbcc_into_hash(bbcc);
395+
CLG_(insert_bbcc_into_hash)(bbcc);
396396
}
397397
else {
398398
if (CLG_(clo).separate_threads)
@@ -430,7 +430,7 @@ static BBCC* clone_bbcc(BBCC* orig, Context* cxt, Int rec_index)
430430
CLG_FREE(mangled_bbcc);
431431

432432
CLG_(stat).bbcc_clones++;
433-
433+
434434
return bbcc;
435435
};
436436

@@ -440,7 +440,7 @@ static BBCC* clone_bbcc(BBCC* orig, Context* cxt, Int rec_index)
440440
* address. If created, the BBCC is inserted into the BBCC hash.
441441
* Also sets BB_seen_before by reference.
442442
*
443-
*/
443+
*/
444444
BBCC* CLG_(get_bbcc)(BB* bb)
445445
{
446446
BBCC* bbcc;
@@ -508,7 +508,7 @@ static void handleUnderflow(BB* bb)
508508
}
509509
else if (CLG_(current_state).collect)
510510
source_bbcc->ecounter_sum++;
511-
511+
512512
/* Force a new top context, will be set active by push_cxt() */
513513
CLG_(current_fn_stack).top--;
514514
CLG_(current_state).cxt = 0;
@@ -522,12 +522,12 @@ static void handleUnderflow(BB* bb)
522522

523523
if (!seen_before) {
524524
/* set rec array for source BBCC: this is at rec level 1 */
525-
source_bbcc->rec_array = new_recursion(caller->separate_recursions);
525+
source_bbcc->rec_array = CLG_(new_recursion)(caller->separate_recursions);
526526
source_bbcc->rec_array[0] = source_bbcc;
527527

528528
CLG_ASSERT(source_bbcc->cxt == 0);
529529
source_bbcc->cxt = CLG_(current_state).cxt;
530-
insert_bbcc_into_hash(source_bbcc);
530+
CLG_(insert_bbcc_into_hash)(source_bbcc);
531531
}
532532
CLG_ASSERT(CLG_(current_state).bbcc);
533533

@@ -545,7 +545,7 @@ static void handleUnderflow(BB* bb)
545545
CLG_(push_cxt)( CLG_(current_state).bbcc->cxt->fn[0] );
546546
CLG_(push_call_stack)(source_bbcc, 0, CLG_(current_state).bbcc,
547547
(Addr)-1, False);
548-
call_entry_up =
548+
call_entry_up =
549549
&(CLG_(current_call_stack).entry[CLG_(current_call_stack).sp -1]);
550550
/* assume this call is lasting since last dump or
551551
* for a signal handler since its call */
@@ -611,7 +611,7 @@ void CLG_(setup_bbcc)(BB* bb)
611611
last_bbcc->ecounter_sum++;
612612
last_bbcc->jmp[passed].ecounter++;
613613
if (!CLG_(clo).simulate_cache) {
614-
/* update Ir cost */
614+
/* update Ir cost */
615615
UInt instr_count = last_bb->jmp[passed].instr+1;
616616
CLG_(current_state).cost[ fullOffset(EG_IR) ] += instr_count;
617617
}
@@ -645,7 +645,7 @@ void CLG_(setup_bbcc)(BB* bb)
645645

646646
/* A return not matching the top call in our callstack is a jump */
647647
if ( (jmpkind == jk_Return) && (csp >0)) {
648-
Int csp_up = csp-1;
648+
Int csp_up = csp-1;
649649
call_entry* top_ce = &(CLG_(current_call_stack).entry[csp_up]);
650650

651651
/* We have a real return if
@@ -666,7 +666,7 @@ void CLG_(setup_bbcc)(BB* bb)
666666
top_ce = &(CLG_(current_call_stack).entry[csp_up]);
667667
if (top_ce->sp == sp) {
668668
popcount_on_return++;
669-
continue;
669+
continue;
670670
}
671671
}
672672
popcount_on_return = 0;
@@ -709,7 +709,7 @@ void CLG_(setup_bbcc)(BB* bb)
709709
if (CLG_(get_fn_node)(last_bb)->pop_on_jump && (csp>0)) {
710710

711711
call_entry* top_ce = &(CLG_(current_call_stack).entry[csp-1]);
712-
712+
713713
if (top_ce->jcc) {
714714

715715
CLG_DEBUG(1," Pop on Jump!\n");
@@ -784,16 +784,16 @@ void CLG_(setup_bbcc)(BB* bb)
784784
}
785785

786786
/* Handle CALL/RET and update context to get correct BBCC */
787-
787+
788788
if (jmpkind == jk_Return) {
789-
790-
if ((csp == 0) ||
789+
790+
if ((csp == 0) ||
791791
((CLG_(current_fn_stack).top > CLG_(current_fn_stack).bottom) &&
792792
( *(CLG_(current_fn_stack).top-1)==0)) ) {
793793

794794
/* On an empty call stack or at a signal separation marker,
795795
* a RETURN generates a call stack underflow.
796-
*/
796+
*/
797797
handleUnderflow(bb);
798798
CLG_(pop_call_stack)();
799799
}
@@ -808,17 +808,17 @@ void CLG_(setup_bbcc)(BB* bb)
808808
/* if unwinding was done, this actually is a return */
809809
jmpkind = jk_Return;
810810
}
811-
811+
812812
if (jmpkind == jk_Call) {
813813
delayed_push = True;
814814

815815
csp = CLG_(current_call_stack).sp;
816816
if (call_emulation && csp>0)
817-
sp = CLG_(current_call_stack).entry[csp-1].sp;
817+
sp = CLG_(current_call_stack).entry[csp-1].sp;
818818

819819
}
820820
}
821-
821+
822822
/* Change new context if needed, taking delayed_push into account.
823823
*
824824
* The `cxt == 0` clause used to fire regardless of skip, which meant
@@ -841,24 +841,24 @@ void CLG_(setup_bbcc)(BB* bb)
841841
CLG_(push_cxt)(push_fn);
842842
}
843843
CLG_ASSERT(CLG_(current_fn_stack).top > CLG_(current_fn_stack).bottom);
844-
844+
845845
/* If there is a fresh instrumented BBCC, assign current context */
846846
bbcc = CLG_(get_bbcc)(bb);
847847
if (bbcc->cxt == 0) {
848848
CLG_ASSERT(bbcc->rec_array == 0);
849-
849+
850850
bbcc->cxt = CLG_(current_state).cxt;
851-
bbcc->rec_array =
852-
new_recursion((*CLG_(current_fn_stack).top)->separate_recursions);
851+
bbcc->rec_array =
852+
CLG_(new_recursion)((*CLG_(current_fn_stack).top)->separate_recursions);
853853
bbcc->rec_array[0] = bbcc;
854-
855-
insert_bbcc_into_hash(bbcc);
854+
855+
CLG_(insert_bbcc_into_hash)(bbcc);
856856
}
857857
else {
858858
/* get BBCC with current context */
859-
859+
860860
/* first check LRU of last bbcc executed */
861-
861+
862862
if (last_bbcc) {
863863
bbcc = last_bbcc->lru_next_bbcc;
864864
if (bbcc &&
@@ -873,7 +873,7 @@ void CLG_(setup_bbcc)(BB* bb)
873873
bbcc = lookup_bbcc(bb, CLG_(current_state).cxt);
874874
if (!bbcc)
875875
bbcc = clone_bbcc(bb->bbcc_list, CLG_(current_state).cxt, 0);
876-
876+
877877
bb->last_bbcc = bbcc;
878878
}
879879

@@ -891,7 +891,7 @@ void CLG_(setup_bbcc)(BB* bb)
891891
if (CLG_(clo).skip_direct_recursion) {
892892
/* a call was detected, which means that the source BB != 0 */
893893
CLG_ASSERT(CLG_(current_state).bbcc != 0);
894-
/* only increment rec. level if called from different function */
894+
/* only increment rec. level if called from different function */
895895
if (CLG_(current_state).bbcc->cxt->fn[0] != bbcc->cxt->fn[0])
896896
level++;
897897
}
@@ -927,10 +927,10 @@ void CLG_(setup_bbcc)(BB* bb)
927927
}
928928

929929
if (CLG_(clo).collect_jumps && (jmpkind == jk_Jump)) {
930-
930+
931931
/* Handle conditional jumps followed, i.e. trace arcs
932932
* This uses JCC structures, too */
933-
933+
934934
jCC* jcc = CLG_(get_jcc)(last_bbcc, passed, bbcc);
935935
CLG_ASSERT(jcc != 0);
936936
// Change from default, and check if already changed
@@ -940,14 +940,14 @@ void CLG_(setup_bbcc)(BB* bb)
940940
// FIXME: Why can this fail?
941941
// CLG_ASSERT(jcc->jmpkind == jmpkind);
942942
}
943-
943+
944944
jcc->call_counter++;
945945
if (isConditionalJump)
946946
CLG_(stat).jcnd_counter++;
947947
else
948948
CLG_(stat).jump_counter++;
949949
}
950-
950+
951951
CLG_(current_state).bbcc = bbcc;
952952
/* Even though this will be set in instrumented code directly before
953953
* side exits, it needs to be set to 0 here in case an exception
@@ -956,19 +956,19 @@ void CLG_(setup_bbcc)(BB* bb)
956956
// needed for log_* handlers called in this BB
957957
CLG_(bb_base) = bb->obj->offset + bb->offset;
958958
CLG_(cost_base) = bbcc->cost;
959-
959+
960960
CLG_DEBUGIF(1) {
961961
VG_(printf)(" ");
962962
CLG_(print_bbcc_fn)(bbcc);
963963
VG_(printf)("\n");
964964
}
965-
965+
966966
CLG_DEBUG(3,"- setup_bbcc (BB %#lx): Cost %p (Len %u), Instrs %u (Len %u)\n",
967-
bb_addr(bb), bbcc->cost, bb->cost_count,
967+
bb_addr(bb), bbcc->cost, bb->cost_count,
968968
bb->instr_count, bb->instr_len);
969969
CLG_DEBUGIF(3)
970970
CLG_(print_cxt)(-8, CLG_(current_state).cxt, bbcc->rec_index);
971971
CLG_DEBUG(3,"\n");
972-
972+
973973
CLG_(stat).bb_executions++;
974974
}

0 commit comments

Comments
 (0)