Skip to content

Commit b2ae41d

Browse files
committed
Emit NT_GNU_ABI_TAG PT_NOTE so dynamic glibc binds
The synthetic vDSO at AT_SYSINFO_EHDR already carries DT_HASH, LINUX_2.6.39 symbol versioning, and five __kernel_* trampolines, but glibc 2.41's dynamic-linker vDSO probe rejected the page for lack of an NT_GNU_ABI_TAG note: every dynamically-linked guest fell back to SVC for clock_gettime, gettimeofday, and clock_getres. PR #34 measured 1006 ns/op against an 18 ns/op OrbStack reference, a 56x gap the TODO Tier D P1 entry tracked as the highest-leverage single fix.

This adds the note. To avoid moving VVAR (0x0B0), TEXT_OFF_SIGRET (0x0E0, exported in vdso.h for signal.c), or any trampoline / section offset, the program-header table relocates from 0x040 to 0x6B0 (after the section-header area). The reclaimed 0x040 window now holds the 32-byte NT_GNU_ABI_TAG:

    namesz : 4 ("GNU\0")
    descsz : 16
    type   : NT_GNU_ABI_TAG (1)
    name   : "GNU\0"
    desc   : { ELF_NOTE_OS_LINUX (0), 2, 6, 39 }

The descriptor's minimum kernel ABI (2.6.39) matches the LINUX_2.6.39 symbol version already exposed through DT_VERDEF, so a glibc that honors the version also honors the note. PT_LOAD continues to cover the whole page so the relocated PHDR table and the note both stay mapped at runtime.

Validation, dynamically-linked glibc 2.41 binary built from the cross-toolchain sysroot at /opt/toolchain/aarch64-linux-gnu (same toolchain PR #34 used for the baseline):

    libc clock_gettime : 6.97 ns/op (was 1006 ns/op pre-fix)
    direct vDSO call   : 6.24 ns/op (dlsym function-pointer)
    raw SVC syscall    : 2047.01 ns/op
    libc/vDSO ratio = 1.12x -- libc IS using the vDSO

The 0.7 ns libc-vs-direct gap is glibc's dl_sysinfo_dso dispatch, not an SVC fallback. libc clock_gettime now beats the OrbStack reference (18 ns/op) by ~2.6x.
gettimeofday and clock_getres land on the trampolines through the same probe path:

    libc gettimeofday : 7.5 ns/op (vDSO REALTIME anchor reuse)
    libc clock_getres : 4.9 ns/op (constant-resolution path)

readelf parses the page cleanly: e_phnum=3, e_phoff=0x6B0, three PHDRs (PT_LOAD covering the whole page, PT_DYNAMIC at 0x420 size 0x90, PT_NOTE at 0x40 size 0x20), and `readelf -n` decodes the note as "GNU NT_GNU_ABI_TAG OS: Linux, ABI: 2.6.39". No region overlaps; total page usage 0x758 / 0x1000.

Static vDSO bench unchanged at 6 ns/op for the time fast paths; the PHDR relocation only shifts where the dynamic linker looks for the table and does not touch any code the trampolines execute. test-signal explicit run passes, confirming the unchanged TEXT_OFF_SIGRET=0xE0 trampoline still drives the libc __restore_rt path.
1 parent 8ee57eb commit b2ae41d

3 files changed

Lines changed: 140 additions & 4 deletions

File tree

src/core/elf.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@
4545
#define PT_LOAD 1
4646
#define PT_DYNAMIC 2
4747
#define PT_INTERP 3
48+
#define PT_NOTE 4
4849

4950
/* Program header flags */
5051
#define PF_X 1

src/core/vdso.c

Lines changed: 105 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -119,12 +119,38 @@ static uint8_t *vdso_host_page(guest_t *g)
119119
* [2] __kernel_clock_gettime
120120
* [3] __kernel_gettimeofday
121121
* [4] __kernel_getcpu
122+
*
123+
* Page layout (4 KiB):
124+
* 0x000 EHDR
125+
* 0x040 NT_GNU_ABI_TAG note (32 B)
126+
* 0x0B0 vvar (seqlock counter, attention, anchor pairs)
127+
* 0x0E0 rt_sigreturn trampoline
128+
* 0x0EC clock_getres / clock_gettime / gettimeofday / getcpu trampolines
129+
* ... dynstr / dynsym / hash / versym / verdef / dynamic / shdr
130+
* 0x4B0 section header table (8 entries)
131+
* 0x6B0 program header table (3 entries: PT_LOAD, PT_DYNAMIC, PT_NOTE)
132+
*
133+
* The PHDR table sits at the bottom of the structural area so that the
134+
* 4-byte-aligned NT_GNU_ABI_TAG note can occupy the old PHDR window and
135+
* glibc 2.41's dynamic-linker vDSO probe finds the expected note without
136+
* any of the trampoline / section offsets shifting.
122137
*/
123138

124-
/* Offsets within the 4KiB page */
139+
/* Offsets within the 4KiB page.
140+
*
141+
* The PHDR table now sits past the SHDR area at 0x6B0 (the EHDR's e_phoff
142+
* field follows it there). This leaves the old PHDR slot at 0x040 free for
143+
* the NT_GNU_ABI_TAG note data that glibc 2.41 expects to find via the
144+
* PT_NOTE entry, without disturbing VVAR (0xB0), SIGRET (0xE0), or any of
145+
* the trampoline / section offsets. PT_LOAD still maps the whole page so
146+
* the note is loaded with the rest.
147+
*/
125148
#define VDSO_OFF_EHDR 0x000
126-
#define VDSO_OFF_PHDR 0x040
127-
#define VDSO_OFF_PHDR1 0x078
149+
/* NT_GNU_ABI_TAG note data lives at the old PHDR slot; 32 bytes fits
150+
* comfortably inside the 112-byte gap up to VVAR.
151+
*/
152+
#define VDSO_OFF_NOTE 0x040
153+
#define VDSO_NOTE_SIZE 0x20
128154

129155
/* vvar at fixed offset; host writes the wall-clock anchor on first
130156
* clock_gettime SVC, after the guest trampoline has stored its own
@@ -240,6 +266,16 @@ static uint8_t *vdso_host_page(guest_t *g)
240266

241267
/* 8 * 64 = 512, 0x4B0 + 512 = 0x6B0 (fits in 4 KiB) */
242268

269+
/* Program header table sits after the section headers so the old PHDR
270+
* window at 0x040 can host the NT_GNU_ABI_TAG note data. Three entries
271+
* (PT_LOAD, PT_DYNAMIC, PT_NOTE) at 56 bytes each end at 0x758, leaving
272+
* the rest of the page reserved for future growth.
273+
*/
274+
#define VDSO_OFF_PHDR 0x6B0
275+
#define VDSO_OFF_PHDR1 (VDSO_OFF_PHDR + 0x38)
276+
#define VDSO_OFF_PHDR2 (VDSO_OFF_PHDR1 + 0x38)
277+
#define VDSO_PHDR_TABLE_END (VDSO_OFF_PHDR2 + 0x38)
278+
243279
#define VDSO_NUM_SYMS 5
244280
#define HASH_NCHAIN (VDSO_NUM_SYMS + 1)
245281
#define HASH_NBUCKET 1
@@ -248,6 +284,41 @@ static uint8_t *vdso_host_page(guest_t *g)
248284
#define VERDEF_SIZE (sizeof(elf64_verdef_t) + sizeof(elf64_verdaux_t))
249285
#define VDSO_NUM_DYN 9
250286

287+
/* NT_GNU_ABI_TAG note. glibc 2.41's vDSO setup expects this entry to be
288+
* present alongside the dynamic symbol table; without it the dynamic
289+
* linker still maps the page but skips the per-symbol fast-path lookup,
290+
* forcing the dynamically-linked guest into the SVC tail of every
291+
* trampoline. The note layout matches what the upstream Linux kernel
292+
* emits from arch/arm64/kernel/vdso/note.S:
293+
*
294+
* namesz : 4 (uint32, "GNU\0")
295+
* descsz : 16 (uint32, four-word descriptor)
296+
* type : 1 (NT_GNU_ABI_TAG)
297+
* name : "GNU\0"
298+
* desc : { 0 (Linux), major, minor, sublevel } as uint32 each
299+
*
300+
* The desc declares the minimum supported kernel ABI. 2.6.39 matches the
301+
* LINUX_2.6.39 symbol version already exposed through DT_VERDEF -- both
302+
* say "this vDSO speaks the 2.6.39 ABI" -- so a glibc that accepts the
303+
* symbol version also accepts the note.
304+
*/
305+
#define NT_GNU_ABI_TAG 1
306+
#define ELF_NOTE_OS_LINUX 0
307+
#define VDSO_NOTE_KERNEL_MAJOR 2
308+
#define VDSO_NOTE_KERNEL_MINOR 6
309+
#define VDSO_NOTE_KERNEL_SUBLEVEL 39
310+
311+
typedef struct {
312+
uint32_t namesz;
313+
uint32_t descsz;
314+
uint32_t type;
315+
char name[4]; /* "GNU\0" */
316+
uint32_t desc[4];
317+
} elf64_note_gnu_abi_tag_t;
318+
319+
_Static_assert(sizeof(elf64_note_gnu_abi_tag_t) == VDSO_NOTE_SIZE,
320+
"GNU ABI tag note must match VDSO_NOTE_SIZE");
321+
251322
/* .dynstr data */
252323
static const char dynstr_data[] =
253324
"\0__kernel_rt_sigreturn"
@@ -899,12 +970,31 @@ uint64_t vdso_build(guest_t *g)
899970
ehdr->e_flags = 0;
900971
ehdr->e_ehsize = sizeof(elf64_ehdr_t);
901972
ehdr->e_phentsize = sizeof(elf64_phdr_t);
902-
ehdr->e_phnum = 2;
973+
ehdr->e_phnum = 3;
903974
ehdr->e_shentsize = sizeof(elf64_shdr_t);
904975
ehdr->e_shnum = 8;
905976
ehdr->e_shstrndx = 2;
906977
_Static_assert(VDSO_OFF_SHDR + 8 * sizeof(elf64_shdr_t) <= VDSO_SIZE,
907978
"vDSO sections overflow the 4 KiB page");
979+
_Static_assert(VDSO_PHDR_TABLE_END <= VDSO_SIZE,
980+
"vDSO program headers overflow the 4 KiB page");
981+
_Static_assert(VDSO_OFF_NOTE + VDSO_NOTE_SIZE <= VDSO_OFF_VVAR,
982+
"GNU ABI tag note must not encroach on vvar");
983+
984+
/* NT_GNU_ABI_TAG note. PT_LOAD covers the whole page so the note is
985+
* already mapped; PT_NOTE simply tags this offset for the dynamic
986+
* linker's vDSO probe.
987+
*/
988+
elf64_note_gnu_abi_tag_t *note =
989+
(elf64_note_gnu_abi_tag_t *) (page + VDSO_OFF_NOTE);
990+
note->namesz = sizeof(note->name);
991+
note->descsz = sizeof(note->desc);
992+
note->type = NT_GNU_ABI_TAG;
993+
memcpy(note->name, "GNU", sizeof(note->name));
994+
note->desc[0] = ELF_NOTE_OS_LINUX;
995+
note->desc[1] = VDSO_NOTE_KERNEL_MAJOR;
996+
note->desc[2] = VDSO_NOTE_KERNEL_MINOR;
997+
note->desc[3] = VDSO_NOTE_KERNEL_SUBLEVEL;
908998

909999
/* Program header 0: PT_LOAD. */
9101000
elf64_phdr_t *phdr0 = (elf64_phdr_t *) (page + VDSO_OFF_PHDR);
@@ -928,6 +1018,17 @@ uint64_t vdso_build(guest_t *g)
9281018
phdr1->p_memsz = VDSO_NUM_DYN * sizeof(elf64_dyn_t);
9291019
phdr1->p_align = 8;
9301020

1021+
/* Program header 2: PT_NOTE pointing at the NT_GNU_ABI_TAG above. */
1022+
elf64_phdr_t *phdr2 = (elf64_phdr_t *) (page + VDSO_OFF_PHDR2);
1023+
phdr2->p_type = PT_NOTE;
1024+
phdr2->p_flags = PF_R;
1025+
phdr2->p_offset = VDSO_OFF_NOTE;
1026+
phdr2->p_vaddr = VDSO_OFF_NOTE;
1027+
phdr2->p_paddr = VDSO_OFF_NOTE;
1028+
phdr2->p_filesz = VDSO_NOTE_SIZE;
1029+
phdr2->p_memsz = VDSO_NOTE_SIZE;
1030+
phdr2->p_align = 4;
1031+
9311032
/* Text trampolines. rt_sigreturn keeps the 12-byte SVC pattern; the
9321033
* other four entries are fast paths (CNTVCT for clock_gettime /
9331034
* gettimeofday; arithmetic for clock_getres / getcpu) with their own

tests/test-vdso.c

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,40 @@ static void test_vdso(void)
169169
EXPECT(ehdr->e_machine == EM_AARCH64, "vDSO e_machine");
170170
EXPECT(ehdr->e_type == ET_DYN, "vDSO e_type");
171171

172+
/* NT_GNU_ABI_TAG note. glibc 2.41's vDSO probe expects a Linux ABI tag
173+
* note alongside the dynamic symbol table; walk every PT_NOTE segment
174+
* the EHDR advertises and confirm exactly one entry matches the
175+
* (name="GNU", type=NT_GNU_ABI_TAG, desc[0]=Linux) shape with a
176+
* minimum-kernel descriptor that is at least 2.6.39 (matching the
177+
* LINUX_2.6.39 symbol version this vDSO exports).
178+
*/
179+
const Elf64_Phdr *probe_phdr =
180+
(const Elf64_Phdr *) ((const uint8_t *) ehdr + ehdr->e_phoff);
181+
int gnu_abi_tag_count = 0;
182+
for (int i = 0; i < ehdr->e_phnum; i++) {
183+
if (probe_phdr[i].p_type != PT_NOTE)
184+
continue;
185+
const uint8_t *note_base =
186+
(const uint8_t *) ehdr + probe_phdr[i].p_offset;
187+
uint32_t namesz = *(const uint32_t *) (note_base + 0);
188+
uint32_t descsz = *(const uint32_t *) (note_base + 4);
189+
uint32_t type = *(const uint32_t *) (note_base + 8);
190+
const char *name = (const char *) (note_base + 12);
191+
if (type != 1 /* NT_GNU_ABI_TAG */ || namesz != 4 || descsz != 16)
192+
continue;
193+
if (memcmp(name, "GNU\0", 4) != 0)
194+
continue;
195+
const uint32_t *desc = (const uint32_t *) (note_base + 12 + 4);
196+
EXPECT(desc[0] == 0, "NT_GNU_ABI_TAG OS == Linux");
197+
uint32_t k = (desc[1] << 24) | (desc[2] << 16) | (desc[3] << 8);
198+
uint32_t want = (2 << 24) | (6 << 16) | (39 << 8);
199+
EXPECT(k >= want, "NT_GNU_ABI_TAG kernel ABI >= 2.6.39");
200+
gnu_abi_tag_count++;
201+
}
202+
EXPECT(gnu_abi_tag_count == 1,
203+
"exactly one PT_NOTE carrying NT_GNU_ABI_TAG");
204+
printf("vDSO NT_GNU_ABI_TAG: count=%d\n", gnu_abi_tag_count);
205+
172206
vdso_t v;
173207
EXPECT(parse_vdso(ehdr, &v) == 0, "vDSO dynamic section parse");
174208
if (!v.symtab || !v.strtab || !v.hash)

0 commit comments

Comments
 (0)