Skip to content

Commit 6e96b9d

Browse files
committed
Load trampoline directly from file
1 parent 8d2d8cc commit 6e96b9d

2 files changed

Lines changed: 95 additions & 80 deletions

File tree

ext/solib_bootstrap.c

Lines changed: 94 additions & 79 deletions
Original file line numberDiff line numberDiff line change
@@ -93,8 +93,10 @@ static void parse_stack(void *stack_top, struct boot_args *args);
9393
static const char *find_env(char **envp, const char *name);
9494
static Elf64_auxv_t *find_auxv_entry(Elf64_auxv_t *auxv, unsigned long type);
9595
static unsigned long get_auxv(Elf64_auxv_t *auxv, unsigned long type);
96-
static int elf_load_from_memory(const void *src, size_t src_len,
97-
struct trampoline_map *out, long page_size);
96+
static int elf_map_segments(int fd, long file_bias, const Elf64_Phdr *phdrs,
97+
int phnum, uintptr_t base, long page_size);
98+
static int elf_load_trampoline(const void *src, size_t src_len,
99+
struct trampoline_map *out, long page_size);
98100
static int elf_load(const char *path, struct loaded_lib *lib, long page_size);
99101
static int create_patched_memfd(void);
100102
static int uint_to_dec(unsigned int v, char *buf);
@@ -117,6 +119,7 @@ static long sys_write(int fd, const void *buf, long count);
117119
static int sys_close(int fd);
118120
static void *sys_mmap(void *addr, long length, int prot, int flags, int fd, long offset);
119121
static int sys_munmap(void *addr, long length);
122+
120123
static noreturn void sys_exit_group(int status);
121124
static int sys_memfd_create(const char *name, unsigned int flags);
122125
static long sys_sendfile(int out_fd, int in_fd, long count);
@@ -243,7 +246,7 @@ noreturn void _dd_solib_bootstrap(void *stack_top) {
243246
bs_fatal("TRAMPOLINE_BIN not available", 120);
244247

245248
struct trampoline_map tmap;
246-
if (elf_load_from_memory(trampoline_bytes, trampoline_len, &tmap, page_size) < 0)
249+
if (elf_load_trampoline(trampoline_bytes, trampoline_len, &tmap, page_size) < 0)
247250
bs_fatal("failed to map trampoline", 121);
248251

249252
// Step 2: Redirect auxv to the trampoline
@@ -609,61 +612,112 @@ static int elf_reserve(const Elf64_Phdr *phdrs, int phnum, long page_size,
609612
}
610613

611614
// }}}
612-
// ---- Load ELF from memory (for trampoline embedded in ddtrace.so) {{{
615+
// ---- Common PT_LOAD segment mapper {{{
616+
//
617+
// Maps all PT_LOAD segments from an open fd into a pre-reserved address space.
618+
// `file_bias` is added to each segment's page-aligned file offset:
619+
// * pass 0 for a standalone file (e.g. ld.so loaded from its own path)
620+
// * pass (DD_TRAMPOLINE_BIN.ptr - &__ehdr_start) for the trampoline embedded
621+
// in /proc/self/exe
622+
//
623+
// Does NOT close fd on failure; caller is responsible.
624+
//
625+
626+
static int elf_map_segments(int fd, long file_bias, const Elf64_Phdr *phdrs,
627+
int phnum, uintptr_t base, long page_size) {
628+
if (file_bias != BS_PAGE_DOWN(file_bias, page_size)) {
629+
bs_fatal("file_bias not page-aligned", 123);
630+
__builtin_unreachable();
631+
}
632+
633+
for (int i = 0; i < phnum; i++) {
634+
if (phdrs[i].p_type != PT_LOAD) continue;
635+
636+
uintptr_t seg_start = BS_PAGE_DOWN(phdrs[i].p_vaddr, page_size);
637+
uintptr_t seg_file_end = phdrs[i].p_vaddr + phdrs[i].p_filesz;
638+
uintptr_t seg_mem_end = phdrs[i].p_vaddr + phdrs[i].p_memsz;
639+
uintptr_t file_page_end = BS_PAGE_UP(seg_file_end, page_size);
640+
uintptr_t mem_page_end = BS_PAGE_UP(seg_mem_end, page_size);
641+
int prot = elf_pf_to_prot(phdrs[i].p_flags);
642+
643+
if (phdrs[i].p_filesz > 0) {
644+
// ELF spec (gABI): p_vaddr ≡ p_offset (mod p_align), so
645+
// PAGE_DOWN(p_offset) places p_vaddr at the correct address.
646+
long file_offset = file_bias + (long)BS_PAGE_DOWN(phdrs[i].p_offset, page_size);
647+
long file_map_len = (long)(file_page_end - seg_start);
648+
void *seg = sys_mmap((void *)(base + seg_start), file_map_len,
649+
prot, BS_MAP_PRIVATE | BS_MAP_FIXED, fd, file_offset);
650+
if (seg == BS_MAP_FAILED) return -1;
651+
652+
// Zero tail within the last file-backed page (writable only).
653+
// Both glibc and musl do this in dlopen. One can't trust the linkers to
654+
// have the zeros in the file.
655+
if (seg_mem_end > seg_file_end && (phdrs[i].p_flags & PF_W))
656+
bs_memset((void *)(base + seg_file_end), 0,
657+
(long)(file_page_end - seg_file_end));
658+
}
659+
660+
// Anonymous pages for BSS. For pure-BSS segments (p_filesz==0) start
661+
// at seg_start; for mixed segments start after the last file-backed page.
662+
uintptr_t anon_start = (phdrs[i].p_filesz > 0) ? file_page_end : seg_start;
663+
if (mem_page_end > anon_start) {
664+
void *bss = sys_mmap((void *)(base + anon_start),
665+
(long)(mem_page_end - anon_start),
666+
prot,
667+
BS_MAP_PRIVATE | BS_MAP_FIXED | BS_MAP_ANONYMOUS,
668+
-1, 0);
669+
if (bss == BS_MAP_FAILED) return -1;
670+
}
671+
}
672+
return 0;
673+
}
674+
675+
// }}}
676+
// ---- Load trampoline ELF from /proc/self/exe {{{
613677
//
614-
// Like elf_load() but reads from an in-memory byte array instead of a file.
615-
// Maps each PT_LOAD segment into anonymous memory and copies bytes from src.
678+
// The trampoline binary is embedded in ddtrace.so at DD_TRAMPOLINE_BIN.ptr.
679+
// Since ddtrace.so is /proc/self/exe when executed directly, we open that file
680+
// and mmap each PT_LOAD segment directly from it with the correct permissions.
616681

617-
static int elf_load_from_memory(const void *src, size_t src_len,
618-
struct trampoline_map *out, long page_size) {
682+
static int elf_load_trampoline(const void *src, size_t src_len,
683+
struct trampoline_map *out, long page_size) {
619684
bs_memset(out, 0, sizeof(*out));
620685

621686
if (src_len < sizeof(Elf64_Ehdr)) return -1;
622687
const Elf64_Ehdr *ehdr = (const Elf64_Ehdr *)src;
623688

624689
if (elf_check_header(ehdr, 1u << ET_DYN, 32) < 0) return -1;
625-
// Bounds-check the phdr table within the source buffer
626690
if (ehdr->e_phoff + (uint64_t)ehdr->e_phnum * sizeof(Elf64_Phdr) > src_len) return -1;
627691

628692
const Elf64_Phdr *phdrs = (const Elf64_Phdr *)((const char *)src + ehdr->e_phoff);
629693

630-
uintptr_t base; long total;
631-
if (elf_reserve(phdrs, ehdr->e_phnum, page_size, &base, &total) < 0) return -1;
694+
// Compute the file offset of the trampoline ELF within /proc/self/exe.
695+
// __ehdr_start is the runtime load address of ddtrace.so's own ELF header.
696+
uintptr_t tramp_file_bias = (uintptr_t)src - (uintptr_t)&__ehdr_start;
697+
if (tramp_file_bias & ((uintptr_t)page_size - 1))
698+
return -1; // DD_TRAMPOLINE_BIN not page-aligned within ddtrace.so
632699

633-
// Map each PT_LOAD segment into anonymous memory and copy bytes from src.
634-
// ET_EXEC binaries would fail here since we mmap'd at an arbitrary address.
635-
// Segments are left writable; ld.so will re-map from file with correct
636-
// perms.
637-
for (int i = 0; i < ehdr->e_phnum; i++) {
638-
if (phdrs[i].p_type != PT_LOAD) continue;
700+
int fd = sys_open_rdonly("/proc/self/exe");
701+
if (fd < 0) return -1;
639702

640-
uintptr_t seg_vaddr = phdrs[i].p_vaddr;
641-
uintptr_t seg_start = BS_PAGE_DOWN(seg_vaddr, page_size);
642-
uintptr_t seg_mem_end = seg_vaddr + phdrs[i].p_memsz;
643-
long map_len = (long)(BS_PAGE_UP(seg_mem_end, page_size) - seg_start);
644-
void *seg = sys_mmap((void *)(base + seg_start), map_len,
645-
elf_pf_to_prot(phdrs[i].p_flags) | BS_PROT_WRITE,
646-
BS_MAP_PRIVATE | BS_MAP_FIXED | BS_MAP_ANONYMOUS, -1, 0);
647-
if (seg == BS_MAP_FAILED) return -1;
648-
649-
long file_bytes = (long)phdrs[i].p_filesz;
650-
if ((long)(phdrs[i].p_offset + phdrs[i].p_filesz) > (long)src_len)
651-
file_bytes = (long)src_len - (long)phdrs[i].p_offset;
652-
if (file_bytes > 0)
653-
bs_memcpy((void *)(base + seg_vaddr),
654-
(const char *)src + phdrs[i].p_offset, file_bytes);
655-
// BSS (memsz > filesz) is already zero (MAP_ANONYMOUS)
703+
uintptr_t base; long total;
704+
if (elf_reserve(phdrs, ehdr->e_phnum, page_size, &base, &total) < 0) {
705+
sys_close(fd); return -1;
706+
}
707+
if (elf_map_segments(fd, (long)tramp_file_bias, phdrs, ehdr->e_phnum,
708+
base, page_size) < 0) {
709+
sys_close(fd); return -1;
656710
}
711+
sys_close(fd);
657712

658713
out->base = base;
659714
out->entry = base + ehdr->e_entry;
660715
out->phnum = ehdr->e_phnum;
661716
out->total_map = total;
662-
// Use PT_PHDR.p_vaddr for the phdr runtime address when PT_PHDR is present
663-
// e_phoff is a file offset, not a vaddr, and adding it to `base` only
664-
// works when p_vaddr of the first PT_LOAD is 0 (true for all standard PIE
665-
// output, but PT_PHDR.p_vaddr is the correct portable source).
666-
out->phdr = base + ehdr->e_phoff; // fallback: works when first PT_LOAD p_vaddr==0
717+
// Use PT_PHDR.p_vaddr for the phdr runtime address when present;
718+
// e_phoff is a file offset and works as a vaddr only when the first
719+
// PT_LOAD has p_vaddr==0 (true for standard PIE, but PT_PHDR is portable).
720+
out->phdr = base + ehdr->e_phoff;
667721
const Elf64_Phdr *pt_phdr = find_phdr(phdrs, ehdr->e_phnum, PT_PHDR);
668722
if (pt_phdr) out->phdr = base + pt_phdr->p_vaddr;
669723
return 0;
@@ -708,47 +762,8 @@ static int elf_load(const char *path, struct loaded_lib *lib, long page_size) {
708762
lib->base = base;
709763
lib->entry = base + ehdr.e_entry;
710764

711-
for (int i = 0; i < ehdr.e_phnum; i++) {
712-
if (phdrs[i].p_type != PT_LOAD) continue;
713-
714-
uintptr_t seg_start = BS_PAGE_DOWN(phdrs[i].p_vaddr, page_size);
715-
uintptr_t seg_file_end = phdrs[i].p_vaddr + phdrs[i].p_filesz;
716-
uintptr_t seg_mem_end = phdrs[i].p_vaddr + phdrs[i].p_memsz;
717-
// ELF spec (gABI): p_vaddr ≡ p_offset (mod p_align). For mmap to place
718-
// file bytes at the right vaddr, the page-aligned offset must equal the
719-
// page-rounded-down vaddr offset; this holds when p_align >= page_size.
720-
// A 4K-aligned ELF on a 64K-page kernel would violate this; ld.so and
721-
// ddtrace.so are built for the same page size so this never occurs here.
722-
long file_offset = (long)BS_PAGE_DOWN(phdrs[i].p_offset, page_size);
723-
724-
long map_len = (long)BS_PAGE_UP(seg_file_end, page_size) - (long)seg_start;
725-
if (map_len > 0) {
726-
void *seg = sys_mmap((void *)(base + seg_start), map_len,
727-
elf_pf_to_prot(phdrs[i].p_flags),
728-
BS_MAP_PRIVATE | BS_MAP_FIXED, fd, file_offset);
729-
if (seg == BS_MAP_FAILED) { sys_close(fd); return -1; }
730-
}
731-
732-
uintptr_t file_page_end = BS_PAGE_UP(seg_file_end, page_size);
733-
uintptr_t mem_page_end = BS_PAGE_UP(seg_mem_end, page_size);
734-
if (mem_page_end > file_page_end) {
735-
void *bss = sys_mmap((void *)(base + file_page_end),
736-
(long)(mem_page_end - file_page_end),
737-
elf_pf_to_prot(phdrs[i].p_flags),
738-
BS_MAP_PRIVATE | BS_MAP_FIXED | BS_MAP_ANONYMOUS, -1, 0);
739-
if (bss == BS_MAP_FAILED) { sys_close(fd); return -1; }
740-
}
741-
742-
// Zero the region from seg_file_end to mem_page_end.
743-
// This covers two cases:
744-
// 1) [seg_file_end, seg_mem_end): actual BSS within file mapped page
745-
// 2) [seg_mem_end, mem_page_end): beyond memsz but within the page rounded
746-
// file mapping; may contain non-zero file content (eg ELF section
747-
// headers, debug links) that must be hidden from the process image.
748-
if (seg_file_end < mem_page_end && (phdrs[i].p_flags & PF_W)) {
749-
long zlen = (long)(mem_page_end - seg_file_end);
750-
if (zlen > 0) bs_memset((void *)(base + seg_file_end), 0, zlen);
751-
}
765+
if (elf_map_segments(fd, 0, phdrs, ehdr.e_phnum, base, page_size) < 0) {
766+
sys_close(fd); return -1;
752767
}
753768
sys_close(fd);
754769

libdatadog

0 commit comments

Comments
 (0)