@@ -93,8 +93,10 @@ static void parse_stack(void *stack_top, struct boot_args *args);
9393static const char * find_env (char * * envp , const char * name );
9494static Elf64_auxv_t * find_auxv_entry (Elf64_auxv_t * auxv , unsigned long type );
9595static unsigned long get_auxv (Elf64_auxv_t * auxv , unsigned long type );
96- static int elf_load_from_memory (const void * src , size_t src_len ,
97- struct trampoline_map * out , long page_size );
96+ static int elf_map_segments (int fd , long file_bias , const Elf64_Phdr * phdrs ,
97+ int phnum , uintptr_t base , long page_size );
98+ static int elf_load_trampoline (const void * src , size_t src_len ,
99+ struct trampoline_map * out , long page_size );
98100static int elf_load (const char * path , struct loaded_lib * lib , long page_size );
99101static int create_patched_memfd (void );
100102static int uint_to_dec (unsigned int v , char * buf );
@@ -117,6 +119,7 @@ static long sys_write(int fd, const void *buf, long count);
117119static int sys_close (int fd );
118120static void * sys_mmap (void * addr , long length , int prot , int flags , int fd , long offset );
119121static int sys_munmap (void * addr , long length );
122+
120123static noreturn void sys_exit_group (int status );
121124static int sys_memfd_create (const char * name , unsigned int flags );
122125static long sys_sendfile (int out_fd , int in_fd , long count );
@@ -243,7 +246,7 @@ noreturn void _dd_solib_bootstrap(void *stack_top) {
243246 bs_fatal ("TRAMPOLINE_BIN not available" , 120 );
244247
245248 struct trampoline_map tmap ;
246- if (elf_load_from_memory (trampoline_bytes , trampoline_len , & tmap , page_size ) < 0 )
249+ if (elf_load_trampoline (trampoline_bytes , trampoline_len , & tmap , page_size ) < 0 )
247250 bs_fatal ("failed to map trampoline" , 121 );
248251
249252 // Step 2: Redirect auxv to the trampoline
@@ -609,61 +612,112 @@ static int elf_reserve(const Elf64_Phdr *phdrs, int phnum, long page_size,
609612}
610613
611614// }}}
612- // ---- Load ELF from memory (for trampoline embedded in ddtrace.so) {{{
615+ // ---- Common PT_LOAD segment mapper {{{
616+ //
617+ // Maps all PT_LOAD segments from an open fd into a pre-reserved address space.
618+ // `file_bias` is added to each segment's page-aligned file offset:
619+ // * pass 0 for a standalone file (e.g. ld.so loaded from its own path)
620+ // * pass (DD_TRAMPOLINE_BIN.ptr - &__ehdr_start) for the trampoline embedded
621+ // in /proc/self/exe
622+ //
623+ // Does NOT close fd on failure; caller is responsible.
624+ //
625+
626+ static int elf_map_segments (int fd , long file_bias , const Elf64_Phdr * phdrs ,
627+ int phnum , uintptr_t base , long page_size ) {
628+ if (file_bias != BS_PAGE_DOWN (file_bias , page_size )) {
629+ bs_fatal ("file_bias not page-aligned" , 123 );
630+ __builtin_unreachable ();
631+ }
632+
633+ for (int i = 0 ; i < phnum ; i ++ ) {
634+ if (phdrs [i ].p_type != PT_LOAD ) continue ;
635+
636+ uintptr_t seg_start = BS_PAGE_DOWN (phdrs [i ].p_vaddr , page_size );
637+ uintptr_t seg_file_end = phdrs [i ].p_vaddr + phdrs [i ].p_filesz ;
638+ uintptr_t seg_mem_end = phdrs [i ].p_vaddr + phdrs [i ].p_memsz ;
639+ uintptr_t file_page_end = BS_PAGE_UP (seg_file_end , page_size );
640+ uintptr_t mem_page_end = BS_PAGE_UP (seg_mem_end , page_size );
641+ int prot = elf_pf_to_prot (phdrs [i ].p_flags );
642+
643+ if (phdrs [i ].p_filesz > 0 ) {
644+ // ELF spec (gABI): p_vaddr ≡ p_offset (mod p_align), so
645+ // PAGE_DOWN(p_offset) places p_vaddr at the correct address when p_align >= page_size.
646+ long file_offset = file_bias + (long )BS_PAGE_DOWN (phdrs [i ].p_offset , page_size );
647+ long file_map_len = (long )(file_page_end - seg_start );
648+ void * seg = sys_mmap ((void * )(base + seg_start ), file_map_len ,
649+ prot , BS_MAP_PRIVATE | BS_MAP_FIXED , fd , file_offset );
650+ if (seg == BS_MAP_FAILED ) return -1 ;
651+
652+ // Zero tail within the last file-backed page (writable only).
653+ // Both glibc and musl do this in dlopen. One can't trust the linkers to
654+ // have the zeros in the file.
655+ if (seg_mem_end > seg_file_end && (phdrs [i ].p_flags & PF_W ))
656+ bs_memset ((void * )(base + seg_file_end ), 0 ,
657+ (long )(file_page_end - seg_file_end ));
658+ }
659+
660+ // Anonymous pages for BSS. For pure-BSS segments (p_filesz==0) start
661+ // at seg_start; for mixed segments start after the last file-backed page.
662+ uintptr_t anon_start = (phdrs [i ].p_filesz > 0 ) ? file_page_end : seg_start ;
663+ if (mem_page_end > anon_start ) {
664+ void * bss = sys_mmap ((void * )(base + anon_start ),
665+ (long )(mem_page_end - anon_start ),
666+ prot ,
667+ BS_MAP_PRIVATE | BS_MAP_FIXED | BS_MAP_ANONYMOUS ,
668+ -1 , 0 );
669+ if (bss == BS_MAP_FAILED ) return -1 ;
670+ }
671+ }
672+ return 0 ;
673+ }
674+
675+ // }}}
676+ // ---- Load trampoline ELF from /proc/self/exe {{{
613677//
614- // Like elf_load() but reads from an in-memory byte array instead of a file.
615- // Maps each PT_LOAD segment into anonymous memory and copies bytes from src.
678+ // The trampoline binary is embedded in ddtrace.so at DD_TRAMPOLINE_BIN.ptr.
679+ // Since ddtrace.so is /proc/self/exe when executed directly, we open that file
680+ // and mmap each PT_LOAD segment directly from it with the correct permissions.
616681
617- static int elf_load_from_memory (const void * src , size_t src_len ,
618- struct trampoline_map * out , long page_size ) {
682+ static int elf_load_trampoline (const void * src , size_t src_len ,
683+ struct trampoline_map * out , long page_size ) {
619684 bs_memset (out , 0 , sizeof (* out ));
620685
621686 if (src_len < sizeof (Elf64_Ehdr )) return -1 ;
622687 const Elf64_Ehdr * ehdr = (const Elf64_Ehdr * )src ;
623688
624689 if (elf_check_header (ehdr , 1u << ET_DYN , 32 ) < 0 ) return -1 ;
625- // Bounds-check the phdr table within the source buffer
626690 if (ehdr -> e_phoff + (uint64_t )ehdr -> e_phnum * sizeof (Elf64_Phdr ) > src_len ) return -1 ;
627691
628692 const Elf64_Phdr * phdrs = (const Elf64_Phdr * )((const char * )src + ehdr -> e_phoff );
629693
630- uintptr_t base ; long total ;
631- if (elf_reserve (phdrs , ehdr -> e_phnum , page_size , & base , & total ) < 0 ) return -1 ;
694+ // Compute the file offset of the trampoline ELF within /proc/self/exe.
695+ // __ehdr_start is the runtime load address of ddtrace.so's own ELF header.
696+ uintptr_t tramp_file_bias = (uintptr_t )src - (uintptr_t )& __ehdr_start ;
697+ if (tramp_file_bias & ((uintptr_t )page_size - 1 ))
698+ return -1 ; // DD_TRAMPOLINE_BIN not page-aligned within ddtrace.so
632699
633- // Map each PT_LOAD segment into anonymous memory and copy bytes from src.
634- // ET_EXEC binaries would fail here since we mmap'd at an arbitrary address.
635- // Segments are left writable; ld.so will re-map from file with correct
636- // perms.
637- for (int i = 0 ; i < ehdr -> e_phnum ; i ++ ) {
638- if (phdrs [i ].p_type != PT_LOAD ) continue ;
700+ int fd = sys_open_rdonly ("/proc/self/exe" );
701+ if (fd < 0 ) return -1 ;
639702
640- uintptr_t seg_vaddr = phdrs [i ].p_vaddr ;
641- uintptr_t seg_start = BS_PAGE_DOWN (seg_vaddr , page_size );
642- uintptr_t seg_mem_end = seg_vaddr + phdrs [i ].p_memsz ;
643- long map_len = (long )(BS_PAGE_UP (seg_mem_end , page_size ) - seg_start );
644- void * seg = sys_mmap ((void * )(base + seg_start ), map_len ,
645- elf_pf_to_prot (phdrs [i ].p_flags ) | BS_PROT_WRITE ,
646- BS_MAP_PRIVATE | BS_MAP_FIXED | BS_MAP_ANONYMOUS , -1 , 0 );
647- if (seg == BS_MAP_FAILED ) return -1 ;
648-
649- long file_bytes = (long )phdrs [i ].p_filesz ;
650- if ((long )(phdrs [i ].p_offset + phdrs [i ].p_filesz ) > (long )src_len )
651- file_bytes = (long )src_len - (long )phdrs [i ].p_offset ;
652- if (file_bytes > 0 )
653- bs_memcpy ((void * )(base + seg_vaddr ),
654- (const char * )src + phdrs [i ].p_offset , file_bytes );
655- // BSS (memsz > filesz) is already zero (MAP_ANONYMOUS)
703+ uintptr_t base ; long total ;
704+ if (elf_reserve (phdrs , ehdr -> e_phnum , page_size , & base , & total ) < 0 ) {
705+ sys_close (fd ); return -1 ;
706+ }
707+ if (elf_map_segments (fd , (long )tramp_file_bias , phdrs , ehdr -> e_phnum ,
708+ base , page_size ) < 0 ) {
709+ sys_close (fd ); return -1 ;
656710 }
711+ sys_close (fd );
657712
658713 out -> base = base ;
659714 out -> entry = base + ehdr -> e_entry ;
660715 out -> phnum = ehdr -> e_phnum ;
661716 out -> total_map = total ;
662- // Use PT_PHDR.p_vaddr for the phdr runtime address when PT_PHDR is present
663- // e_phoff is a file offset, not a vaddr, and adding it to `base` only
664- // works when p_vaddr of the first PT_LOAD is 0 (true for all standard PIE
665- // output, but PT_PHDR.p_vaddr is the correct portable source).
666- out -> phdr = base + ehdr -> e_phoff ; // fallback: works when first PT_LOAD p_vaddr==0
717+ // Use PT_PHDR.p_vaddr for the phdr runtime address when present;
718+ // e_phoff is a file offset and works as a vaddr only when the first
719+ // PT_LOAD has p_vaddr==0 (true for standard PIE, but PT_PHDR is portable).
720+ out -> phdr = base + ehdr -> e_phoff ;
667721 const Elf64_Phdr * pt_phdr = find_phdr (phdrs , ehdr -> e_phnum , PT_PHDR );
668722 if (pt_phdr ) out -> phdr = base + pt_phdr -> p_vaddr ;
669723 return 0 ;
@@ -708,47 +762,8 @@ static int elf_load(const char *path, struct loaded_lib *lib, long page_size) {
708762 lib -> base = base ;
709763 lib -> entry = base + ehdr .e_entry ;
710764
711- for (int i = 0 ; i < ehdr .e_phnum ; i ++ ) {
712- if (phdrs [i ].p_type != PT_LOAD ) continue ;
713-
714- uintptr_t seg_start = BS_PAGE_DOWN (phdrs [i ].p_vaddr , page_size );
715- uintptr_t seg_file_end = phdrs [i ].p_vaddr + phdrs [i ].p_filesz ;
716- uintptr_t seg_mem_end = phdrs [i ].p_vaddr + phdrs [i ].p_memsz ;
717- // ELF spec (gABI): p_vaddr ≡ p_offset (mod p_align). For mmap to place
718- // file bytes at the right vaddr, the page-aligned offset must equal the
719- // page-rounded-down vaddr offset; this holds when p_align >= page_size.
720- // A 4K-aligned ELF on a 64K-page kernel would violate this; ld.so and
721- // ddtrace.so are built for the same page size so this never occurs here.
722- long file_offset = (long )BS_PAGE_DOWN (phdrs [i ].p_offset , page_size );
723-
724- long map_len = (long )BS_PAGE_UP (seg_file_end , page_size ) - (long )seg_start ;
725- if (map_len > 0 ) {
726- void * seg = sys_mmap ((void * )(base + seg_start ), map_len ,
727- elf_pf_to_prot (phdrs [i ].p_flags ),
728- BS_MAP_PRIVATE | BS_MAP_FIXED , fd , file_offset );
729- if (seg == BS_MAP_FAILED ) { sys_close (fd ); return -1 ; }
730- }
731-
732- uintptr_t file_page_end = BS_PAGE_UP (seg_file_end , page_size );
733- uintptr_t mem_page_end = BS_PAGE_UP (seg_mem_end , page_size );
734- if (mem_page_end > file_page_end ) {
735- void * bss = sys_mmap ((void * )(base + file_page_end ),
736- (long )(mem_page_end - file_page_end ),
737- elf_pf_to_prot (phdrs [i ].p_flags ),
738- BS_MAP_PRIVATE | BS_MAP_FIXED | BS_MAP_ANONYMOUS , -1 , 0 );
739- if (bss == BS_MAP_FAILED ) { sys_close (fd ); return -1 ; }
740- }
741-
742- // Zero the region from seg_file_end to mem_page_end.
743- // This covers two cases:
744- // 1) [seg_file_end, seg_mem_end): actual BSS within file mapped page
745- // 2) [seg_mem_end, mem_page_end): beyond memsz but within the page rounded
746- // file mapping; may contain non-zero file content (eg ELF section
747- // headers, debug links) that must be hidden from the process image.
748- if (seg_file_end < mem_page_end && (phdrs [i ].p_flags & PF_W )) {
749- long zlen = (long )(mem_page_end - seg_file_end );
750- if (zlen > 0 ) bs_memset ((void * )(base + seg_file_end ), 0 , zlen );
751- }
765+ if (elf_map_segments (fd , 0 , phdrs , ehdr .e_phnum , base , page_size ) < 0 ) {
766+ sys_close (fd ); return -1 ;
752767 }
753768 sys_close (fd );
754769
0 commit comments