Skip to content

Commit 93542e6

Browse files
authored
Optimizations (#4006)
* Use an arena for parser metadata * Use the parser arena for the constant pool * Speed up the constant hash function * Small optimization for parser_lex_magic_comment * Scan forward through inline whitespace to avoid writing to parser->current.end continuously * Fast-paths for ASCII-only identifiers * Avoid unnecessary zero-ing of memory * Pre-size arena to avoid unnecessary growth * Force the allocation to be inlined * Inline pm_node_list_append, pm_char_is_whitespace, and pm_char_is_inline_whitespace * Avoid redundant whitespace scanning in magic comment lexing * Potentially skip whitespace scanning for speed * Inline three more functions, and lower the hash threshold for locals * Lex simple integer values as we are lexing * Only dispatch to lex_optional_float_suffix when it is possible * Optimize constant pool hash for short strings * Include string in constant pool entry to avoid chasing pointer * SIMD/SWAR for strpbrk * Fix a bug where we removed the \r warning * Use a bloom filter to quickly reject local lookups * Cache strpbrk lookup tables * Fix up rebase errors * More correctly detect SIMD on MSVC * Ensure allocations to the constant pool are through the arena * Fix ASAN reading off end of strpbrk cache ruby/ruby@968b999 Co-Authored-By: Kevin Newton <kddnewton@gmail.com> * Do not use GCC-specific syntax for lookup tables ruby/ruby@5026acf Co-Authored-By: Kevin Newton <kddnewton@gmail.com> * Fix infinite loop in parser_lex_magic_comment ruby/ruby@ec3162c Co-Authored-By: Kevin Newton <kddnewton@gmail.com> * Fix C coverage by moving stuff slightly around --------- Co-authored-by: Earlopain <14981592+Earlopain@users.noreply.github.com>
2 parents 7c1927e + eab6f33 commit 93542e6

18 files changed

Lines changed: 1043 additions & 481 deletions

include/prism/defines.h

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,18 @@
9191
# define inline __inline
9292
#endif
9393

94+
/**
 * Request that the compiler inline a function at every call site. Reserve
 * this for tiny, hot functions where the inliner's own heuristics decline
 * to inline.
 */
#ifdef _MSC_VER
# define PRISM_FORCE_INLINE __forceinline
#elif defined(__GNUC__) || defined(__clang__)
# define PRISM_FORCE_INLINE inline __attribute__((always_inline))
#else
# define PRISM_FORCE_INLINE inline
#endif
105+
94106
/**
95107
* Old Visual Studio versions before 2015 do not implement sprintf, but instead
96108
* implement _snprintf. We standardize that here.
@@ -264,6 +276,49 @@
264276
#define PRISM_UNLIKELY(x) (x)
265277
#endif
266278

279+
/**
 * Select the vectorization strategy available on this platform for SIMD /
 * fast-path implementations. At most one of PRISM_HAS_NEON, PRISM_HAS_SSSE3,
 * and PRISM_HAS_SWAR is defined, in decreasing order of preference.
 */
#if (defined(_MSC_VER) && defined(_M_ARM64)) || (defined(__aarch64__) && defined(__ARM_NEON))
#define PRISM_HAS_NEON
#elif (defined(_MSC_VER) && defined(_M_X64)) || (defined(__x86_64__) && defined(__SSSE3__))
#define PRISM_HAS_SSSE3
#elif defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
#define PRISM_HAS_SWAR
#endif
290+
291+
/**
 * Count trailing zero bits in a 64-bit value. Used by SWAR identifier scanning
 * to find the first non-matching byte in a word.
 *
 * Precondition: v must be nonzero. The result is undefined when v == 0
 * (matching the behavior of __builtin_ctzll and _BitScanForward64).
 */
#if defined(__GNUC__) || defined(__clang__)
#define pm_ctzll(v) ((unsigned) __builtin_ctzll(v))
#elif defined(_MSC_VER)
#include <intrin.h>
static inline unsigned pm_ctzll(uint64_t v) {
    unsigned long index;
    _BitScanForward64(&index, v);
    return (unsigned) index;
}
#else
static inline unsigned
pm_ctzll(uint64_t v) {
    unsigned c = 0;

    // Isolate the lowest set bit. This must be computed in unsigned
    // arithmetic: the previous (uint64_t) (-(int64_t) v) form negated
    // INT64_MIN — signed-overflow undefined behavior — whenever only the top
    // bit of v was set (v == 0x8000000000000000).
    v &= ~v + 1;

    // Binary search on the position of the (single) set bit: each mask keeps
    // the lower half of the remaining range, so a miss adds that half's width.
    if (!(v & 0x00000000FFFFFFFFULL)) c += 32;
    if (!(v & 0x0000FFFF0000FFFFULL)) c += 16;
    if (!(v & 0x00FF00FF00FF00FFULL)) c += 8;
    if (!(v & 0x0F0F0F0F0F0F0F0FULL)) c += 4;
    if (!(v & 0x3333333333333333ULL)) c += 2;
    if (!(v & 0x5555555555555555ULL)) c += 1;
    return c;
}
#endif
321+
267322
/**
268323
* We use -Wimplicit-fallthrough to guard potentially unintended fall-through between cases of a switch.
269324
* Use PRISM_FALLTHROUGH to explicitly annotate cases where the fallthrough is intentional.

include/prism/node.h

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,14 +17,31 @@
1717
#define PM_NODE_LIST_FOREACH(list, index, node) \
1818
for (size_t index = 0; index < (list)->size && ((node) = (list)->nodes[index]); index++)
1919

20+
/**
21+
* Slow path for pm_node_list_append: grow the list and append the node.
22+
* Do not call directly — use pm_node_list_append instead.
23+
*
24+
* @param arena The arena to allocate from.
25+
* @param list The list to append to.
26+
* @param node The node to append.
27+
*/
28+
void pm_node_list_append_slow(pm_arena_t *arena, pm_node_list_t *list, pm_node_t *node);
29+
2030
/**
2131
* Append a new node onto the end of the node list.
2232
*
2333
* @param arena The arena to allocate from.
2434
* @param list The list to append to.
2535
* @param node The node to append.
2636
*/
27-
void pm_node_list_append(pm_arena_t *arena, pm_node_list_t *list, pm_node_t *node);
37+
static PRISM_FORCE_INLINE void
38+
pm_node_list_append(pm_arena_t *arena, pm_node_list_t *list, pm_node_t *node) {
39+
if (list->size < list->capacity) {
40+
list->nodes[list->size++] = node;
41+
} else {
42+
pm_node_list_append_slow(arena, list, node);
43+
}
44+
}
2845

2946
/**
3047
* Prepend a new node onto the beginning of the node list.

include/prism/parser.h

Lines changed: 60 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,13 @@ typedef struct {
100100
pm_heredoc_indent_t indent;
101101
} pm_heredoc_lex_mode_t;
102102

103+
/**
104+
* The size of the breakpoints and strpbrk cache charset buffers. All
105+
* breakpoint arrays and the strpbrk cache charset must share this size so
106+
* that memcmp can safely compare the full buffer without overreading.
107+
*/
108+
#define PM_STRPBRK_CACHE_SIZE 16
109+
103110
/**
104111
* When lexing Ruby source, the lexer has a small amount of state to tell which
105112
* kind of token it is currently lexing. For example, when we find the start of
@@ -169,7 +176,7 @@ typedef struct pm_lex_mode {
169176
* This is the character set that should be used to delimit the
170177
* tokens within the list.
171178
*/
172-
uint8_t breakpoints[11];
179+
uint8_t breakpoints[PM_STRPBRK_CACHE_SIZE];
173180
} list;
174181

175182
struct {
@@ -191,7 +198,7 @@ typedef struct pm_lex_mode {
191198
* This is the character set that should be used to delimit the
192199
* tokens within the regular expression.
193200
*/
194-
uint8_t breakpoints[7];
201+
uint8_t breakpoints[PM_STRPBRK_CACHE_SIZE];
195202
} regexp;
196203

197204
struct {
@@ -224,7 +231,7 @@ typedef struct pm_lex_mode {
224231
* This is the character set that should be used to delimit the
225232
* tokens within the string.
226233
*/
227-
uint8_t breakpoints[7];
234+
uint8_t breakpoints[PM_STRPBRK_CACHE_SIZE];
228235
} string;
229236

230237
struct {
@@ -556,6 +563,13 @@ typedef struct pm_locals {
556563
/** The capacity of the local variables set. */
557564
uint32_t capacity;
558565

566+
/**
567+
* A bloom filter over constant IDs stored in this set. Used to quickly
568+
* reject lookups for names that are definitely not present, avoiding the
569+
* cost of a linear scan or hash probe.
570+
*/
571+
uint32_t bloom;
572+
559573
/** The nullable allocated memory for the local variables in the set. */
560574
pm_local_t *locals;
561575
} pm_locals_t;
@@ -639,6 +653,9 @@ struct pm_parser {
639653
/** The arena used for all AST-lifetime allocations. Caller-owned. */
640654
pm_arena_t *arena;
641655

656+
/** The arena used for parser metadata (comments, diagnostics, etc.). */
657+
pm_arena_t metadata_arena;
658+
642659
/**
643660
* The next node identifier that will be assigned. This is a unique
644661
* identifier used to track nodes such that the syntax tree can be dropped
@@ -790,12 +807,26 @@ struct pm_parser {
790807
pm_line_offset_list_t line_offsets;
791808

792809
/**
793-
* We want to add a flag to integer nodes that indicates their base. We only
794-
* want to parse these once, but we don't have space on the token itself to
795-
* communicate this information. So we store it here and pass it through
796-
* when we find tokens that we need it for.
810+
* State communicated from the lexer to the parser for integer tokens.
797811
*/
798-
pm_node_flags_t integer_base;
812+
struct {
813+
/**
814+
* A flag indicating the base of the integer (binary, octal, decimal,
815+
* hexadecimal). Set during lexing and read during node creation.
816+
*/
817+
pm_node_flags_t base;
818+
819+
/**
820+
* When lexing a decimal integer that fits in a uint32_t, we compute
821+
* the value during lexing to avoid re-scanning the digits during
822+
* parsing. If lexed is true, this holds the result and
823+
* pm_integer_parse can be skipped.
824+
*/
825+
uint32_t value;
826+
827+
/** Whether value holds a valid pre-computed integer. */
828+
bool lexed;
829+
} integer;
799830

800831
/**
801832
* This string is used to pass information from the lexer to the parser. It
@@ -938,6 +969,27 @@ struct pm_parser {
938969
* toggled with a magic comment.
939970
*/
940971
bool warn_mismatched_indentation;
972+
973+
#if defined(PRISM_HAS_NEON) || defined(PRISM_HAS_SSSE3) || defined(PRISM_HAS_SWAR)
974+
/**
975+
* Cached lookup tables for pm_strpbrk's SIMD fast path. Avoids rebuilding
976+
* the nibble-based tables on every call when the charset hasn't changed
977+
* (which is the common case during string/regex/list lexing).
978+
*/
979+
struct {
980+
/** The cached charset (null-terminated, NUL-padded). */
981+
uint8_t charset[PM_STRPBRK_CACHE_SIZE];
982+
983+
/** Nibble-based low lookup table for SIMD matching. */
984+
uint8_t low_lut[16];
985+
986+
/** Nibble-based high lookup table for SIMD matching. */
987+
uint8_t high_lut[16];
988+
989+
/** Scalar fallback table (4 x 64-bit bitmasks covering all ASCII). */
990+
uint64_t table[4];
991+
} strpbrk_cache;
992+
#endif
941993
};
942994

943995
#endif

include/prism/util/pm_arena.h

Lines changed: 49 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -44,16 +44,52 @@ typedef struct {
4444
size_t block_count;
4545
} pm_arena_t;
4646

47+
/**
48+
* Ensure the arena has at least `capacity` bytes available in its current
49+
* block, allocating a new block if necessary. This allows callers to
50+
* pre-size the arena to avoid repeated small block allocations.
51+
*
52+
* @param arena The arena to pre-size.
53+
* @param capacity The minimum number of bytes to ensure are available.
54+
*/
55+
void pm_arena_reserve(pm_arena_t *arena, size_t capacity);
56+
57+
/**
58+
* Slow path for pm_arena_alloc: allocate a new block and return a pointer to
59+
* the first `size` bytes. Do not call directly — use pm_arena_alloc instead.
60+
*
61+
* @param arena The arena to allocate from.
62+
* @param size The number of bytes to allocate.
63+
* @returns A pointer to the allocated memory.
64+
*/
65+
void * pm_arena_alloc_slow(pm_arena_t *arena, size_t size);
66+
4767
/**
4868
* Allocate memory from the arena. The returned memory is NOT zeroed. This
4969
* function is infallible — it aborts on allocation failure.
5070
*
71+
* The fast path (bump pointer within the current block) is inlined at each
72+
* call site. The slow path (new block allocation) is out-of-line.
73+
*
5174
* @param arena The arena to allocate from.
5275
* @param size The number of bytes to allocate.
5376
* @param alignment The required alignment (must be a power of 2).
5477
* @returns A pointer to the allocated memory.
5578
*/
56-
void * pm_arena_alloc(pm_arena_t *arena, size_t size, size_t alignment);
79+
static PRISM_FORCE_INLINE void *
80+
pm_arena_alloc(pm_arena_t *arena, size_t size, size_t alignment) {
81+
if (arena->current != NULL) {
82+
size_t used_aligned = (arena->current->used + alignment - 1) & ~(alignment - 1);
83+
size_t needed = used_aligned + size;
84+
85+
if (used_aligned >= arena->current->used && needed >= used_aligned && needed <= arena->current->capacity) {
86+
arena->current->used = needed;
87+
return arena->current->data + used_aligned;
88+
}
89+
}
90+
91+
return pm_arena_alloc_slow(arena, size);
92+
}
5793

5894
/**
5995
* Allocate zero-initialized memory from the arena. This function is infallible
@@ -64,7 +100,12 @@ void * pm_arena_alloc(pm_arena_t *arena, size_t size, size_t alignment);
64100
* @param alignment The required alignment (must be a power of 2).
65101
* @returns A pointer to the allocated, zero-initialized memory.
66102
*/
67-
void * pm_arena_zalloc(pm_arena_t *arena, size_t size, size_t alignment);
103+
static inline void *
104+
pm_arena_zalloc(pm_arena_t *arena, size_t size, size_t alignment) {
105+
void *ptr = pm_arena_alloc(arena, size, alignment);
106+
memset(ptr, 0, size);
107+
return ptr;
108+
}
68109

69110
/**
70111
* Allocate memory from the arena and copy the given data into it. This is a
@@ -76,7 +117,12 @@ void * pm_arena_zalloc(pm_arena_t *arena, size_t size, size_t alignment);
76117
* @param alignment The required alignment (must be a power of 2).
77118
* @returns A pointer to the allocated copy.
78119
*/
79-
void * pm_arena_memdup(pm_arena_t *arena, const void *src, size_t size, size_t alignment);
120+
static inline void *
121+
pm_arena_memdup(pm_arena_t *arena, const void *src, size_t size, size_t alignment) {
122+
void *dst = pm_arena_alloc(arena, size, alignment);
123+
memcpy(dst, src, size);
124+
return dst;
125+
}
80126

81127
/**
82128
* Free all blocks in the arena. After this call, all pointers returned by

0 commit comments

Comments
 (0)