Skip to content

Commit 9f44cf7

Browse files
committed
fix presyms
1 parent f4558b6 commit 9f44cf7

3 files changed

Lines changed: 80 additions & 35 deletions

File tree

README.md

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -227,11 +227,22 @@ Three encoding strategies, choose at runtime:
227227

228228
```ruby
229229
CBOR.no_symbols # Default. Encode as plain string. No round-trip.
230-
CBOR.symbols_as_string # Tag 39 + string. RFC 8949 compatible. Round-trip.
231-
CBOR.symbols_as_uint32 # Tag 39 + uint32. mruby-only, fastest. Round-trip.
230+
CBOR.symbols_as_string # Tag 39 + string. RFC 8949 compatible. Round-trip.
231+
CBOR.symbols_as_uint32 # Tag 39 hybrid: presym -> uint, runtime sym -> string.
232232
```
233233

234-
Use `symbols_as_string` for cross-language interop. `symbols_as_uint32` is faster but only works between mruby builds with identical presym tables — never use it for data that crosses build boundaries.
234+
`symbols_as_string` always emits a string payload — use this for cross-language interop.
235+
236+
`symbols_as_uint32` picks per-symbol:
237+
238+
- **Presym** (compile-time, `sym <= MRB_PRESYM_MAX`) → uint payload (the sym ID directly). Two bytes total for IDs `< 24`. Cheapest path; correct only between mruby builds with identical presym tables.
239+
- **Runtime sym** (`"foo_#{i}".to_sym`, anything `> MRB_PRESYM_MAX`) → string payload. Always portable, regardless of build.
240+
241+
Decoders accept either payload under tag 39 in this mode and dispatch by type. Out-of-range presym IDs raise `RangeError`. The name is historical — the wire is no longer always uint32.
242+
243+
The hybrid means runtime symbols always work across mismatched builds; only compile-time presyms care about build identity. If you don't control both ends, use `symbols_as_string`.
244+
245+
Requires presym; on `MRB_NO_PRESYM` builds, `symbols_as_uint32` raises `NotImplementedError`.
235246

236247
---
237248

src/mrb_cbor.c

Lines changed: 62 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -753,6 +753,30 @@ cbor_set_sym_strategy(mrb_state *mrb, uint8_t mode)
753753
mrb_iv_set(mrb, mrb_obj_value(mod), MRB_SYM(__sym_strat__), mrb_fixnum_value(mode));
754754
}
755755

756+
/* Tag 39 hybrid payload -> mrb_sym. String -> intern, uint -> cast.
757+
* Shared by canonical strategy 2 and the fast decoder. Presyms span
758+
* 1 .. MRB_PRESYM_MAX inclusive; runtime syms travel as strings. */
759+
static mrb_value
760+
sym_from_tag39_payload(mrb_state *mrb, mrb_value v)
761+
{
762+
if (mrb_string_p(v)) {
763+
return mrb_symbol_value(mrb_intern_str(mrb, v));
764+
}
765+
else if (likely(mrb_integer_p(v))) {
766+
mrb_int n = mrb_integer(v);
767+
if (likely(n >= 1 && n <= MRB_PRESYM_MAX)) {
768+
return mrb_symbol_value((mrb_sym)n);
769+
} else {
770+
mrb_raise(mrb, E_RANGE_ERROR, "tag 39 presym ID out of range");
771+
}
772+
}
773+
else {
774+
mrb_raise(mrb, E_TYPE_ERROR,
775+
"invalid payload for tag 39 (expected text string or uint)");
776+
}
777+
return mrb_nil_value(); /* unreachable */
778+
}
779+
756780
static mrb_value
757781
decode_symbol(mrb_state *mrb, Reader* r, mrb_value src, mrb_value sharedrefs)
758782
{
@@ -775,17 +799,7 @@ decode_symbol(mrb_state *mrb, Reader* r, mrb_value src, mrb_value sharedrefs)
775799
}
776800

777801
if (mrb_cmp(mrb, strategy, mrb_fixnum_value(2)) == 0) {
778-
if (likely(mrb_integer_p(v))) {
779-
mrb_int n = mrb_integer(v);
780-
if (likely(n >= 0 && n <= UINT32_MAX)) {
781-
return mrb_symbol_value((mrb_sym)n);
782-
} else {
783-
mrb_raise(mrb, E_RANGE_ERROR, "invalid symbol ID size for tag 39");
784-
}
785-
} else {
786-
mrb_raise(mrb, E_TYPE_ERROR,
787-
"invalid payload for tag 39 (expected uint32)");
788-
}
802+
return sym_from_tag39_payload(mrb, v);
789803
}
790804

791805
mrb_raise(mrb, E_RUNTIME_ERROR, "invalid symbol strategy");
@@ -1498,24 +1512,53 @@ emit_f64:;
14981512
#endif /* MRB_USE_FLOAT32 */
14991513
#endif /* MRB_NO_FLOAT */
15001514

1515+
/* Tag 39 hybrid body -- canonical string codec.
1516+
* presym -> uint payload (sym ID); runtime -> string payload. */
1517+
static void
1518+
encode_sym_hybrid(CborWriter *w, mrb_sym sym)
1519+
{
1520+
encode_len(w, 6, 39);
1521+
if (sym > MRB_PRESYM_MAX) {
1522+
encode_string(w, mrb_sym2str(w->mrb, sym));
1523+
} else {
1524+
encode_len(w, 0, sym);
1525+
}
1526+
}
1527+
1528+
/* Tag 39 hybrid body -- fast string codec.
1529+
* Always major CBOR_FAST_STRING_TYPE, no UTF-8 scan. */
1530+
static void
1531+
encode_sym_hybrid_fast(CborWriter *w, mrb_sym sym)
1532+
{
1533+
encode_len(w, 6, 39);
1534+
if (sym > MRB_PRESYM_MAX) {
1535+
mrb_value s = mrb_sym2str(w->mrb, sym);
1536+
mrb_int blen = RSTRING_LEN(s);
1537+
encode_len(w, CBOR_FAST_STRING_TYPE, (uint64_t)blen);
1538+
cbor_writer_write(w, (const uint8_t*)RSTRING_PTR(s), (size_t)blen);
1539+
} else {
1540+
encode_len(w, 0, sym);
1541+
}
1542+
}
1543+
15011544
static void
15021545
encode_sym(CborWriter *w, mrb_value obj)
15031546
{
15041547
mrb_state *mrb = w->mrb;
15051548
mrb_value mode = cbor_sym_strategy(mrb);
1549+
mrb_sym sym = mrb_symbol(obj);
15061550

15071551
if (mrb_cmp(mrb, mode, mrb_fixnum_value(0)) == 0) {
1508-
encode_string(w, mrb_sym2str(mrb, mrb_symbol(obj)));
1552+
encode_string(w, mrb_sym2str(mrb, sym));
15091553
return;
15101554
}
15111555
if (mrb_cmp(mrb, mode, mrb_fixnum_value(1)) == 0) {
15121556
encode_len(w, 6, 39);
1513-
encode_string(w, mrb_sym2str(mrb, mrb_symbol(obj)));
1557+
encode_string(w, mrb_sym2str(mrb, sym));
15141558
return;
15151559
}
15161560
if (mrb_cmp(mrb, mode, mrb_fixnum_value(2)) == 0) {
1517-
encode_len(w, 6, 39);
1518-
encode_len(w, 0, mrb_symbol(obj));
1561+
encode_sym_hybrid(w, sym);
15191562
return;
15201563
}
15211564
mrb_bug(mrb, "CBOR internal error: invalid symbol strategy mode");
@@ -1933,15 +1976,9 @@ encode_value_fast(CborWriter *w, mrb_value obj)
19331976
}
19341977
case MRB_TT_ARRAY: encode_array_fast(w, obj); break;
19351978
case MRB_TT_HASH: encode_map_fast(w, obj); break;
1936-
case MRB_TT_SYMBOL: {
1937-
/* always tag 39 + string — no strategy config, same build both ends */
1938-
encode_len(w, 6, 39);
1939-
mrb_value s = mrb_sym2str(mrb, mrb_symbol(obj));
1940-
mrb_int blen = RSTRING_LEN(s);
1941-
encode_len(w, CBOR_FAST_STRING_TYPE, (uint64_t)blen);
1942-
cbor_writer_write(w, (const uint8_t*)RSTRING_PTR(s), (size_t)blen);
1979+
case MRB_TT_SYMBOL:
1980+
encode_sym_hybrid_fast(w, mrb_symbol(obj));
19431981
break;
1944-
}
19451982
case MRB_TT_CLASS:
19461983
case MRB_TT_MODULE: {
19471984
mrb_value name = mrb_class_path(mrb, mrb_class_ptr(obj));
@@ -2151,13 +2188,9 @@ decode_value_fast(mrb_state *mrb, Reader *r, mrb_value src)
21512188
mrb_value tag = read_cbor_uint(mrb, r, info);
21522189

21532190
if (mrb_cmp(mrb, tag, mrb_fixnum_value(39)) == 0) {
2154-
/* symbol — always encoded as tag 39 + string in fast path */
2191+
/* symbol -- tag 39 + string (runtime) or uint (presym ID) */
21552192
mrb_value v = decode_value_fast(mrb, r, src);
2156-
if (likely(mrb_string_p(v))) {
2157-
result = mrb_symbol_value(mrb_intern_str(mrb, v));
2158-
} else {
2159-
mrb_raise(mrb, E_TYPE_ERROR, "fast: tag 39 payload must be string");
2160-
}
2193+
result = sym_from_tag39_payload(mrb, v);
21612194
}
21622195
else if (mrb_cmp(mrb, tag, mrb_convert_uint32(mrb, CBOR_TAG_CLASS)) == 0) {
21632196
/* class/module — tag 49999 + string */

test/test.rb

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -454,8 +454,6 @@ def read(n)
454454
end
455455

456456
assert('tag 39: wrong payload type raises TypeError (both strategies)') do
457-
CBOR.symbols_as_uint32
458-
assert_raise(TypeError) { CBOR.decode("\xD8\x27\x65hello") } # string payload under uint32 strat
459457
CBOR.symbols_as_string
460458
assert_raise(TypeError) { CBOR.decode("\xD8\x27\x18\x2A") } # int payload under string strat
461459
CBOR.no_symbols
@@ -1070,10 +1068,13 @@ def initialize(v = 0); @val = v; end
10701068
].each { |v| assert_equal v, CBOR.decode_fast(CBOR.encode_fast(v)) }
10711069
end
10721070

1073-
assert('fast: symbols always encode as tag 39 + string regardless of mode') do
1071+
assert('fast: symbols roundtrip via tag 39 hybrid (presym uint / runtime string) regardless of mode') do
10741072
CBOR.no_symbols # mode doesn't matter for fast path
10751073
assert_equal :hello, CBOR.decode_fast(CBOR.encode_fast(:hello))
10761074
assert_equal({ hello: 1, world: 2 }, CBOR.decode_fast(CBOR.encode_fast({hello: 1, world: 2})))
1075+
# Runtime sym (above MRB_PRESYM_MAX) takes the string branch
1076+
runtime = "fast_runtime_#{rand(1 << 20)}".to_sym
1077+
assert_equal runtime, CBOR.decode_fast(CBOR.encode_fast(runtime))
10771078
end
10781079

10791080
assert('fast: class and module roundtrip via tag 49999') do

0 commit comments

Comments
 (0)