diff --git a/benchmark/desk/gen/bench-unum-grid.hoon b/benchmark/desk/gen/bench-unum-grid.hoon new file mode 100644 index 0000000..ce8dc6f --- /dev/null +++ b/benchmark/desk/gen/bench-unum-grid.hoon @@ -0,0 +1,30 @@ +:: bench-unum-grid: run the whole /lib/unum timing grid in one dojo call. +:: +bench-unum-grid n=@ud +:: For each (door, arm) it ~&-prints a [%cell door arm] label, then +cell's +:: ~>(%bout) slogs "took ..". Per-call = (arm took - base took) / n. Also +:: runs +fdp-cell per door (per-element = took / n). Returns the folded +:: results to force evaluation. Run once on the jetted build (hints on) and +:: once interpreted (hints commented) and diff the scraped times. +:: +/+ uc=unum-cells +:- %say +|= [* [n=@ud ~] ~] +:- %noun +=/ arms=(list @tas) + :~ %base %add %sub %mul %div %fma %sqt %neg + %exp %log %sin %cos %atan %pow %lth + == +=/ doors=(list @tas) ~[%rpb %rph %rps] +=/ rows + %+ turn doors + |= door=@tas + %+ turn arms + |= arm=@tas + ~& [%cell door arm] + (cell:uc door arm n) +=/ fdps + %+ turn doors + |= door=@tas + ~& [%fdp door] + (fdp-cell:uc door n) +[rows fdps] diff --git a/benchmark/desk/gen/bench-unum.hoon b/benchmark/desk/gen/bench-unum.hoon new file mode 100644 index 0000000..bd8669b --- /dev/null +++ b/benchmark/desk/gen/bench-unum.hoon @@ -0,0 +1,9 @@ +:: bench-unum: run ONE /lib/unum timing cell. +bench-unum [door arm n] +:: door ?(%rpb %rph %rps) ; arm @tas (or %base) ; n @ud +:: +cell's ~>(%bout) prints elapsed; per-call = (cell(arm) - cell(%base)) / n. +:: +/+ uc=unum-cells +:- %say +|= [* [door=?(%rpb %rph %rps) arm=@tas n=@ud ~] ~] +:- %noun +(cell:uc door arm n) diff --git a/benchmark/desk/lib/bench-core.hoon b/benchmark/desk/lib/bench-core.hoon new file mode 100644 index 0000000..df49f1c --- /dev/null +++ b/benchmark/desk/lib/bench-core.hoon @@ -0,0 +1,31 @@ +:: bench-core: shared timing loop for the numerics benchmark suite. +:: +:: `time` runs a tight loop over a PRECOMPUTED list of inputs, wrapped in +:: ~>(%bout ...), which PRINTS the elapsed time ("took ms/..") as a slog and +:: RETURNS the folded accumulator. The host driver scrapes the printed line; +:: the returned value forces evaluation of every call (defeats dead-code +:: elimination). +:: +:: CRITICAL: inputs are precomputed by the caller OUTSIDE this gate, so the +:: slow interpreted @ud->@rX conversions (sun:rd / san:rd, ~93 us/call) are +:: NOT charged to per-call cost. Inside the timed loop the only work is the +:: arm under test plus a jetted atom-add fold -- both the input list walk +:: (O(1) head access) and the fold are cheap, so the measured time reflects +:: the arm, isolated. Each input is a [x y] pair; y is unused (0) for the +:: single-argument arms and carries the second operand for atan2/pow/pow-n. +:: +:: Per-call cost = (time(arm-list) - time(base-list)) / n, where the base list +:: uses the same inputs but the step skips the transcendental. +:: +|% +:: +time: walk the precomputed input list, timed by %bout; return the acc. +:: +++ time + |= [xs=(list [x=@ y=@]) step=$-([[x=@ y=@] acc=@] @)] + ^- @ + ~> %bout + =/ acc=@ `@`0 + |- ^- @ + ?~ xs acc + $(xs t.xs, acc (step i.xs acc)) +-- diff --git a/benchmark/desk/lib/unum-cells.hoon b/benchmark/desk/lib/unum-cells.hoon new file mode 100644 index 0000000..155a73f --- /dev/null +++ b/benchmark/desk/lib/unum-cells.hoon @@ -0,0 +1,59 @@ +:: unum-cells: per-(door,arm) timing cell for /lib/unum (posit) benchmarks, +:: mirroring lib/bench-cells (the /lib/math harness). +cell precomputes the +:: posit input list (sun/div kept OUT of the hot path) and folds the arm over +:: it via (time ..), whose ~>(%bout) slogs "took ..". Per-call cost = +:: (cell(arm) - cell(%base)) / n. +fdp-cell times one fused dot product over +:: length-n vectors (per-element = took / n). +:: +:: The posit width is selected at runtime by %*-specializing the generic ++pp +:: core on bloq (3/4/5 = posit8/16/32); the unum jets read bloq from that +:: sample, so this one body benchmarks every width and both the jetted (hints +:: on) and interpreted (hints commented) builds. +:: +/+ *bench-core, unum +|% +:: +dor: door @tas -> bloq +++ dor |=(door=@tas ?:(=(door %rpb) 3 ?:(=(door %rph) 4 5))) +:: +cell: time `n` folds of `arm` at posit width `door`. +++ cell + |= [door=@tas arm=@tas n=@ud] + ^- @ + =/ d %*(. pp:unum bloq (dor door)) + :: inputs in [0.25, 1.25]: positive, in-range for exp/log/sqt and all ops. + =/ inp |=(k=@ud ^-(@ (div:d (sun:d +((mod k 5))) (sun:d 4)))) + =/ xs=(list [x=@ y=@]) + %+ turn (gulf 0 (dec n)) + |= k=@ud ^-([@ @] [(inp k) (inp +(k))]) + =/ step + |= [p=[x=@ y=@] acc=@] ^- @ + %+ add acc + ?+ arm ~|([%bad-arm arm] !!) + %base x.p + %neg (neg:d x.p) + %abs (abs:d x.p) + %sqt (sqt:d x.p) + %exp (exp:d x.p) + %log (log:d x.p) + %sin (sin:d x.p) + %cos (cos:d x.p) + %atan (atan:d x.p) + %add (add:d x.p y.p) + %sub (sub:d x.p y.p) + %mul (mul:d x.p y.p) + %div (div:d x.p y.p) + %fma (fma:d x.p y.p x.p) + %pow (pow:d x.p y.p) + %lth ?:((lth:d x.p y.p) 1 0) + == + (time xs step) +:: +fdp-cell: time one fused dot product over length-n posit vectors. +++ fdp-cell + |= [door=@tas n=@ud] + ^- @ + =/ d %*(. pp:unum bloq (dor door)) + =/ inp |=(k=@ud ^-(@ (div:d (sun:d +((mod k 5))) (sun:d 4)))) + =/ av=(list @) (turn (gulf 0 (dec n)) inp) + =/ bv=(list @) (turn (gulf 0 (dec n)) |=(k=@ud (inp +(k)))) + ~> %bout + (fdp:d av bv) +-- diff --git a/benchmark/results/2026-06-28/unum/README.md b/benchmark/results/2026-06-28/unum/README.md new file mode 100644 index 0000000..8386d29 --- /dev/null +++ b/benchmark/results/2026-06-28/unum/README.md @@ -0,0 +1,47 @@ +# `/lib/unum` (posit) jet benchmark — 2026-06-28 + +Three-way per-call comparison of the `/lib/unum` arms: **interpreted** Hoon vs +**Python/SoftUnum** (the C lib via ctypes) vs **jetted** Hoon, at posit8/16/32 +(`rpb`/`rph`/`rps`). Full table in [`table.txt`](table.txt). + +## Method (mirrors the `/lib/math` `bench-math` protocol) + +- `gen/bench-unum-grid` (→ `lib/unum-cells`) precomputes a posit input list + *outside* the timed loop, then folds each arm `n` times inside `~>(%bout)`, + which slogs `took …`. Per-call = `(arm − base) / n`. The `%bout` value with + the dots stripped is microseconds. +- **One jet binary, hint toggle.** Jetted = `/lib/unum` with its `~%`/`~/` + hints; interpreted = the same file with the hints commented out (so no jet + matches and the pure Hoon runs). Both on a hoon-135 fakezod (the 408k pill). +- Jetted measured at `n=100,000`; interpreted at `n=100` (interpreted posit + transcendentals are ~20–60 ms/call, so a larger `n` is impractical). +- Python column: `tools/bench_unum_report.py` times SoftUnum via ctypes + (200k calls/arm) — the raw C speed a Python user sees (call overhead included). + +## Headline numbers (jetted vs interpreted) + +| class | jetted | interpreted | speedup | +|---|---|---|---| +| arithmetic (`add`/`sub`/`mul`/`div`/`fma`) | ~1.5–2 µs | ~270–530 µs | **~180–295×** | +| `sqrt` posit8 | 1.4 µs | 338 µs | 238× | +| transcendentals (`exp`/`log`/`sin`/`cos`/`pow`) | ~12–60 µs | ~18–57 ms | **~760–2000×** | +| `fdp` (fused dot product, per element) | 0.09–0.27 µs | ~205 µs | **760–2300×** | + +The fused dot product shows the largest win: the jet runs the whole quire +accumulation in C (no per-element noun allocation), so posit8 `fdp` is ~0.09 +µs/element — ~2300× the interpreted Hoon. + +## Caveat: posit16/32 `sqrt`/`atan` are slow even jetted + +`sqt`/`atan` widen sharply at posit16/32 because SoftUnum's wide path uses a +512-bit fixed `wide_t` with a bit-by-bit integer sqrt; e.g. jetted `atan:rps` is +~850 µs/call (still 75× the interpreted Hoon, but far off the posit8 45 µs). +The 512-bit `isqt`/AGM is the bottleneck — a candidate for a faster wide sqrt if +posit32 transcendental throughput matters. + +## Files + +- `table.txt` — the full per-call table. +- `jetted-n100000.txt`, `interp-n100.txt` — raw scraped `%bout` grids. +- harness: `benchmark/desk/lib/unum-cells.hoon`, `benchmark/desk/gen/bench-unum{,-grid}.hoon`, + `benchmark/tools/bench_unum_report.py`. diff --git a/benchmark/results/2026-06-28/unum/interp-n100.txt b/benchmark/results/2026-06-28/unum/interp-n100.txt new file mode 100644 index 0000000..897c150 --- /dev/null +++ b/benchmark/results/2026-06-28/unum/interp-n100.txt @@ -0,0 +1,96 @@ +[%cell %rpb %base] +took µs/204 +[%cell %rpb %add] +took ms/37.571 +[%cell %rpb %sub] +took ms/33.376 +[%cell %rpb %mul] +took ms/27.582 +[%cell %rpb %div] +took ms/33.893 +[%cell %rpb %fma] +took ms/52.949 +[%cell %rpb %sqt] +took ms/34.048 +[%cell %rpb %neg] +took ms/2.191 +[%cell %rpb %exp] +took s/1.848.485 +[%cell %rpb %log] +took s/3.599.430 +[%cell %rpb %sin] +took s/3.023.956 +[%cell %rpb %cos] +took s/3.026.054 +[%cell %rpb %atan] +took s/6.166.146 +[%cell %rpb %pow] +took s/5.304.743 +[%cell %rpb %lth] +took ms/2.309 +[%cell %rph %base] +took µs/189 +[%cell %rph %add] +took ms/38.078 +[%cell %rph %sub] +took ms/33.866 +[%cell %rph %mul] +took ms/26.992 +[%cell %rph %div] +took ms/34.390 +[%cell %rph %fma] +took ms/49.550 +[%cell %rph %sqt] +took ms/33.723 +[%cell %rph %neg] +took ms/2.117 +[%cell %rph %exp] +took s/1.916.618 +[%cell %rph %log] +took s/3.740.115 +[%cell %rph %sin] +took s/3.081.869 +[%cell %rph %cos] +took s/3.109.865 +[%cell %rph %atan] +took s/6.351.491 +[%cell %rph %pow] +took s/5.535.635 +[%cell %rph %lth] +took ms/2.283 +[%cell %rps %base] +took µs/202 +[%cell %rps %add] +took ms/37.676 +[%cell %rps %sub] +took ms/34.177 +[%cell %rps %mul] +took ms/27.852 +[%cell %rps %div] +took ms/34.635 +[%cell %rps %fma] +took ms/49.469 +[%cell %rps %sqt] +took ms/34.359 +[%cell %rps %neg] +took ms/2.170 +[%cell %rps %exp] +took s/1.942.873 +[%cell %rps %log] +took s/3.906.881 +[%cell %rps %sin] +took s/3.229.494 +[%cell %rps %cos] +took s/3.248.253 +[%cell %rps %atan] +took s/6.371.033 +[%cell %rps %pow] +took s/5.696.497 +[%cell %rps %lth] +took ms/2.282 +[%fdp %rpb] +took ms/20.458 +[%fdp %rph] +took ms/20.457 +[%fdp %rps] +took ms/20.549 diff --git a/benchmark/results/2026-06-28/unum/jetted-n100000.txt b/benchmark/results/2026-06-28/unum/jetted-n100000.txt new file mode 100644 index 0000000..81e52c6 --- /dev/null +++ b/benchmark/results/2026-06-28/unum/jetted-n100000.txt @@ -0,0 +1,96 @@ +[%cell %rpb %base] +took ms/201.210 +[%cell %rpb %add] +took ms/347.461 +[%cell %rpb %sub] +took ms/350.724 +[%cell %rpb %mul] +took ms/353.591 +[%cell %rpb %div] +took ms/366.860 +[%cell %rpb %fma] +took ms/379.836 +[%cell %rpb %sqt] +took ms/343.556 +[%cell %rpb %neg] +took ms/269.387 +[%cell %rpb %exp] +took s/1.361.196 +[%cell %rpb %log] +took s/1.992.308 +[%cell %rpb %sin] +took s/1.734.641 +[%cell %rpb %cos] +took s/1.725.140 +[%cell %rpb %atan] +took s/4.711.162 +[%cell %rpb %pow] +took s/2.921.758 +[%cell %rpb %lth] +took ms/341.119 +[%cell %rph %base] +took ms/201.493 +[%cell %rph %add] +took ms/378.660 +[%cell %rph %sub] +took ms/380.545 +[%cell %rph %mul] +took ms/368.763 +[%cell %rph %div] +took ms/360.478 +[%cell %rph %fma] +took ms/398.496 +[%cell %rph %sqt] +took s/1.140.286 +[%cell %rph %neg] +took ms/275.133 +[%cell %rph %exp] +took s/2.730.851 +[%cell %rph %log] +took s/4.091.147 +[%cell %rph %sin] +took s/3.613.529 +[%cell %rph %cos] +took s/3.703.221 +[%cell %rph %atan] +took s/46.093.061 +[%cell %rph %pow] +took s/6.134.767 +[%cell %rph %lth] +took ms/341.385 +[%cell %rps %base] +took ms/213.612 +[%cell %rps %add] +took ms/382.517 +[%cell %rps %sub] +took ms/389.338 +[%cell %rps %mul] +took ms/375.795 +[%cell %rps %div] +took ms/357.904 +[%cell %rps %fma] +took ms/403.311 +[%cell %rps %sqt] +took s/1.944.009 +[%cell %rps %neg] +took ms/287.158 +[%cell %rps %exp] +took s/2.745.098 +[%cell %rps %log] +took s/4.200.254 +[%cell %rps %sin] +took s/3.688.782 +[%cell %rps %cos] +took s/3.781.378 +[%cell %rps %atan] +took s/85.240.487 +[%cell %rps %pow] +took s/6.279.328 +[%cell %rps %lth] +took ms/341.757 +[%fdp %rpb] +took ms/8.910 +[%fdp %rph] +took ms/26.396 +[%fdp %rps] +took ms/26.986 diff --git a/benchmark/results/2026-06-28/unum/table.txt b/benchmark/results/2026-06-28/unum/table.txt new file mode 100644 index 0000000..e7b746d --- /dev/null +++ b/benchmark/results/2026-06-28/unum/table.txt @@ -0,0 +1,54 @@ + +/lib/unum jet benchmark (jetted n=100,000; interpreted n=100) +per-call microseconds; speedup = interpreted / jetted + +width arm interp µs jetted µs py/SU µs jet vs interp +---------------------------------------------------------- +rpb add 373.67 1.463 0.179 255x +rpb sub 331.72 1.495 0.179 222x +rpb mul 273.78 1.524 0.183 180x +rpb div 336.89 1.657 0.185 203x +rpb fma 527.45 1.786 0.239 295x +rpb sqt 338.44 1.423 0.155 238x +rpb neg 19.87 0.682 0.112 29x +rpb lth 21.05 1.399 - 15x +rpb exp 18482.81 11.600 1.237 1593x +rpb log 35992.26 17.911 2.057 2010x +rpb sin 30237.52 15.334 1.685 1972x +rpb cos 30258.50 15.239 1.716 1986x +rpb atan 61659.42 45.100 4.884 1367x +rpb pow 53045.39 27.205 3.318 1950x +rpb fdp 204.58 0.089 - 2296x + +rph add 378.89 1.772 0.239 214x +rph sub 336.77 1.791 0.229 188x +rph mul 268.03 1.673 0.220 160x +rph div 342.01 1.590 0.227 215x +rph fma 493.61 1.970 0.296 251x +rph sqt 335.34 9.388 0.377 36x +rph neg 19.28 0.736 0.116 26x +rph lth 20.94 1.399 - 15x +rph exp 19164.29 25.294 3.375 758x +rph log 37399.26 38.897 5.503 962x +rph sin 30816.80 34.120 4.646 903x +rph cos 31096.76 35.017 4.670 888x +rph atan 63513.02 458.916 19.593 138x +rph pow 55354.46 59.333 8.728 933x +rph fdp 204.57 0.264 - 775x + +rps add 374.74 1.689 0.234 222x +rps sub 339.75 1.757 0.229 193x +rps mul 276.50 1.622 0.220 170x +rps div 344.33 1.443 0.212 239x +rps fma 492.67 1.897 0.304 260x +rps sqt 341.57 17.304 0.563 20x +rps neg 19.68 0.735 0.119 27x +rps lth 20.80 1.281 - 16x +rps exp 19426.71 25.315 3.461 767x +rps log 39066.79 39.866 6.271 980x +rps sin 32292.92 34.752 4.828 929x +rps cos 32480.51 35.678 4.918 910x +rps atan 63708.31 850.269 27.035 75x +rps pow 56962.95 60.657 9.568 939x +rps fdp 205.49 0.270 - 761x + diff --git a/benchmark/tools/bench_unum_report.py b/benchmark/tools/bench_unum_report.py new file mode 100644 index 0000000..d775fef --- /dev/null +++ b/benchmark/tools/bench_unum_report.py @@ -0,0 +1,88 @@ +#!/usr/bin/env python3 +"""Assemble the /lib/unum benchmark table: interpreted Hoon vs Python/SoftUnum +vs jetted Hoon, per (posit width, arm). + +Inputs: two scraped %bout grids (jetted at n=JN, interpreted at n=IN). The +%bout line is `took /`; stripping the dots yields the time in +MICROSECONDS (the unit label is just magnitude). Per-call = (arm - base) / n. +The Python column times SoftUnum (the C lib the jet calls) via ctypes, which is +the speed a Python user gets (call overhead included). + + /opt/anaconda3/bin/python tools/bench_unum_report.py JET.txt INTERP.txt JN IN +""" +import ctypes, os, re, sys, time + +jet_f, int_f = sys.argv[1], sys.argv[2] +JN, IN = int(sys.argv[3]), int(sys.argv[4]) + +def parse(path): + # -> {(door,arm): microseconds}, plus ('fdp',door) + out, label = {}, None + for ln in open(path): + m = re.search(r'\[%cell %(\w+) %([\w-]+)\]', ln) + f = re.search(r'\[%fdp %(\w+)\]', ln) + t = re.search(r'took\s+\S+?/([\d.]+)', ln) + if m: label = (m.group(1), m.group(2)) + elif f: label = ('fdp', f.group(1)) + elif t and label: + out[label] = int(t.group(1).replace('.', '')) # dots -> µs + label = None + return out + +jet, itp = parse(jet_f), parse(int_f) +DOORS = ['rpb', 'rph', 'rps'] +ARMS = ['add','sub','mul','div','fma','sqt','neg','lth', + 'exp','log','sin','cos','atan','pow'] + +# --- Python/SoftUnum (ctypes) per-call --- +HERE = os.path.dirname(os.path.abspath(__file__)) +LIB = os.path.join(HERE, '..', '..', 'src', 'SoftPosit') # placeholder +LIB = os.path.expanduser('~/urbit/SoftUnum/libsoftunum.dylib') +L = ctypes.CDLL(LIB) +u32 = ctypes.c_uint32 +PFX = {'rpb': 'p8', 'rph': 'p16', 'rps': 'p32'} +ONE = {'rpb': 0x40, 'rph': 0x4000, 'rps': 0x40000000} +HALF = {'rpb': 0x38, 'rph': 0x3800, 'rps': 0x38000000} +UN = {'sqt','neg','exp','log','sin','cos','atan'} +BIN = {'add','sub','mul','div','pow','lth'} + +def bind(pfx, arm): + nm = {'sqt':'sqrt'}.get(arm, arm) + f = getattr(L, f'{pfx}_{nm}') + n = 1 if arm in UN else (3 if arm=='fma' else 2) + f.argtypes = [u32]*n; f.restype = ctypes.c_int if arm=='lth' else u32 + return f + +def py_us(door, arm): + pfx = PFX[door]; f = bind(pfx, arm); x = HALF[door]; y = ONE[door] + args = (x,) if arm in UN else ((x,y,y) if arm=='fma' else (x,y)) + M = 200_000 + for _ in range(2000): f(*args) + t0 = time.perf_counter() + for _ in range(M): f(*args) + return (time.perf_counter()-t0)/M*1e6 + +def percall(d, door, arm, n): + base = d.get((door,'base')); v = d.get((door,arm)) + if base is None or v is None: return None + return max(v-base, 0)/n + +print(f"\n/lib/unum jet benchmark (jetted n={JN:,}; interpreted n={IN:,})") +print("per-call microseconds; speedup = interpreted / jetted\n") +hdr = f"{'width':5} {'arm':5} {'interp µs':>11} {'jetted µs':>10} {'py/SU µs':>9} {'jet vs interp':>13}" +print(hdr); print('-'*len(hdr)) +for door in DOORS: + for arm in ARMS: + ji = percall(jet, door, arm, JN); ii = percall(itp, door, arm, IN) + try: pu = py_us(door, arm) + except Exception: pu = None + if ji is None or ii is None: continue + sp = (ii/ji) if ji else float('inf') + print(f"{door:5} {arm:5} {ii:11.2f} {ji:10.3f} " + f"{(f'{pu:.3f}' if pu is not None else '-'):>9} {sp:12.0f}x") + # fdp: per-element = took / n (no base) + fj = jet.get(('fdp',door)); fi = itp.get(('fdp',door)) + if fj and fi: + fjp, fip = fj/JN, fi/IN + print(f"{door:5} {'fdp':5} {fip:11.2f} {fjp:10.3f} {'-':>9} {fip/fjp:12.0f}x") + print() diff --git a/libmath/NEXT-STEPS.md b/libmath/NEXT-STEPS.md index 09ad5a9..4f2545e 100644 --- a/libmath/NEXT-STEPS.md +++ b/libmath/NEXT-STEPS.md @@ -7,6 +7,72 @@ IEEE-754 conversion, the quire + fused dot product, and naive transcendentals, at widths posit8/16/32/64/128. This file tracks what is deliberately *not* yet done, roughly in dependency order. +## Roadmap — 2026-06-28 (post-jets) + +The jet effort (old §5) is **done**: **SoftUnum** (`sigilante/SoftUnum`, the +pure-integer C twin of `/lib/unum`, verified four ways) is vendored into vere +(`ext/softunum`) and the full `/lib/unum` surface (54 arms, posit8/16/32) is +jetted under `non/unum`, fired bit-exact on a hoon-135 fakezod, and benchmarked +(arithmetic ~180–295×, transcendentals ~760–2000×, `fdp` ~760–2300× over +interpreted). PRs: numerics #65 (hints + reference + valids appendix + +benchmarks); vere #1046 (draft, stacked on the math-jets PR #1044). + +Prioritized next steps: + +0. **Land the PRs.** numerics #65 → merge. vere #1046 is gated on #1044 + (math jets) and #1022 (lagoon SoftBLAS) merging; then re-target #1046 to + `develop` and mark ready. Registration is hoon-135 only (lowest kelvin). + +1. **Posit linear algebra — the Saloon/Lagoon payoff (highest value).** + `%unum` is already wired into Lagoon (PR #42) and Saloon eig (PR #58), so + Saloon-over-posits works and now inherits the *scalar* jet speedup (its + inner `mmul`/`dot`/`sqt`/`add` hit the jetted unum ops). The remaining win + is **array-level**: a SoftUnum-backed **posit GEMM/dot using the quire** + (the analog of SoftBLAS), vendored like SoftBLAS, with a Lagoon `%unum` + `dot`/`mmul` jet dispatching to it -- one exact-accumulated C call instead + of a Hoon loop over per-element `fdp`. This is the headline posit feature: + exact dot product -> matmul with no error accumulation. Saloon + decompositions inherit it for free. + +2. **Perf gaps the benchmark surfaced.** + - posit16/32 `sqt`/`atan` are slow even jetted (jetted `atan:rps` ~850 us): + the 512-bit `wide_t` **bit-by-bit `isqt`** in the AGM loop is the + bottleneck. Replace with a faster wide sqrt (Newton + wide divmod, or an + `__int128` seed then refine). + - posit16 over-uses the 512-bit `wide_t` (its quire is 256-bit, arithmetic + fits `__int128`); a tighter p16 path would shave the common arithmetic. + +3. **Coverage: posit64/128.** SoftUnum does 8/16/32; `rpd`/`rpq` jets return + `u3_none` (fall back to Hoon). The `wide_t` machinery already exists -- + extend SoftUnum to 64/128 (1024/2048-bit quire) to jet the last two widths. + No external oracle (cerlane `pX2` caps at 32); verify vs the from-scratch + reference + the Hoon. + +4. **Accuracy: range-reduced / quire-accumulated transcendentals** (old §2). + The naive Taylor `exp`/`log`/`sin` are accurate only near the expansion + point. Range-reduce (powers of 2 are exact for posits; trig by pi/2) and + run the series through the quire. Coordinated Hoon + SoftUnum change (both + stay bit-exact). + +5. **Standard-name alias layer** (old §1): the 2022-standard public names + (`addition`/`subtraction`/`sin-pi`/`compound`/`hypot`/`arctan2`/...) over + the implemented core; mostly renames + thin compositions. + +6. **Valids** (the Type-III interval class) -- see the appendix below. Lowest + priority: new surface, no SoftPosit oracle, loosely standardized. + +7. **Parallel Rust SoftUnum** (for NockApp; not urgent). C and Rust can't + share code, so "same behavior" is enforced by a language-neutral + test-vector corpus generated from the oracle that both reproduce. The C + repo reserves a top-level `rust/` seam. + +8. **Upstream SoftUnum -> `urbit/SoftUnum`** (like `urbit/SoftBLAS`) once it + stabilizes, and re-pin the vere `ext/softunum` tarball there. + +9. **Decimal printer / literal syntax** for the posit auras (`@rpb`..`@rpq`): + emit the §6.3 minimum significant digits. Optional local runtime patch, + decoupled from the library. + ## Verification & tooling - **Oracle**: SoftPosit (vendored C in `src/SoftPosit`, plus the `softposit` @@ -79,12 +145,15 @@ The interval class (`@rvb/@rvh/@rvs`). Entirely new surface: an interval is a pair of posit endpoints with open/closed tags; arithmetic is interval arithmetic. Lowest priority; design from Gustafson's Type-III definition. -## 5. Jets (separate, C/vere effort) +## 5. Jets (DONE -- see the 2026-06-28 roadmap above) -The library is pure Hoon by design. SoftPosit's C (`src/SoftPosit/source/*.c`: -`pX2_*`, `qX2_*`, conversions) is the reference implementation and the spec for a -future jet, vendored into vere like SoftBLAS. Only worth it once a consumer -(e.g. lagoon `%unum` matmul) needs the speed. +**Done.** Rather than vendor SoftPosit (whose `pX2` caps at 32 bits and has no +transcendentals), we built **SoftUnum** (`sigilante/SoftUnum`) -- the bit-exact +pure-integer C twin of this library -- vendored it into vere (`ext/softunum`), +and jetted the full surface (54 arms, posit8/16/32) under `non/unum`. Verified +firing bit-exact on a hoon-135 fakezod; benchmarked in `benchmark/results/`. +PRs: numerics #65, vere #1046 (draft on #1044). Follow-on perf/coverage work +(posit GEMM, wide `isqt`, posit64/128) is in the roadmap above. ## Known minor items @@ -98,3 +167,75 @@ future jet, vendored into vere like SoftBLAS. Only worth it once a consumer - `twoc.hoon` now has corrected `lth`/`lte`/`gte` and `overflow`; the only former consumer (`lagoon-old.hoon`) was deleted in PR #38. A real `%int2`-into-lagoon effort would be twoc's first live caller. + +--- + +# Appendix: Valids (Type-III interval unums) — assessment & open questions + +Status as of 2026-06-28. Valids are the third Type-III unum (posits · quires · +**valids**), the rigorous-interval class. `/lib/unum` does not implement them; +this appendix records the design assessment so the eventual effort starts from a +shared understanding rather than a blank page. Lowest priority (no consumer, no +oracle, loosely standardized) — slot it *after* the posit/quire jets. + +## What a valid is + +A valid is a *guaranteed enclosure* of a real quantity: two posit-like +**endpoints, each tagged with a "ubit" (uncertainty bit)**. ubit = 0 → the +endpoint is exact (closed); ubit = 1 → "the open interval between this posit and +its neighbour". So a valid encodes `[lo, hi]` / `(lo, hi)` / half-open — a value +*and* its uncertainty. Every operation returns the **tightest valid containing +all possible results** (Gustafson's "end of error": a provable bound, not a +single rounded answer). + +**The projective twist.** Valids live on the *projective* real line — the reals +closed into a ring with a single point at the top (the NaR/∞ point, shared with +the posit bit layout). This lets a valid represent **exterior / wrapping +intervals** ("x < −3 OR x > 5") and unbounded sets ("x > 5") by going the long +way round through infinity. Consequence: compare / union / intersection are +arithmetic on *arcs of a circle*, not segments of a line — genuinely different +semantics from classical `[lo,hi]` interval arithmetic. + +## What it would build on (and the one real new primitive) + +Endpoints are posits, so valid arithmetic reuses the SoftUnum / `/lib/unum` +posit core. The new building block is **directed rounding**: our posit `+bit` +only rounds nearest-even, but valid endpoints must round *outward* — the lower +bound toward −∞, the upper toward +∞ — to keep the enclosure sound. So we need: + + - a round-toward-±∞ posit encoder (round-down / round-up of an exact value), + - `+next` / `+prior` (lexicographic successor / predecessor of a posit bit + pattern — already a TODO in §1 above; `+(p)` / `(dec p)` with NaR/extreme + handling), + - the ubit then records residual openness after directed rounding. + +Everything else (endpoint add/sub/mul/div) is the existing posit arithmetic. + +## Open design questions (pin these BEFORE writing code) + +1. **Layout / width.** Does `@rvb`/`@rvh`/`@rvs` mean a valid *built from two + `posit{8,16,32}` endpoints*, or Gustafson's "valid⟨2n⟩ = two n-bit + ubit-posits" packed into a byte/half/single? The byte/half/single aura names + mirror posits and are currently ambiguous. **First decision.** +2. **Projective vs bounded.** Full Gustafson (wrapping / exterior intervals on + the projective ring) — the "real" valid — vs a simpler bounded `[lo,hi]` + interval arithmetic that covers most numeric use with far less machinery. +3. **Set operations & special values.** intersection / union / complement / + `is-empty` / `is-everything`, plus the special encodings (empty set, + all-reals, NaR). +4. **Comparisons.** Posit ordering == two's-complement of the raw bits; that + identity does **not** carry to valids — comparison becomes set / containment + relations, needs its own design. + +## Verification posture (harder than posits) + +- **No SoftPosit oracle** — SoftPosit is posits + quires only. Plan mirrors the + transcendentals: a from-scratch **exact-rational interval reference** (Python + intervals of `Fraction`s with directed rounding), and optionally **Stillwater + `Universal`** (C++), which does implement `valid<>` types, as a second oracle. +- **Standardization caveat.** The 2022 Posit Standard normatively pins posits + and quires; valids are described in Gustafson's broader work but are *not* + specified to the same degree. So this is a clean-room *design* of both the + Hoon `/lib/unum` arms and the C — NOT a transliteration of a fixed spec, which + is a different (looser) posture from everything done so far, where `/lib/unum` + was the bit-exact target. diff --git a/libmath/desk/lib/unum.hoon b/libmath/desk/lib/unum.hoon index badf4aa..2f1926a 100644 --- a/libmath/desk/lib/unum.hoon +++ b/libmath/desk/lib/unum.hoon @@ -31,6 +31,7 @@ :: gates of the same name. Internal integer arithmetic therefore uses the :: ^-prefixed forms (^add/^sub/^mul/^div) to reach the standard library. :: +~% %non ..part ~ :: nest non in hex for now (jet chapter, see /lib/math) |% :: G-layer (decoded) representation of a posit, mirroring the stdlib float +$fn :: (`[%f s=? e=@s a=@u]`): value = (sign) a * 2^e, with a an integer @@ -61,6 +62,7 @@ :: Specialize with %*: posit8 is bloq=3 (n=8), posit16 bloq=4, posit32 bloq=5. :: ++ pp + ~/ %unum |_ =bloq ++ n (bex bloq) ++ es 2 @@ -168,15 +170,32 @@ ++ gte-s |=([a=@s b=@s] ^-(? !=(-1 (cmp:si a b)))) ++ lte-s |=([a=@s b=@s] ^-(? !=(--1 (cmp:si a b)))) :: Comparisons (= two's-complement integer ordering of the raw bits, sec 5.3). - ++ gth |=([a=@ b=@] ^-(? (~(gth twoc:twoc bloq) a b))) - ++ lth |=([a=@ b=@] ^-(? (~(lth twoc:twoc bloq) a b))) - ++ gte |=([a=@ b=@] ^-(? (~(gte twoc:twoc bloq) a b))) - ++ lte |=([a=@ b=@] ^-(? (~(lte twoc:twoc bloq) a b))) - ++ equ |=([a=@ b=@] ^-(? =(a b))) - ++ neq |=([a=@ b=@] ^-(? !=(a b))) - ++ neg |=(a=@ ^-(@ (dis msk (^sub (bex n) a)))) - ++ abs |=(a=@ ^-(@ ?:(=(1 (~(msb twoc:twoc bloq) a)) (neg a) a))) + ++ gth + ~/ %gth + |=([a=@ b=@] ^-(? (~(gth twoc:twoc bloq) a b))) + ++ lth + ~/ %lth + |=([a=@ b=@] ^-(? (~(lth twoc:twoc bloq) a b))) + ++ gte + ~/ %gte + |=([a=@ b=@] ^-(? (~(gte twoc:twoc bloq) a b))) + ++ lte + ~/ %lte + |=([a=@ b=@] ^-(? (~(lte twoc:twoc bloq) a b))) + ++ equ + ~/ %equ + |=([a=@ b=@] ^-(? =(a b))) + ++ neq + ~/ %neq + |=([a=@ b=@] ^-(? !=(a b))) + ++ neg + ~/ %neg + |=(a=@ ^-(@ (dis msk (^sub (bex n) a)))) + ++ abs + ~/ %abs + |=(a=@ ^-(@ ?:(=(1 (~(msb twoc:twoc bloq) a)) (neg a) a))) ++ sgn + ~/ %sgn |= a=@ ^- @ ?: =(zero a) zero @@ -185,6 +204,7 @@ one :: Arithmetic (sec 5.4); exact g-layer combine, single round via +bit. ++ mul + ~/ %mul |= [a=@ b=@] ^- @ =/ ua (sea a) @@ -195,6 +215,7 @@ ?> ?=(%p -.ub) (bit [%p =(s.ua s.ub) (sum:si e.ua e.ub) (^mul a.ua a.ub)]) ++ add + ~/ %add |= [a=@ b=@] ^- @ =/ ua (sea a) @@ -214,8 +235,11 @@ ?: (^gth s2 s1) (bit [%p s.ub emin (^sub s2 s1)]) zero - ++ sub |=([a=@ b=@] ^-(@ (add a (neg b)))) + ++ sub + ~/ %sub + |=([a=@ b=@] ^-(@ (add a (neg b)))) ++ div + ~/ %div |= [a=@ b=@] ^- @ =/ ua (sea a) @@ -245,6 +269,7 @@ r $(r nr) ++ sqt + ~/ %sqt |= p=@ ^- @ =/ u (sea p) @@ -277,17 +302,28 @@ =? hi &(?=(%up mode) s.u !=(0 rem)) +(hi) ?: =(0 hi) zero (bit [%p s.u --0 hi]) - ++ rnd (round %near) - ++ flr (round %down) - ++ cel (round %up) - ++ sun |=(v=@ ^-(@ ?:(=(0 v) zero (bit [%p %.y --0 v])))) + :: rnd/flr/cel: eta-expanded to unary gates wrapping +round so they jet. + ++ rnd + ~/ %rnd + |=(p=@ ((round %near) p)) + ++ flr + ~/ %flr + |=(p=@ ((round %down) p)) + ++ cel + ~/ %cel + |=(p=@ ((round %up) p)) + ++ sun + ~/ %sun + |=(v=@ ^-(@ ?:(=(0 v) zero (bit [%p %.y --0 v])))) ++ san + ~/ %san |= v=@s ^- @ =+ [sn mg]=(old:si v) ?: =(0 mg) zero (bit [%p sn --0 mg]) ++ toi + ~/ %toi |= p=@ ^- (unit @s) =/ u (sea p) @@ -300,6 +336,7 @@ =/ mag ?:((gte-s e.ur --0) (lsh [0 sh] a.ur) (rsh [0 sh] a.ur)) `(new:si s.ur mag) ++ fma + ~/ %fma |= [a=@ b=@ c=@] ^- @ =/ ua (sea a) @@ -330,6 +367,7 @@ :: :: +exp: @ -> @ (e^x = sum x^k/k!) ++ exp + ~/ %exp |= x=@ ^- @ =/ sum one @@ -342,6 +380,7 @@ $(nn +(nn)) :: +sin: @ -> @ (sum (-1)^k x^(2k+1)/(2k+1)!) ++ sin + ~/ %sin |= x=@ ^- @ =/ term x @@ -355,6 +394,7 @@ $(nn +(nn)) :: +cos: @ -> @ (sum (-1)^k x^2k/(2k)!) ++ cos + ~/ %cos |= x=@ ^- @ =/ term one @@ -367,9 +407,12 @@ =. sum (add sum term) $(nn +(nn)) :: +tan: @ -> @ - ++ tan |=(x=@ ^-(@ (div (sin x) (cos x)))) + ++ tan + ~/ %tan + |=(x=@ ^-(@ (div (sin x) (cos x)))) :: +pow-n: @ -> @u -> @ (integer power by repeated multiplication) ++ pow-n + ~/ %pow-n |= [x=@ p=@u] ^- @ ?: =(nar x) nar :: NaR propagates even when p=0 @@ -382,6 +425,7 @@ :: bit pattern, and posit zero) returns NaR rather than a divergent :: series result. Like the rest of /lib/math, accurate only near 1. ++ log + ~/ %log |= x=@ ^- @ ?: (lte x zero) nar @@ -397,14 +441,21 @@ =. sum (add sum (mul coef term)) $(nn +(nn)) :: +log-2 / +log-10: base-2 / base-10 logarithm - ++ log-2 |=(x=@ ^-(@ (div (log x) log2))) - ++ log-10 |=(x=@ ^-(@ (div (log x) log10))) + ++ log-2 + ~/ %log-2 + |=(x=@ ^-(@ (div (log x) log2))) + ++ log-10 + ~/ %log-10 + |=(x=@ ^-(@ (div (log x) log10))) :: +pow: @ -> @ -> @ (x^y = exp(y * log x)) - ++ pow |=([x=@ y=@] ^-(@ (exp (mul y (log x))))) + ++ pow + ~/ %pow + |=([x=@ y=@] ^-(@ (exp (mul y (log x))))) :: +factorial: @ -> @ (x! by repeated multiplication) :: Domain x >= 0 (NaR otherwise, and NaR propagates); for integer x this is :: exact up to the precision, halting once x <= 1. Mirrors /lib/math. ++ factorial + ~/ %factorial |= x=@ ^- @ ?: =(nar x) nar @@ -417,6 +468,7 @@ :: Domain x > 0 (NaR for x < 0, like the rest of the exp/log-based ops); :: cbrt(0) = 0. Mirrors /lib/math's `cbt = (pow x .0.33...)`. ++ cbrt + ~/ %cbrt |= x=@ ^- @ ?: =(nar x) nar @@ -428,6 +480,7 @@ :: (1+x^2)^-0.5 and 1. Fixed iteration count (AGM converges quadratically). :: Odd in x, so negative arguments are handled by the carried sign. ++ atan + ~/ %atan |= x=@ ^- @ ?: =(nar x) nar @@ -446,6 +499,7 @@ :: arcsin(x) = atan(x / sqrt(1 - x^2)) for |x| < 1; +-pi/2 at x = +-1; :: NaR outside [-1, 1]. Mirrors /lib/math. ++ asin + ~/ %asin |= x=@ ^- @ ?: =(nar x) nar @@ -458,6 +512,7 @@ :: arccos(x) = atan(sqrt(1 - x^2) / x) for 0 < |x| < 1 (pi/2 at 0); 0 at :: x = 1, pi at x = -1; NaR outside [-1, 1]. Mirrors /lib/math. ++ acos + ~/ %acos |= x=@ ^- @ ?: =(nar x) nar @@ -468,7 +523,9 @@ ?: (equ x (neg one)) pi nar :: +is-close: @ -> @ -> @ -> ? (|a - b| <= tol) - ++ is-close |=([a=@ b=@ tol=@] ^-(? (lte (abs (sub a b)) tol))) + ++ is-close + ~/ %is-close + |=([a=@ b=@ tol=@] ^-(? (lte (abs (sub a b)) tol))) :: :: Quire (sec 3.4 / 5.11): a 16n-bit fixed-point exact accumulator, held :: as a raw two's-complement atom. Sums of products accumulate exactly and @@ -482,6 +539,7 @@ ++ q-nar (bex (dec qbits)) ++ q-zero `@`0 ++ p-to-q + ~/ %p-to-q |= p=@ ^- @ =/ u (sea p) @@ -491,6 +549,7 @@ =/ m (lsh [0 (abs:si (sum:si e.u (sun:si qscale)))] a.u) ?:(s.u m (^sub qmod m)) ++ q-to-p + ~/ %q-to-p |= q=@ ^- @ =. q (dis (dec qmod) q) @@ -500,6 +559,7 @@ ?: =(0 acc) zero (bit [%p !neg (dif:si --0 (sun:si qscale)) acc]) ++ q-mul-add + ~/ %q-mul-add |= [q=@ a=@ b=@] ^- @ ?: =(q-nar q) q-nar @@ -512,18 +572,30 @@ =/ m (lsh [0 (abs:si :(sum:si e.ua e.ub (sun:si qscale)))] (^mul a.ua a.ub)) =/ qc ?:(=(s.ua s.ub) m (^sub qmod m)) (mod (^add q qc) qmod) - ++ q-mul-sub |=([q=@ a=@ b=@] ^-(@ (q-mul-add q a (neg b)))) - ++ q-add-p |=([q=@ p=@] ^-(@ (q-mul-add q p one))) - ++ q-sub-p |=([q=@ p=@] ^-(@ (q-mul-add q (neg p) one))) - ++ q-negate |=(q=@ ^-(@ ?:(=(q-nar q) q-nar (mod (^sub qmod q) qmod)))) + ++ q-mul-sub + ~/ %q-mul-sub + |=([q=@ a=@ b=@] ^-(@ (q-mul-add q a (neg b)))) + ++ q-add-p + ~/ %q-add-p + |=([q=@ p=@] ^-(@ (q-mul-add q p one))) + ++ q-sub-p + ~/ %q-sub-p + |=([q=@ p=@] ^-(@ (q-mul-add q (neg p) one))) + ++ q-negate + ~/ %q-negate + |=(q=@ ^-(@ ?:(=(q-nar q) q-nar (mod (^sub qmod q) qmod)))) ++ q-add-q + ~/ %q-add-q |= [x=@ y=@] ^- @ ?: |(=(q-nar x) =(q-nar y)) q-nar (mod (^add x y) qmod) - ++ q-sub-q |=([x=@ y=@] ^-(@ (q-add-q x (q-negate y)))) + ++ q-sub-q + ~/ %q-sub-q + |=([x=@ y=@] ^-(@ (q-add-q x (q-negate y)))) :: +fdp: (list @) -> (list @) -> @ (fused dot product, single rounding) ++ fdp + ~/ %fdp |= [av=(list @) bv=(list @)] ^- @ =| q=@ @@ -552,15 +624,31 @@ ?: =(0 a.f) [%z ~] [%p s.f e.f a.f] :: +to-rh/rs/rd/rq: this-width posit -> half/single/double/quad float - ++ to-rh |=(p=@ ^-(@rh (bit:rh (up-to-fn (sea p))))) - ++ to-rs |=(p=@ ^-(@rs (bit:rs (up-to-fn (sea p))))) - ++ to-rd |=(p=@ ^-(@rd (bit:rd (up-to-fn (sea p))))) - ++ to-rq |=(p=@ ^-(@rq (bit:rq (up-to-fn (sea p))))) + ++ to-rh + ~/ %to-rh + |=(p=@ ^-(@rh (bit:rh (up-to-fn (sea p))))) + ++ to-rs + ~/ %to-rs + |=(p=@ ^-(@rs (bit:rs (up-to-fn (sea p))))) + ++ to-rd + ~/ %to-rd + |=(p=@ ^-(@rd (bit:rd (up-to-fn (sea p))))) + ++ to-rq + ~/ %to-rq + |=(p=@ ^-(@rq (bit:rq (up-to-fn (sea p))))) :: +from-rh/rs/rd/rq: half/single/double/quad float -> this-width posit - ++ from-rh |=(r=@rh ^-(@ (bit (fn-to-up (sea:rh r))))) - ++ from-rs |=(r=@rs ^-(@ (bit (fn-to-up (sea:rs r))))) - ++ from-rd |=(r=@rd ^-(@ (bit (fn-to-up (sea:rd r))))) - ++ from-rq |=(r=@rq ^-(@ (bit (fn-to-up (sea:rq r))))) + ++ from-rh + ~/ %from-rh + |=(r=@rh ^-(@ (bit (fn-to-up (sea:rh r))))) + ++ from-rs + ~/ %from-rs + |=(r=@rs ^-(@ (bit (fn-to-up (sea:rs r))))) + ++ from-rd + ~/ %from-rd + |=(r=@rd ^-(@ (bit (fn-to-up (sea:rd r))))) + ++ from-rq + ~/ %from-rq + |=(r=@rq ^-(@ (bit (fn-to-up (sea:rq r))))) -- :: posit8 ("byte"), posit<8,2> ++ rpb %*(. pp bloq 3) diff --git a/libmath/vere/README.md b/libmath/vere/README.md new file mode 100644 index 0000000..06bc36f --- /dev/null +++ b/libmath/vere/README.md @@ -0,0 +1,40 @@ +# `/lib/unum` jets — vere reference + +Reference (master) copies of the hand-maintained C jet sources for the posit +(`/lib/unum`) jets, mirrored by hand into the vere runtime (`urbit/vere`, +branch `sigilante/unum-jets`). SoftUnum itself is *vendored* into vere +(`ext/softunum`, pinned tarball of `sigilante/SoftUnum`), so the only +hand-synced jet file is `unum.c`. + +## Files here + +- `noun/jets/i/unum.c` — the jet wrappers. One `%unum` core, each op + dispatched on `bloq` read from the `pp` door sample (gate axis 30), calling + SoftUnum `p8_*`/`p16_*`/`p32_*`. posit64/128 (bloq 6/7) return `u3_none` + (fall back to the pure-Hoon arm) until SoftUnum covers them. + +## Deltas applied in vere (not full copies — see the vere branch/PR) + +- `ext/softunum/{build.zig,build.zig.zon}` — vendor SoftUnum (mirror + `ext/softblas`); wired into `pkg/noun/build.zig{,.zon}`. +- `pkg/noun/build.zig` — add `jets/i/unum.c` to the noun sources. +- `pkg/noun/jets/w.h`, `q.h` — declare `u3wi_unum_*` / `u3qi_unum_*`. +- `pkg/noun/jets/135/tree.c` — register `non/unum/` in the **hoon-135** + dashboard. We ship to the lowest (current) kelvin only; the 408k pill boots + hoon-135, so the 135 dashboard is the one consulted. (Local testing against + newer kelvins may also touch `136/137/tree.c`, but those are NOT part of the + shipped change — only 135.) + +## Hoon side (`libmath/desk/lib/unum.hoon`) + +The library is jet-hinted to match: `~% %non ..part ~` on the file core, +`~/ %unum` on `++pp`, and `~/ %` on each jetted arm (e.g. `~/ %add`). + +## Status + +`++add:pp` **fires bit-exact and verified on a live hoon-135 fakezod**: +`(add:rpb:unum 0x40 0x40)` → `0x48`, `add:rph` → `0x4800`, `add:rps` → +`0x4800.0000`, each dispatching the right width (bloq 3/4/5 read from the door +sample at gate axis 30); posit64 (bloq 6) declines to the pure-Hoon arm and +still returns the correct value. The remaining scalar/quire/conversion arms +follow the same one-jet-per-op, bloq-dispatch pattern. diff --git a/libmath/vere/noun/jets/i/unum.c b/libmath/vere/noun/jets/i/unum.c new file mode 100644 index 0000000..283a16b --- /dev/null +++ b/libmath/vere/noun/jets/i/unum.c @@ -0,0 +1,579 @@ +/// @file +/// +/// Jets for the numerics `/lib/unum` library (2022 Posit Standard; userspace, +/// registered under the `non` chapter alongside `math` and `lagoon`). Each jet +/// calls SoftUnum (ext/softunum), the bit-exact C twin of `/lib/unum`, so jet +/// output is identical to the unjetted Hoon. +/// +/// `/lib/unum` is one generic posit core `++pp` (`|_ =bloq`) specialized into +/// `rpb`/`rph`/`rps`/`rpd`/`rpq` (posit8/16/32/64/128) via `%*`. We register a +/// SINGLE `%unum` core and dispatch on `bloq` read from the door sample (the +/// same one-jet-per-op, runtime-dispatch shape `lagoon` uses). bloq lives in +/// the `pp` door, which is the gate's context: gate axis 7 = door, door axis 6 +/// = `=bloq`, so bloq is at gate axis 30. SoftUnum covers bloq 3/4/5 +/// (posit8/16/32); for bloq 6/7 (posit64/128, not yet in SoftUnum) the jet +/// returns u3_none and the pure-Hoon arm runs. +/// +/// Marshalling uses chub reads/writes (word-size-agnostic across the 32- and +/// 64-bit runtimes). Posit bit patterns occupy the low n bits. +/// +/// MASTER COPY lives in urbit/numerics libmath/vere/noun/jets/i/unum.c; applied +/// by hand to the vere runtime. SoftUnum itself is vendored (ext/softunum). + +#include "jets/q.h" +#include "jets/w.h" +#include "noun.h" +#include "softunum.h" + +// bloq from the pp door sample: gate axis 7 = door, door axis 6 = =bloq. +#define _UNUM_BLOQ_AXIS 30 + +// Read bloq (the door sample) from a gate core; c3n if absent/not-atom. +static inline c3_t +_unum_bloq(u3_noun cor, c3_d* out) +{ + u3_noun b = u3r_at(_UNUM_BLOQ_AXIS, cor); + if ( u3_none == b || c3n == u3ud(b) ) { + return c3n; + } + *out = u3r_chub(0, b); + return c3y; +} + +// binary posit -> posit op (add/sub/mul/div), bloq-dispatched. +#define _UNUM_BINOP(nam, f8, f16, f32) \ + u3_noun u3qi_unum_##nam(c3_d bloq, u3_atom a, u3_atom b) { \ + c3_d ua = u3r_chub(0, a), ub = u3r_chub(0, b), r; \ + switch ( bloq ) { \ + case 3: r = f8((posit8_t)ua, (posit8_t)ub); break; \ + case 4: r = f16((posit16_t)ua, (posit16_t)ub); break; \ + case 5: r = f32((posit32_t)ua, (posit32_t)ub); break; \ + default: return u3_none; \ + } \ + return u3i_chubs(1, &r); \ + } \ + u3_noun u3wi_unum_##nam(u3_noun cor) { \ + u3_noun a, b; c3_d bloq; \ + if ( c3n == u3r_mean(cor, u3x_sam_2, &a, u3x_sam_3, &b, 0) || \ + c3n == u3ud(a) || c3n == u3ud(b) ) return u3m_bail(c3__exit); \ + if ( c3n == _unum_bloq(cor, &bloq) ) return u3_none; \ + return u3qi_unum_##nam(bloq, a, b); \ + } + +// binary posit -> loobean comparison; returns & (c3y) / | (c3n). +#define _UNUM_CMP(nam, f8, f16, f32) \ + u3_noun u3qi_unum_##nam(c3_d bloq, u3_atom a, u3_atom b) { \ + c3_d ua = u3r_chub(0, a), ub = u3r_chub(0, b); c3_t v; \ + switch ( bloq ) { \ + case 3: v = f8((posit8_t)ua, (posit8_t)ub); break; \ + case 4: v = f16((posit16_t)ua, (posit16_t)ub); break; \ + case 5: v = f32((posit32_t)ua, (posit32_t)ub); break; \ + default: return u3_none; \ + } \ + return v ? c3y : c3n; \ + } \ + u3_noun u3wi_unum_##nam(u3_noun cor) { \ + u3_noun a, b; c3_d bloq; \ + if ( c3n == u3r_mean(cor, u3x_sam_2, &a, u3x_sam_3, &b, 0) || \ + c3n == u3ud(a) || c3n == u3ud(b) ) return u3m_bail(c3__exit); \ + if ( c3n == _unum_bloq(cor, &bloq) ) return u3_none; \ + return u3qi_unum_##nam(bloq, a, b); \ + } + +// unary posit -> posit op (neg/abs/sgn/sqt), bloq-dispatched. +#define _UNUM_UNOP(nam, f8, f16, f32) \ + u3_noun u3qi_unum_##nam(c3_d bloq, u3_atom a) { \ + c3_d ua = u3r_chub(0, a), r; \ + switch ( bloq ) { \ + case 3: r = f8((posit8_t)ua); break; \ + case 4: r = f16((posit16_t)ua); break; \ + case 5: r = f32((posit32_t)ua); break; \ + default: return u3_none; \ + } \ + return u3i_chubs(1, &r); \ + } \ + u3_noun u3wi_unum_##nam(u3_noun cor) { \ + u3_noun a = u3r_at(u3x_sam, cor); c3_d bloq; \ + if ( u3_none == a || c3n == u3ud(a) ) return u3m_bail(c3__exit); \ + if ( c3n == _unum_bloq(cor, &bloq) ) return u3_none; \ + return u3qi_unum_##nam(bloq, a); \ + } + +_UNUM_BINOP(add, p8_add, p16_add, p32_add) +_UNUM_BINOP(sub, p8_sub, p16_sub, p32_sub) +_UNUM_BINOP(mul, p8_mul, p16_mul, p32_mul) +_UNUM_BINOP(div, p8_div, p16_div, p32_div) + +_UNUM_CMP(lth, p8_lt, p16_lt, p32_lt) +_UNUM_CMP(lte, p8_le, p16_le, p32_le) +_UNUM_CMP(gth, p8_gt, p16_gt, p32_gt) +_UNUM_CMP(gte, p8_ge, p16_ge, p32_ge) +_UNUM_CMP(equ, p8_eq, p16_eq, p32_eq) +_UNUM_CMP(neq, !p8_eq, !p16_eq, !p32_eq) + +_UNUM_UNOP(neg, p8_neg, p16_neg, p32_neg) +_UNUM_UNOP(abs, p8_abs, p16_abs, p32_abs) +_UNUM_UNOP(sgn, p8_sgn, p16_sgn, p32_sgn) +_UNUM_UNOP(sqt, p8_sqrt, p16_sqrt, p32_sqrt) + +/* ++fma:pp -- fused multiply-add (a*b + c), single rounding. Ternary gate: +** sample [a b c] = [a [b c]]: a @ sam_2, b @ sam_6, c @ sam_7. +*/ + u3_noun + u3qi_unum_fma(c3_d bloq, u3_atom a, u3_atom b, u3_atom c) + { + c3_d ua = u3r_chub(0, a), ub = u3r_chub(0, b), uc = u3r_chub(0, c), r; + switch ( bloq ) { + case 3: r = p8_fma((posit8_t)ua, (posit8_t)ub, (posit8_t)uc); break; + case 4: r = p16_fma((posit16_t)ua, (posit16_t)ub, (posit16_t)uc); break; + case 5: r = p32_fma((posit32_t)ua, (posit32_t)ub, (posit32_t)uc); break; + default: return u3_none; + } + return u3i_chubs(1, &r); + } + + u3_noun + u3wi_unum_fma(u3_noun cor) + { + u3_noun a, b, c; c3_d bloq; + if ( c3n == u3r_mean(cor, u3x_sam_2, &a, u3x_sam_6, &b, u3x_sam_7, &c, 0) || + c3n == u3ud(a) || c3n == u3ud(b) || c3n == u3ud(c) ) { + return u3m_bail(c3__exit); + } + if ( c3n == _unum_bloq(cor, &bloq) ) return u3_none; + return u3qi_unum_fma(bloq, a, b, c); + } + +// Elementary / transcendental functions. Unary ones (incl. log-2/log-10, +// which map to SoftUnum's base-2/base-10 log fns) reuse the unary template; +// pow is binary; pow-n is posit ^ @u (the exponent is a raw integer, not a +// posit). Constants (pi/e/...) are left to pure Hoon, like /lib/math. +_UNUM_UNOP(exp, p8_exp, p16_exp, p32_exp) +_UNUM_UNOP(sin, p8_sin, p16_sin, p32_sin) +_UNUM_UNOP(cos, p8_cos, p16_cos, p32_cos) +_UNUM_UNOP(tan, p8_tan, p16_tan, p32_tan) +_UNUM_UNOP(log, p8_log, p16_log, p32_log) +_UNUM_UNOP(log2, p8_log2, p16_log2, p32_log2) +_UNUM_UNOP(log10, p8_log10, p16_log10, p32_log10) +_UNUM_UNOP(cbrt, p8_cbrt, p16_cbrt, p32_cbrt) +_UNUM_UNOP(atan, p8_atan, p16_atan, p32_atan) +_UNUM_UNOP(asin, p8_asin, p16_asin, p32_asin) +_UNUM_UNOP(acos, p8_acos, p16_acos, p32_acos) +_UNUM_UNOP(factorial, p8_factorial, p16_factorial, p32_factorial) +_UNUM_BINOP(pow, p8_pow, p16_pow, p32_pow) + +/* ++pow-n:pp -- posit ^ @u (integer power); the exponent is a raw unsigned +** integer, not a posit, so it is read as a plain chub. +*/ + u3_noun + u3qi_unum_pow_n(c3_d bloq, u3_atom x, u3_atom p) + { + c3_d ux = u3r_chub(0, x), up = u3r_chub(0, p), r; + switch ( bloq ) { + case 3: r = p8_pow_n((posit8_t)ux, up); break; + case 4: r = p16_pow_n((posit16_t)ux, up); break; + case 5: r = p32_pow_n((posit32_t)ux, up); break; + default: return u3_none; + } + return u3i_chubs(1, &r); + } + + u3_noun + u3wi_unum_pow_n(u3_noun cor) + { + u3_noun x, p; c3_d bloq; + if ( c3n == u3r_mean(cor, u3x_sam_2, &x, u3x_sam_3, &p, 0) || + c3n == u3ud(x) || c3n == u3ud(p) ) return u3m_bail(c3__exit); + if ( c3n == _unum_bloq(cor, &bloq) ) return u3_none; + return u3qi_unum_pow_n(bloq, x, p); + } + +// Rounding to integral value. The Hoon rnd/flr/cel are eta-expanded into +// unary gates wrapping +round, so they jet via the unary template. +_UNUM_UNOP(rnd, p8_nearest_int, p16_nearest_int, p32_nearest_int) +_UNUM_UNOP(flr, p8_floor, p16_floor, p32_floor) +_UNUM_UNOP(cel, p8_ceil, p16_ceil, p32_ceil) + +/* ++sun:pp -- @u -> posit. The argument is a raw unsigned integer (read as a +** full chub), NOT a posit pattern, so it is not masked to the posit width. +*/ + u3_noun + u3qi_unum_sun(c3_d bloq, u3_atom v) + { + c3_d uv = u3r_chub(0, v), r; + switch ( bloq ) { + case 3: r = p8_from_u64(uv); break; + case 4: r = p16_from_u64(uv); break; + case 5: r = p32_from_u64(uv); break; + default: return u3_none; + } + return u3i_chubs(1, &r); + } + u3_noun + u3wi_unum_sun(u3_noun cor) + { + u3_noun v = u3r_at(u3x_sam, cor); c3_d bloq; + if ( u3_none == v || c3n == u3ud(v) ) return u3m_bail(c3__exit); + if ( c3n == _unum_bloq(cor, &bloq) ) return u3_none; + return u3qi_unum_sun(bloq, v); + } + +/* ++san:pp -- @s -> posit. Decode the Hoon signed atom (even 2m -> +m, +** odd 2m-1 -> -m) to a C int64, then encode. +*/ + u3_noun + u3qi_unum_san(c3_d bloq, u3_atom v) + { + c3_d uv = u3r_chub(0, v); + c3_ds sv = (uv & 1) ? -(c3_ds)((uv + 1) >> 1) : (c3_ds)(uv >> 1); + c3_d r; + switch ( bloq ) { + case 3: r = p8_from_i64(sv); break; + case 4: r = p16_from_i64(sv); break; + case 5: r = p32_from_i64(sv); break; + default: return u3_none; + } + return u3i_chubs(1, &r); + } + u3_noun + u3wi_unum_san(u3_noun cor) + { + u3_noun v = u3r_at(u3x_sam, cor); c3_d bloq; + if ( u3_none == v || c3n == u3ud(v) ) return u3m_bail(c3__exit); + if ( c3n == _unum_bloq(cor, &bloq) ) return u3_none; + return u3qi_unum_san(bloq, v); + } + +/* ++toi:pp -- posit -> (unit @s). NaR -> ~ (none); else [~ @s] with the +** integer re-encoded into a Hoon signed atom (+m -> 2m, -m -> 2m-1). +*/ + u3_noun + u3qi_unum_toi(c3_d bloq, u3_atom p) + { + c3_d up = u3r_chub(0, p); + c3_ds out; c3_t ok; + switch ( bloq ) { + case 3: ok = p8_to_i64((posit8_t)up, (int64_t*)&out); break; + case 4: ok = p16_to_i64((posit16_t)up, (int64_t*)&out); break; + case 5: ok = p32_to_i64((posit32_t)up, (int64_t*)&out); break; + default: return u3_none; + } + if ( !ok ) return u3_nul; + c3_d sa = (out >= 0) ? ((c3_d)out << 1) : (((c3_d)(-out) << 1) - 1); + return u3nc(u3_nul, u3i_chubs(1, &sa)); + } + u3_noun + u3wi_unum_toi(u3_noun cor) + { + u3_noun p = u3r_at(u3x_sam, cor); c3_d bloq; + if ( u3_none == p || c3n == u3ud(p) ) return u3m_bail(c3__exit); + if ( c3n == _unum_bloq(cor, &bloq) ) return u3_none; + return u3qi_unum_toi(bloq, p); + } + +/* ++is-close:pp -- |a - b| <= tol, a loobean. Ternary sample [a b tol]. +*/ + u3_noun + u3qi_unum_is_close(c3_d bloq, u3_atom a, u3_atom b, u3_atom tol) + { + c3_d ua = u3r_chub(0, a), ub = u3r_chub(0, b), ut = u3r_chub(0, tol); c3_t v; + switch ( bloq ) { + case 3: v = p8_is_close((posit8_t)ua, (posit8_t)ub, (posit8_t)ut); break; + case 4: v = p16_is_close((posit16_t)ua, (posit16_t)ub, (posit16_t)ut); break; + case 5: v = p32_is_close((posit32_t)ua, (posit32_t)ub, (posit32_t)ut); break; + default: return u3_none; + } + return v ? c3y : c3n; + } + u3_noun + u3wi_unum_is_close(u3_noun cor) + { + u3_noun a, b, tol; c3_d bloq; + if ( c3n == u3r_mean(cor, u3x_sam_2, &a, u3x_sam_6, &b, u3x_sam_7, &tol, 0) || + c3n == u3ud(a) || c3n == u3ud(b) || c3n == u3ud(tol) ) { + return u3m_bail(c3__exit); + } + if ( c3n == _unum_bloq(cor, &bloq) ) return u3_none; + return u3qi_unum_is_close(bloq, a, b, tol); + } + +// IEEE-754 conversion (value-based). The posit width is bloq; the float width +// is fixed by the arm. to-rh/rs/rd and from-rh/rs/rd are 1-chub each side; +// to-rq / from-rq use the 128-bit (2-chub) binary128 pattern. +#define _UNUM_TO(nam, f8, f16, f32) \ + u3_noun u3qi_unum_##nam(c3_d bloq, u3_atom p) { \ + c3_d up = u3r_chub(0, p), r; \ + switch ( bloq ) { \ + case 3: r = f8((posit8_t)up); break; \ + case 4: r = f16((posit16_t)up); break; \ + case 5: r = f32((posit32_t)up); break; \ + default: return u3_none; \ + } \ + return u3i_chubs(1, &r); \ + } \ + u3_noun u3wi_unum_##nam(u3_noun cor) { \ + u3_noun p = u3r_at(u3x_sam, cor); c3_d bloq; \ + if ( u3_none == p || c3n == u3ud(p) ) return u3m_bail(c3__exit); \ + if ( c3n == _unum_bloq(cor, &bloq) ) return u3_none; \ + return u3qi_unum_##nam(bloq, p); \ + } + +#define _UNUM_FROM(nam, f8, f16, f32) \ + u3_noun u3qi_unum_##nam(c3_d bloq, u3_atom r) { \ + c3_d ur = u3r_chub(0, r), v; \ + switch ( bloq ) { \ + case 3: v = f8(ur); break; \ + case 4: v = f16(ur); break; \ + case 5: v = f32(ur); break; \ + default: return u3_none; \ + } \ + return u3i_chubs(1, &v); \ + } \ + u3_noun u3wi_unum_##nam(u3_noun cor) { \ + u3_noun r = u3r_at(u3x_sam, cor); c3_d bloq; \ + if ( u3_none == r || c3n == u3ud(r) ) return u3m_bail(c3__exit); \ + if ( c3n == _unum_bloq(cor, &bloq) ) return u3_none; \ + return u3qi_unum_##nam(bloq, r); \ + } + +_UNUM_TO(to_rh, p8_to_rh, p16_to_rh, p32_to_rh) +_UNUM_TO(to_rs, p8_to_rs, p16_to_rs, p32_to_rs) +_UNUM_TO(to_rd, p8_to_rd, p16_to_rd, p32_to_rd) +_UNUM_FROM(from_rh, p8_from_rh, p16_from_rh, p32_from_rh) +_UNUM_FROM(from_rs, p8_from_rs, p16_from_rs, p32_from_rs) +_UNUM_FROM(from_rd, p8_from_rd, p16_from_rd, p32_from_rd) + +/* ++to-rq:pp -- posit -> binary128 (2-chub result). +*/ + u3_noun + u3qi_unum_to_rq(c3_d bloq, u3_atom p) + { + c3_d up = u3r_chub(0, p), out[2]; + switch ( bloq ) { + case 3: p8_to_rq((posit8_t)up, out); break; + case 4: p16_to_rq((posit16_t)up, out); break; + case 5: p32_to_rq((posit32_t)up, out); break; + default: return u3_none; + } + return u3i_chubs(2, out); + } + u3_noun + u3wi_unum_to_rq(u3_noun cor) + { + u3_noun p = u3r_at(u3x_sam, cor); c3_d bloq; + if ( u3_none == p || c3n == u3ud(p) ) return u3m_bail(c3__exit); + if ( c3n == _unum_bloq(cor, &bloq) ) return u3_none; + return u3qi_unum_to_rq(bloq, p); + } + +/* ++from-rq:pp -- binary128 (2-chub input) -> posit. +*/ + u3_noun + u3qi_unum_from_rq(c3_d bloq, u3_atom r) + { + c3_d in[2], v; + in[0] = u3r_chub(0, r); + in[1] = u3r_chub(1, r); + switch ( bloq ) { + case 3: v = p8_from_rq(in); break; + case 4: v = p16_from_rq(in); break; + case 5: v = p32_from_rq(in); break; + default: return u3_none; + } + return u3i_chubs(1, &v); + } + u3_noun + u3wi_unum_from_rq(u3_noun cor) + { + u3_noun r = u3r_at(u3x_sam, cor); c3_d bloq; + if ( u3_none == r || c3n == u3ud(r) ) return u3m_bail(c3__exit); + if ( c3n == _unum_bloq(cor, &bloq) ) return u3_none; + return u3qi_unum_from_rq(bloq, r); + } + +// Quire: a 16n-bit exact accumulator, marshalled to/from SoftUnum's uint64 +// word array (QW = n/4 words: 2/4/8 for posit8/16/32). The accumulate ops +// mutate the buffer in place; we read QW chubs in and write QW chubs out. +#define _UNUM_QW(bloq) (1 << ((bloq) - 2)) + + static void + _unum_qload(u3_atom q, c3_d *buf, int qw) + { + for ( int i = 0; i < qw; i++ ) buf[i] = u3r_chub(i, q); + } + + u3_noun + u3qi_unum_p_to_q(c3_d bloq, u3_atom p) + { + c3_d buf[8] = {0}, up = u3r_chub(0, p); int qw = _UNUM_QW(bloq); + switch ( bloq ) { + case 3: p8_p_to_q((posit8_t)up, buf); break; + case 4: p16_p_to_q((posit16_t)up, buf); break; + case 5: p32_p_to_q((posit32_t)up, buf); break; + default: return u3_none; + } + return u3i_chubs(qw, buf); + } + u3_noun + u3wi_unum_p_to_q(u3_noun cor) + { + u3_noun p = u3r_at(u3x_sam, cor); c3_d bloq; + if ( u3_none == p || c3n == u3ud(p) ) return u3m_bail(c3__exit); + if ( c3n == _unum_bloq(cor, &bloq) ) return u3_none; + return u3qi_unum_p_to_q(bloq, p); + } + + u3_noun + u3qi_unum_q_to_p(c3_d bloq, u3_atom q) + { + c3_d buf[8] = {0}, r; + if ( bloq < 3 || bloq > 5 ) return u3_none; + _unum_qload(q, buf, _UNUM_QW(bloq)); + switch ( bloq ) { + case 3: r = p8_q_to_p(buf); break; + case 4: r = p16_q_to_p(buf); break; + default: r = p32_q_to_p(buf); break; + } + return u3i_chubs(1, &r); + } + u3_noun + u3wi_unum_q_to_p(u3_noun cor) + { + u3_noun q = u3r_at(u3x_sam, cor); c3_d bloq; + if ( u3_none == q || c3n == u3ud(q) ) return u3m_bail(c3__exit); + if ( c3n == _unum_bloq(cor, &bloq) ) return u3_none; + return u3qi_unum_q_to_p(bloq, q); + } + + u3_noun + u3qi_unum_q_negate(c3_d bloq, u3_atom q) + { + c3_d buf[8] = {0}; int qw = _UNUM_QW(bloq); + if ( bloq < 3 || bloq > 5 ) return u3_none; + _unum_qload(q, buf, qw); + switch ( bloq ) { + case 3: p8_q_negate(buf); break; + case 4: p16_q_negate(buf); break; + default: p32_q_negate(buf); break; + } + return u3i_chubs(qw, buf); + } + u3_noun + u3wi_unum_q_negate(u3_noun cor) + { + u3_noun q = u3r_at(u3x_sam, cor); c3_d bloq; + if ( u3_none == q || c3n == u3ud(q) ) return u3m_bail(c3__exit); + if ( c3n == _unum_bloq(cor, &bloq) ) return u3_none; + return u3qi_unum_q_negate(bloq, q); + } + +// q-mul-add / q-mul-sub: (quire, posit, posit) -> quire. +#define _UNUM_QMA(nam, f8, f16, f32) \ + u3_noun u3qi_unum_##nam(c3_d bloq, u3_atom q, u3_atom a, u3_atom b) { \ + c3_d buf[8] = {0}, ua = u3r_chub(0, a), ub = u3r_chub(0, b); \ + int qw = _UNUM_QW(bloq); \ + _unum_qload(q, buf, qw); \ + switch ( bloq ) { \ + case 3: f8(buf, (posit8_t)ua, (posit8_t)ub); break; \ + case 4: f16(buf, (posit16_t)ua, (posit16_t)ub); break; \ + case 5: f32(buf, (posit32_t)ua, (posit32_t)ub); break; \ + default: return u3_none; \ + } \ + return u3i_chubs(qw, buf); \ + } \ + u3_noun u3wi_unum_##nam(u3_noun cor) { \ + u3_noun q, a, b; c3_d bloq; \ + if ( c3n == u3r_mean(cor, u3x_sam_2, &q, u3x_sam_6, &a, u3x_sam_7, &b, 0) || \ + c3n == u3ud(q) || c3n == u3ud(a) || c3n == u3ud(b) ) \ + return u3m_bail(c3__exit); \ + if ( c3n == _unum_bloq(cor, &bloq) ) return u3_none; \ + return u3qi_unum_##nam(bloq, q, a, b); \ + } + +_UNUM_QMA(q_mul_add, p8_q_mul_add, p16_q_mul_add, p32_q_mul_add) +_UNUM_QMA(q_mul_sub, p8_q_mul_sub, p16_q_mul_sub, p32_q_mul_sub) + +// q-add-p / q-sub-p: (quire, posit) -> quire. +#define _UNUM_QAP(nam, f8, f16, f32) \ + u3_noun u3qi_unum_##nam(c3_d bloq, u3_atom q, u3_atom p) { \ + c3_d buf[8] = {0}, up = u3r_chub(0, p); int qw = _UNUM_QW(bloq); \ + _unum_qload(q, buf, qw); \ + switch ( bloq ) { \ + case 3: f8(buf, (posit8_t)up); break; \ + case 4: f16(buf, (posit16_t)up); break; \ + case 5: f32(buf, (posit32_t)up); break; \ + default: return u3_none; \ + } \ + return u3i_chubs(qw, buf); \ + } \ + u3_noun u3wi_unum_##nam(u3_noun cor) { \ + u3_noun q, p; c3_d bloq; \ + if ( c3n == u3r_mean(cor, u3x_sam_2, &q, u3x_sam_3, &p, 0) || \ + c3n == u3ud(q) || c3n == u3ud(p) ) return u3m_bail(c3__exit); \ + if ( c3n == _unum_bloq(cor, &bloq) ) return u3_none; \ + return u3qi_unum_##nam(bloq, q, p); \ + } + +_UNUM_QAP(q_add_p, p8_q_add_p, p16_q_add_p, p32_q_add_p) +_UNUM_QAP(q_sub_p, p8_q_sub_p, p16_q_sub_p, p32_q_sub_p) + +// q-add-q / q-sub-q: (quire, quire) -> quire. +#define _UNUM_QAQ(nam, f8, f16, f32) \ + u3_noun u3qi_unum_##nam(c3_d bloq, u3_atom x, u3_atom y) { \ + c3_d xb[8] = {0}, yb[8] = {0}; int qw = _UNUM_QW(bloq); \ + if ( bloq < 3 || bloq > 5 ) return u3_none; \ + _unum_qload(x, xb, qw); _unum_qload(y, yb, qw); \ + switch ( bloq ) { \ + case 3: f8(xb, yb); break; \ + case 4: f16(xb, yb); break; \ + default: f32(xb, yb); break; \ + } \ + return u3i_chubs(qw, xb); \ + } \ + u3_noun u3wi_unum_##nam(u3_noun cor) { \ + u3_noun x, y; c3_d bloq; \ + if ( c3n == u3r_mean(cor, u3x_sam_2, &x, u3x_sam_3, &y, 0) || \ + c3n == u3ud(x) || c3n == u3ud(y) ) return u3m_bail(c3__exit); \ + if ( c3n == _unum_bloq(cor, &bloq) ) return u3_none; \ + return u3qi_unum_##nam(bloq, x, y); \ + } + +_UNUM_QAQ(q_add_q, p8_q_add_q, p16_q_add_q, p32_q_add_q) +_UNUM_QAQ(q_sub_q, p8_q_sub_q, p16_q_sub_q, p32_q_sub_q) + +/* ++fdp:pp -- fused dot product of two posit lists (single rounding). We +** accumulate directly through a quire buffer (q-mul-add per element, q-to-p +** once), zipping to the shorter list -- matching the Hoon. +*/ + u3_noun + u3qi_unum_fdp(c3_d bloq, u3_noun av, u3_noun bv) + { + c3_d buf[8] = {0}, r; + if ( bloq < 3 || bloq > 5 ) return u3_none; + u3_noun ta = av, tb = bv; + while ( (c3y == u3du(ta)) && (c3y == u3du(tb)) ) { + c3_d a = u3r_chub(0, u3h(ta)), b = u3r_chub(0, u3h(tb)); + switch ( bloq ) { + case 3: p8_q_mul_add(buf, (posit8_t)a, (posit8_t)b); break; + case 4: p16_q_mul_add(buf, (posit16_t)a, (posit16_t)b); break; + default: p32_q_mul_add(buf, (posit32_t)a, (posit32_t)b); break; + } + ta = u3t(ta); tb = u3t(tb); + } + switch ( bloq ) { + case 3: r = p8_q_to_p(buf); break; + case 4: r = p16_q_to_p(buf); break; + default: r = p32_q_to_p(buf); break; + } + return u3i_chubs(1, &r); + } + u3_noun + u3wi_unum_fdp(u3_noun cor) + { + u3_noun av, bv; c3_d bloq; + if ( c3n == u3r_mean(cor, u3x_sam_2, &av, u3x_sam_3, &bv, 0) ) { + return u3m_bail(c3__exit); + } + if ( c3n == _unum_bloq(cor, &bloq) ) return u3_none; + return u3qi_unum_fdp(bloq, av, bv); + }