Skip to content

Commit 62b7543

Browse files
authored
Merge pull request #33 from cobaltgit/display-neon
Optimise display() to use ARM NEON for blitting, benefit most noticeable when CPU is at very low clocks (much less CPU cycles taken)
2 parents 274c0fb + 5f25c60 commit 62b7543

4 files changed

Lines changed: 210 additions & 66 deletions

File tree

src/common/fb.nim

Lines changed: 3 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -20,22 +20,10 @@ const
2020
arr[i] = uint8((i shl 2) or (i shr 4))
2121
arr
2222

23-
EightToFive*: array[256, uint8] = block:
24-
var arr: array[256, uint8]
25-
for i in 0..255:
26-
arr[i] = uint8(i shr 3)
27-
arr
28-
29-
EightToSix*: array[256, uint8] = block:
30-
var arr: array[256, uint8]
31-
for i in 0..255:
32-
arr[i] = uint8(i shr 2)
33-
arr
34-
35-
FbXBase*: array[FbHeight, int] = block:
36-
var arr: array[FbHeight, int]
23+
FbXBase*: array[FbHeight, uint32] = block:
24+
var arr: array[FbHeight, uint32]
3725
for x in 0..<FbHeight:
38-
arr[x] = (FbHeight - 1 - x) * FbWidth
26+
arr[x] = uint32((FbHeight - 1 - x) * FbWidth)
3927
arr
4028

4129
{.push optimization:speed, checks:off, warnings:off.}

src/common/ffi/neon_blit.c

Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
// NEON blitter for display
2+
// Generated by Claude Sonnet 4.6
3+
4+
#include <arm_neon.h>
5+
#include <stdint.h>
6+
7+
/* -------------------------------------------------------------------------
8+
* Cortex-A7 notes:
9+
*
10+
* - 64-bit NEON unit: q-register (128-bit) ops cost 2 cycles; d-register
11+
* (64-bit) ops cost 1 cycle. The transpose uses d-registers throughout.
12+
* rgba_to_rgb565 must use q-registers for widened intermediates (8 lanes
13+
* of uint16 is 128 bits — there is no narrower option), so no savings
14+
* there, but the 16-pixel unroll saturates each 64-byte cache line.
15+
*
16+
* - In-order pipeline: cache misses stall directly. __builtin_prefetch
17+
* issues a PLD instruction to run ahead of the miss.
18+
*
19+
* - L1 cache line: 64 bytes = 16 RGBA pixels = 32 RGB565 pixels.
20+
* ------------------------------------------------------------------------- */
21+
22+
/* -------------------------------------------------------------------------
23+
* Phase 1: RGBA -> RGB565
24+
* 16 pixels per iteration = one full 64-byte cache line of RGBA input.
25+
* Intermediate values are necessarily q-registers (8×uint16 = 128 bits).
26+
* ------------------------------------------------------------------------- */
27+
void rgba_to_rgb565(const uint8_t* src, uint16_t* dst, int n)
28+
{
29+
/* Prefetch 4 cache lines ahead. On A7 the hardware prefetcher handles
30+
* sequential access well, but an explicit prefetch avoids any startup
31+
* latency at the beginning of the buffer. */
32+
for (; n >= 16; n -= 16, src += 64, dst += 16) {
33+
__builtin_prefetch(src + 256, 0, 1);
34+
35+
/* vld4_u8 deinterleaves RGBA into four d-registers (8 bytes each),
36+
* giving us channels for 8 pixels per call at no extra cost. */
37+
uint8x8x4_t px0 = vld4_u8(src);
38+
uint8x8x4_t px1 = vld4_u8(src + 32);
39+
40+
/* vmovl_u8: d-reg -> q-reg widen (unavoidable for 16-bit output).
41+
* vshlq_n_u16 and vorrq_u16 are q-reg ops (2 cycles each on A7),
42+
* but there is no d-register equivalent for this pipeline. */
43+
uint16x8_t r0 = vshlq_n_u16(vmovl_u8(vshr_n_u8(px0.val[0], 3)), 11);
44+
uint16x8_t g0 = vshlq_n_u16(vmovl_u8(vshr_n_u8(px0.val[1], 2)), 5);
45+
uint16x8_t b0 = vmovl_u8(vshr_n_u8(px0.val[2], 3));
46+
vst1q_u16(dst, vorrq_u16(vorrq_u16(r0, g0), b0));
47+
48+
uint16x8_t r1 = vshlq_n_u16(vmovl_u8(vshr_n_u8(px1.val[0], 3)), 11);
49+
uint16x8_t g1 = vshlq_n_u16(vmovl_u8(vshr_n_u8(px1.val[1], 2)), 5);
50+
uint16x8_t b1 = vmovl_u8(vshr_n_u8(px1.val[2], 3));
51+
vst1q_u16(dst + 8, vorrq_u16(vorrq_u16(r1, g1), b1));
52+
}
53+
54+
/* 8-pixel tail */
55+
for (; n >= 8; n -= 8, src += 32, dst += 8) {
56+
uint8x8x4_t px = vld4_u8(src);
57+
uint16x8_t r = vshlq_n_u16(vmovl_u8(vshr_n_u8(px.val[0], 3)), 11);
58+
uint16x8_t g = vshlq_n_u16(vmovl_u8(vshr_n_u8(px.val[1], 2)), 5);
59+
uint16x8_t b = vmovl_u8(vshr_n_u8(px.val[2], 3));
60+
vst1q_u16(dst, vorrq_u16(vorrq_u16(r, g), b));
61+
}
62+
63+
/* Scalar tail */
64+
for (; n > 0; n--, src += 4, dst++)
65+
*dst = ((uint16_t)(src[0] >> 3) << 11)
66+
| ((uint16_t)(src[1] >> 2) << 5)
67+
| (uint16_t)(src[2] >> 3);
68+
}
69+
70+
/* -------------------------------------------------------------------------
71+
* Phase 2: 4×4 tiled transpose using d-registers only.
72+
*
73+
* vtrn_u16 and vtrn_u32 are d-register ops (1 cycle on A7).
74+
* vst1_u16 is a d-register store (1 cycle on A7).
75+
* The 8×8 version used vtrnq/vst1q (2 cycles each) — this is cheaper.
76+
*
77+
* src_width and src_height must be multiples of 4.
78+
* 240 and 320 are both divisible by 4. ✓
79+
* ------------------------------------------------------------------------- */
80+
static inline void transpose_4x4(
81+
const uint16_t* src, int src_stride,
82+
uint16_t* dst, int dst_stride)
83+
{
84+
uint16x4_t r0 = vld1_u16(src + 0 * src_stride);
85+
uint16x4_t r1 = vld1_u16(src + 1 * src_stride);
86+
uint16x4_t r2 = vld1_u16(src + 2 * src_stride);
87+
uint16x4_t r3 = vld1_u16(src + 3 * src_stride);
88+
89+
/* Round 1: interleave adjacent u16 pairs (d-register, 1 cycle) */
90+
uint16x4x2_t q01 = vtrn_u16(r0, r1);
91+
uint16x4x2_t q23 = vtrn_u16(r2, r3);
92+
93+
/* Round 2: interleave u32 pairs to complete the transpose */
94+
uint32x2x2_t q0123e = vtrn_u32(vreinterpret_u32_u16(q01.val[0]),
95+
vreinterpret_u32_u16(q23.val[0]));
96+
uint32x2x2_t q0123o = vtrn_u32(vreinterpret_u32_u16(q01.val[1]),
97+
vreinterpret_u32_u16(q23.val[1]));
98+
99+
vst1_u16(dst + 0 * dst_stride, vreinterpret_u16_u32(q0123e.val[0]));
100+
vst1_u16(dst + 1 * dst_stride, vreinterpret_u16_u32(q0123o.val[0]));
101+
vst1_u16(dst + 2 * dst_stride, vreinterpret_u16_u32(q0123e.val[1]));
102+
vst1_u16(dst + 3 * dst_stride, vreinterpret_u16_u32(q0123o.val[1]));
103+
}
104+
105+
/*
106+
* blit_transposed — rotate a landscape RGB565 buffer 90° CW into a portrait
107+
* framebuffer using tiled 4×4 d-register NEON transposes.
108+
*
109+
* Pixel mapping:
110+
* src(lx, ly) → dst[(src_width - 1 - lx) * src_height + ly]
111+
*
112+
* For 320×240 source → 240×320 framebuffer:
113+
* dst[(319 - lx) * 240 + ly]
114+
*/
115+
void blit_transposed(const uint16_t* src, uint16_t* dst,
116+
int src_width, int src_height)
117+
{
118+
/* Source reads stride across rows by src_width elements (640 bytes),
119+
* which the A7's hardware prefetcher won't detect. We prefetch one
120+
* full tile-row ahead (4 rows × src_width elements). */
121+
const int prefetch_dist = 4 * src_width;
122+
123+
for (int ty = 0; ty < src_height; ty += 4) {
124+
const uint16_t* row = src + ty * src_width;
125+
126+
for (int tx = 0; tx < src_width; tx += 4) {
127+
const uint16_t* s = row + tx;
128+
129+
/* Prefetch the 4 source rows of the next tile-row.
130+
* Each covers 8 bytes (4×uint16), well within one cache line,
131+
* so one __builtin_prefetch per row is sufficient. */
132+
__builtin_prefetch(s + 0 * src_width + prefetch_dist, 0, 1);
133+
__builtin_prefetch(s + 1 * src_width + prefetch_dist, 0, 1);
134+
__builtin_prefetch(s + 2 * src_width + prefetch_dist, 0, 1);
135+
__builtin_prefetch(s + 3 * src_width + prefetch_dist, 0, 1);
136+
137+
uint16_t* d = dst + (src_width - 1 - tx) * src_height + ty;
138+
transpose_4x4(s, src_width, d, -src_height);
139+
}
140+
}
141+
}

src/common/ffi/neon_blit.nim

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
{.compile: "neon_blit.c".}
2+
{.passC: "-mfpu=neon-vfpv4 -mfloat-abi=hard -O3".}
3+
4+
proc rgba_to_rgb565*(src: ptr UncheckedArray[uint8],
5+
dst: ptr UncheckedArray[uint16],
6+
n: cint) {.importc, noconv.}
7+
8+
proc blit_transposed*(src: ptr UncheckedArray[uint16],
9+
dst: ptr UncheckedArray[uint16],
10+
width, height: cint) {.importc, noconv.}

src/display.nim

Lines changed: 56 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -2,30 +2,39 @@ import std/[os, posix, strutils]
22
import nimPNG
33

44
import common/[fb, process]
5-
import common/ffi/stb_truetype
5+
import common/ffi/[stb_truetype, neon_blit]
66

77
const
8-
ScreenWidth = 320
8+
ScreenWidth = 320
99
ScreenHeight = 240
1010

1111
DefaultBackground* = "/mnt/SDCARD/System/res/quarkbg.png"
12-
DefaultFont* = "/mnt/SDCARD/System/res/TwCenMT.ttf"
13-
FontSize = 24.0
12+
DefaultFont* = "/mnt/SDCARD/System/res/TwCenMT.ttf"
13+
FontSize = 24.0
1414

15-
{.push optimization:speed, warnings:off.}
15+
{.push optimization: speed, warnings: off.}
1616

1717
var childPid: Pid = -1
1818

19+
var blitTmp: array[FbPixels, uint16]
20+
1921
proc toRGB565(r, g, b: uint8): uint16 {.inline.} =
20-
(uint16(EightToFive[r]) shl 11) or
21-
(uint16(EightToSix[g]) shl 5) or
22-
uint16(EightToFive[b])
22+
(uint16(r shr 3) shl 11) or
23+
(uint16(g shr 2) shl 5) or
24+
uint16(b shr 3)
2325

2426
proc fromRGB565(pixel: uint16, r, g, b: var uint8) {.inline.} =
2527
r = FiveToEight[(pixel shr 11) and 0x1F]
26-
g = SixToEight[(pixel shr 5) and 0x3F]
28+
g = SixToEight[(pixel shr 5) and 0x3F]
2729
b = FiveToEight[pixel and 0x1F]
2830

31+
proc blendRGB565(fg, bg: uint16, alpha: uint32): uint16 {.inline.} =
32+
let inv = 256'u32 - alpha
33+
let r = (uint32(fg shr 11) * alpha + uint32(bg shr 11) * inv) shr 8
34+
let g = ((uint32(fg) and 0x07E0'u32) * alpha + (uint32(bg) and 0x07E0'u32) * inv) shr 8
35+
let b = (uint32(fg and 0x1F'u16) * alpha + uint32(bg and 0x1F'u16) * inv) shr 8
36+
uint16((r shl 11) or (g and 0x07E0'u32) or b)
37+
2938
proc loadFont(path: string): seq[byte] =
3039
let f = open(path, fmRead)
3140
defer: f.close()
@@ -37,21 +46,21 @@ proc measureText(font: ptr stbtt_fontinfo, text: string, scale: cfloat): int =
3746
var x: cint = 0
3847
for i, ch in text:
3948
var advanceWidth, leftSideBearing: cint
40-
stbtt_GetCodepointHMetrics(font, cint(ch), addr advanceWidth, addr leftSideBearing)
49+
stbtt_GetCodepointHMetrics(font, cint(ch), addr advanceWidth,
50+
addr leftSideBearing)
4151
x += advanceWidth
4252
if i < text.len - 1:
43-
let kern = stbtt_GetCodepointKernAdvance(font, cint(ch), cint(text[i+1]))
44-
x += kern
53+
x += stbtt_GetCodepointKernAdvance(font, cint(ch), cint(text[i + 1]))
4554
result = int(cfloat(x) * scale)
4655

47-
proc wrapText(text: string, font: ptr stbtt_fontinfo, scale: cfloat, maxWidth: int): seq[string] =
56+
proc wrapText(text: string, font: ptr stbtt_fontinfo, scale: cfloat,
57+
maxWidth: int): seq[string] =
4858
result = @[]
4959
var currentLine = ""
5060
var currentWidth = 0
5161

5262
for word in text.split(' '):
5363
let wordWidth = measureText(font, word & " ", scale)
54-
5564
if currentWidth + wordWidth > maxWidth and currentLine.len > 0:
5665
result.add(currentLine.strip())
5766
currentLine = word & " "
@@ -63,19 +72,17 @@ proc wrapText(text: string, font: ptr stbtt_fontinfo, scale: cfloat, maxWidth: i
6372
if currentLine.len > 0:
6473
result.add(currentLine.strip())
6574

66-
proc renderTextLine(fb: ptr UncheckedArray[uint16], font: ptr stbtt_fontinfo,
67-
text: string, y: int, pixelHeight: float, color: uint16) =
75+
proc renderTextLine(fb: ptr UncheckedArray[uint16],
76+
font: ptr stbtt_fontinfo,
77+
text: string, y: int,
78+
pixelHeight: float, color: uint16) =
6879
let scale = stbtt_ScaleForPixelHeight(font, cfloat(pixelHeight))
69-
7080
var ascent, descent, lineGap: cint
7181
stbtt_GetFontVMetrics(font, addr ascent, addr descent, addr lineGap)
7282

7383
let textWidth = measureText(font, text, scale)
7484
var x = (ScreenWidth - textWidth) div 2
75-
let baseline = y + int(cfloat(ascent) * scale)
76-
77-
var cr, cg, cb: uint8
78-
fromRGB565(color, cr, cg, cb)
85+
let baseline = y + int(cfloat(ascent) * scale)
7986

8087
var bitmapBuf = newSeqUninit[byte](64 * 64)
8188

@@ -105,31 +112,26 @@ proc renderTextLine(fb: ptr UncheckedArray[uint16], font: ptr stbtt_fontinfo,
105112
let fbBase = FbXBase[px]
106113

107114
for by in 0..<h:
108-
let alpha = bitmapBuf[by * w + bx]
115+
let alpha = uint32(bitmapBuf[by * w + bx])
109116
if alpha == 0: continue
110117
let py = charY + by
111118
if py < 0 or py >= ScreenHeight: continue
112119

113-
if alpha == 255:
114-
fb[fbBase + py] = color
120+
let idx = fbBase + uint32(py)
121+
if alpha >= 255:
122+
fb[idx] = color
115123
else:
116-
let bgPixel = fb[fbBase + py]
117-
var bgR, bgG, bgB: uint8
118-
fromRGB565(bgPixel, bgR, bgG, bgB)
119-
let inv = 255'u32 - uint32(alpha)
120-
let newR = uint8((uint32(cr) * uint32(alpha) + uint32(bgR) * inv) div 255)
121-
let newG = uint8((uint32(cg) * uint32(alpha) + uint32(bgG) * inv) div 255)
122-
let newB = uint8((uint32(cb) * uint32(alpha) + uint32(bgB) * inv) div 255)
123-
fb[fbBase + py] = toRGB565(newR, newG, newB)
124+
fb[idx] = blendRGB565(color, fb[idx], alpha)
124125
{.pop.}
125126

126127
var advanceWidth, leftSideBearing: cint
127-
stbtt_GetCodepointHMetrics(font, cint(ch), addr advanceWidth, addr leftSideBearing)
128+
stbtt_GetCodepointHMetrics(font, cint(ch), addr advanceWidth,
129+
addr leftSideBearing)
128130
x += int(cfloat(advanceWidth) * scale)
129131

130132
if i < text.len - 1:
131-
let kern = stbtt_GetCodepointKernAdvance(font, cint(ch), cint(text[i+1]))
132-
x += int(cfloat(kern) * scale)
133+
x += int(cfloat(stbtt_GetCodepointKernAdvance(
134+
font, cint(ch), cint(text[i + 1]))) * scale)
133135

134136
proc display*(text: string,
135137
backgroundPath: string = DefaultBackground,
@@ -156,23 +158,27 @@ proc display*(text: string,
156158
let fb = cast[ptr UncheckedArray[uint16]](fbMap)
157159

158160
if not fileExists(backgroundPath):
159-
raise newException(IOError, "display: background file not found: " & backgroundPath)
161+
raise newException(IOError,
162+
"display: background file not found: " & backgroundPath)
160163

161164
let png = loadPNG32(backgroundPath)
162165

163166
zeroMem(fbMap, FbSize)
164167

165-
var srcIdx = 0
166-
for ly in 0..<min(png.height, ScreenHeight):
167-
for lx in 0..<min(png.width, ScreenWidth):
168-
let r = png.data[srcIdx]
169-
let g = png.data[srcIdx + 1]
170-
let b = png.data[srcIdx + 2]
171-
srcIdx += 4
172-
fb[FbXBase[lx] + ly] = toRGB565(r.uint8, g.uint8, b.uint8)
168+
# commence claude's NEON fuckery
169+
rgba_to_rgb565(
170+
cast[ptr UncheckedArray[uint8]](unsafeAddr png.data[0]),
171+
cast[ptr UncheckedArray[uint16]](addr blitTmp[0]),
172+
cint(min(png.width * png.height, FbPixels)))
173+
174+
blit_transposed(
175+
cast[ptr UncheckedArray[uint16]](addr blitTmp[0]),
176+
cast[ptr UncheckedArray[uint16]](fbMap),
177+
cint(ScreenWidth), cint(ScreenHeight))
173178

174179
if not fileExists(fontPath):
175-
raise newException(IOError, "display: font file not found: " & fontPath)
180+
raise newException(IOError,
181+
"display: font file not found: " & fontPath)
176182

177183
let fontData = loadFont(fontPath)
178184
var fontInfo: stbtt_fontinfo
@@ -182,13 +188,12 @@ proc display*(text: string,
182188

183189
let scale = stbtt_ScaleForPixelHeight(addr fontInfo, cfloat(FontSize))
184190
let lines = wrapText(text, addr fontInfo, scale, ScreenWidth - 40)
185-
186191
let lineHeight = int(FontSize * 1.2)
187192
var startY = (ScreenHeight - lines.len * lineHeight) div 2
188193

189194
for line in lines:
190195
renderTextLine(fb, addr fontInfo, line, startY, FontSize,
191-
toRGB565(255, 255, 255))
196+
toRGB565(255, 255, 255))
192197
startY += lineHeight
193198

194199
if duration == 0:
@@ -216,7 +221,7 @@ proc showUsage(progName: string) =
216221
stderr.writeLine(" -b Background PNG image (default: quarkbg.png)")
217222
stderr.writeLine(" -d Display duration in milliseconds (default: 0 = forever)")
218223
stderr.writeLine(" -f Font file path (default: TwCenMT.ttf)")
219-
stderr.writeLine(" -p Don't fork into the background (only applies if duration is 0)")
224+
stderr.writeLine(" -p Don't fork into background (only applies if duration is 0)")
220225

221226
proc main() =
222227
var
@@ -256,10 +261,10 @@ proc main() =
256261
if i + 1 <= paramCount():
257262
let durationStr = paramStr(i + 1).strip()
258263
if durationStr.len > 0:
259-
try:
260-
duration = parseInt(durationStr)
264+
try: duration = parseInt(durationStr)
261265
except ValueError:
262-
stderr.writeLine("display: invalid duration value: '" & durationStr & "'"); quit(1)
266+
stderr.writeLine("display: invalid duration: '" & durationStr & "'")
267+
quit(1)
263268
inc i
264269
else:
265270
stderr.writeLine("display: -d requires a duration argument")

0 commit comments

Comments
 (0)