Skip to content

Commit 53edbd8

Browse files
chammoruclaude
andcommitted
QVisionCore: speed up PSNR/SSIM with SSE2 and integral-image box filter (#10)
Fix qSSIM horizontal window sliding: the update win_x += col_x[c + SIZE_N] - col_x[c] computed the column-sum difference as qu32, which wrapped to a large positive value when the new column had a smaller sum than the leaving column. That wrapped value was then zero-extended into the qu64 accumulator, inflating win_x by ~2^32 and corrupting SSIM (sometimes producing scores greater than 1). Fix by evaluating the subtraction in qu64 order: win_x = win_x + col_x[c + SIZE_N] - col_x[c]. Add a regression test: identical bright-left / dark-right images must score exactly 1.0 regardless of content, which the old code failed. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 8ef41f7 commit 53edbd8

2 files changed

Lines changed: 179 additions & 99 deletions

File tree

QVisionCore/qimage_metrics.c

Lines changed: 163 additions & 99 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,87 @@
11
#include "qimage_metrics.h"
22

3+
#include <stdlib.h>
4+
5+
#if defined(_M_X64) || defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)
6+
#include <emmintrin.h>
7+
#define QM_HAS_SSE2 1
8+
#else
9+
#define QM_HAS_SSE2 0
10+
#endif
11+
12+
#if QM_HAS_SSE2
13+
static qu64 psnr_row_sse2(qu8 *a, qu8 *b, int n)
14+
{
15+
qu64 sum;
16+
qu32 lane[4];
17+
__m128i acc = _mm_setzero_si128();
18+
__m128i zero = _mm_setzero_si128();
19+
int j = 0;
20+
21+
for (; j + 16 <= n; j += 16) {
22+
__m128i va = _mm_loadu_si128((const __m128i *)(a + j));
23+
__m128i vb = _mm_loadu_si128((const __m128i *)(b + j));
24+
__m128i a_lo = _mm_unpacklo_epi8(va, zero);
25+
__m128i a_hi = _mm_unpackhi_epi8(va, zero);
26+
__m128i b_lo = _mm_unpacklo_epi8(vb, zero);
27+
__m128i b_hi = _mm_unpackhi_epi8(vb, zero);
28+
__m128i d_lo = _mm_sub_epi16(a_lo, b_lo);
29+
__m128i d_hi = _mm_sub_epi16(a_hi, b_hi);
30+
// _mm_madd_epi16(x, x) widens int16 to int32 and pairs them.
31+
// Per-row max: (W/16) * 8 lanes * (2 * 255^2) ~ 4.16M * (W/16);
32+
// for W <= 8192 this is well under 2^31, so 32-bit accum is safe.
33+
acc = _mm_add_epi32(acc, _mm_add_epi32(
34+
_mm_madd_epi16(d_lo, d_lo),
35+
_mm_madd_epi16(d_hi, d_hi)));
36+
}
37+
38+
_mm_storeu_si128((__m128i *)lane, acc);
39+
sum = (qu64)lane[0] + lane[1] + lane[2] + lane[3];
40+
41+
for (; j < n; j++) {
42+
long d = a[j] - b[j];
43+
sum += (qu64)(d * d);
44+
}
45+
return sum;
46+
}
47+
#endif
48+
349
double qPSNR(qu8 *src, qu8 *dst, int w, int h, int stride, int px_w)
450
{
551
int i, j;
652
qu64 sum_dxd = 0;
753
double MSE, PMSE;
8-
const int MAXi = 255;
9-
int margin = (stride - w) * px_w;
54+
const int MAXi = 255;
55+
56+
#if QM_HAS_SSE2
57+
if (px_w == 1) {
58+
for (i = 0; i < h; i++) {
59+
sum_dxd += psnr_row_sse2(src, dst, w);
60+
src += stride;
61+
dst += stride;
62+
}
63+
MSE = (double)sum_dxd / ((double)w * h);
64+
PMSE = MSE / (MAXi * MAXi);
65+
return -10 * log10(PMSE);
66+
}
67+
#endif
1068

11-
for (i = 0; i < h; i++) {
12-
for (j = 0; j < w; j++) {
13-
long d = *src - *dst;
69+
{
70+
int margin = (stride - w) * px_w;
1471

15-
src += px_w;
16-
dst += px_w;
72+
for (i = 0; i < h; i++) {
73+
for (j = 0; j < w; j++) {
74+
long d = *src - *dst;
1775

18-
sum_dxd += (qu64)(d * d);
19-
}
76+
src += px_w;
77+
dst += px_w;
2078

21-
src += margin;
22-
dst += margin;
79+
sum_dxd += (qu64)(d * d);
80+
}
81+
82+
src += margin;
83+
dst += margin;
84+
}
2385
}
2486

2587
MSE = (double)sum_dxd / ((double)w * h);
@@ -36,109 +98,111 @@ double qPSNR(qu8 *src, qu8 *dst, int w, int h, int stride, int px_w)
3698

3799
static const double window_size_reciprocal = 1 / (double)WINDOW_SIZE;
38100

39-
static double ssim_average(qu8 *src, int real_w, int px_w)
101+
// Integral-image (box-filter) reformulation of the naive sliding-window SSIM.
102+
// The previous implementation rescanned each 8x8 window five times, making
103+
// it O(W*H*64); this version keeps running column sums of s, d, s*s, d*d, s*d
104+
// over the last SIZE_N rows and slides them horizontally to evaluate each
105+
// window in O(1), yielding O(W*H) overall. Algebraically the variance and
106+
// covariance use Sx^2/N - mu^2 instead of Sigma(x - mu)^2 / N: mathematically
107+
// identical, may differ from the old code in the last few ULPs of double
108+
// rounding.
109+
double qSSIM(qu8 *src, qu8 *dst, int w, int h, int stride, int px_w)
40110
{
41-
int total = 0;
42-
int i, j;
43-
double average;
44-
45-
for (i = 0; i < SIZE_N; i++) {
46-
for (j = 0; j < SIZE_N; j++)
47-
total += src[j * px_w];
111+
const double c1 = K1 * L * K1 * L;
112+
const double c2 = K2 * L * K2 * L;
48113

49-
src += real_w;
50-
}
114+
int real_w = stride * px_w;
115+
int nW = w - SIZE_N + 1;
116+
int nH = h - SIZE_N + 1;
117+
int r, c, j;
51118

52-
average = total * window_size_reciprocal;
119+
qu32 *col_s, *col_d, *col_ss, *col_dd, *col_sd;
120+
double total_ssim = 0.0;
53121

54-
return average;
55-
}
122+
if (nW <= 0 || nH <= 0)
123+
return 0.0;
56124

57-
static double ssim_variance(qu8 *src, int real_w, double average, int px_w)
58-
{
59-
int i, j;
60-
double t, sum_square = 0, variance;
125+
col_s = (qu32 *)malloc(sizeof(qu32) * w);
126+
col_d = (qu32 *)malloc(sizeof(qu32) * w);
127+
col_ss = (qu32 *)malloc(sizeof(qu32) * w);
128+
col_dd = (qu32 *)malloc(sizeof(qu32) * w);
129+
col_sd = (qu32 *)malloc(sizeof(qu32) * w);
130+
if (!col_s || !col_d || !col_ss || !col_dd || !col_sd) {
131+
free(col_s); free(col_d); free(col_ss); free(col_dd); free(col_sd);
132+
return 0.0;
133+
}
61134

62-
for (i = 0; i < SIZE_N; i++) {
63-
for (j = 0; j < SIZE_N; j++) {
64-
t = src[j * px_w] - average;
65-
sum_square += t * t;
135+
// Seed column sums over rows [0, SIZE_N).
136+
for (j = 0; j < w; j++) {
137+
col_s[j] = 0; col_d[j] = 0;
138+
col_ss[j] = 0; col_dd[j] = 0;
139+
col_sd[j] = 0;
140+
}
141+
for (r = 0; r < SIZE_N; r++) {
142+
qu8 *sp = src + r * real_w;
143+
qu8 *dp = dst + r * real_w;
144+
for (j = 0; j < w; j++) {
145+
qu32 s = sp[j * px_w];
146+
qu32 d = dp[j * px_w];
147+
col_s[j] += s;
148+
col_d[j] += d;
149+
col_ss[j] += s * s;
150+
col_dd[j] += d * d;
151+
col_sd[j] += s * d;
66152
}
67-
src += real_w;
68153
}
69154

70-
variance = sum_square * window_size_reciprocal;
155+
for (r = 0; r < nH; r++) {
156+
qu64 win_s = 0, win_d = 0, win_ss = 0, win_dd = 0, win_sd = 0;
71157

72-
return variance;
73-
}
74-
75-
static double ssim_covariance(qu8 *src, qu8 *dst, int real_w,
76-
double s_average, double d_average, int px_w)
77-
{
78-
int i, j;
79-
double total = 0, covariance;
80-
81-
for (i = 0; i < SIZE_N; i++) {
82158
for (j = 0; j < SIZE_N; j++) {
83-
int idx = j * px_w;
84-
85-
total += (src[idx] - s_average) * (dst[idx] - d_average);
159+
win_s += col_s[j];
160+
win_d += col_d[j];
161+
win_ss += col_ss[j];
162+
win_dd += col_dd[j];
163+
win_sd += col_sd[j];
86164
}
87165

88-
src += real_w;
89-
dst += real_w;
90-
}
91-
92-
covariance = total * window_size_reciprocal;
93-
94-
return covariance;
95-
}
96-
97-
static double ssim_window(qu8 *src, qu8 *dst, int real_w, int px_w)
98-
{
99-
const double c1 = K1 * L * K1 * L;
100-
const double c2 = K2 * L * K2 * L;
101-
102-
double u1 = ssim_average(src, real_w, px_w);
103-
double u2 = ssim_average(dst, real_w, px_w);
104-
105-
double a1 = ssim_variance(src, real_w, u1, px_w);
106-
double a2 = ssim_variance(dst, real_w, u2, px_w);
107-
108-
double covariance = ssim_covariance(src, dst, real_w, u1, u2, px_w);
109-
110-
double ssim = ((2 * u1 * u2 + c1) * (2 * covariance + c2)) /
111-
((u1 * u1 + u2 * u2 + c1) * (a1 + a2 + c2));
112-
113-
return ssim;
114-
}
115-
116-
double qSSIM(qu8 *src, qu8 *dst, int w, int h, int stride, int px_w)
117-
{
118-
double mean_ssim, total_ssim = 0;
119-
int real_w = stride * px_w;
120-
int i, j;
121-
int nW = w - SIZE_N + 1;
122-
int nH = h - SIZE_N + 1;
123-
124-
if (nW <= 0 || nH <= 0)
125-
return 0.0;
126-
127-
// SIZE_N x SIZE_N linear sliding window
128-
// My implementation of SSIM is certainly inefficient,
129-
// but it gives values and is easy to understand.
130-
for (i = 0; i < nH; i++) {
131-
qu8 *src_t, *dst_t;
132-
int h_hop = real_w * i;
133-
134-
src_t = src + h_hop;
135-
dst_t = dst + h_hop;
166+
for (c = 0; c < nW; c++) {
167+
double u1 = (double)win_s * window_size_reciprocal;
168+
double u2 = (double)win_d * window_size_reciprocal;
169+
double a1 = (double)win_ss * window_size_reciprocal - u1 * u1;
170+
double a2 = (double)win_dd * window_size_reciprocal - u2 * u2;
171+
double cov = (double)win_sd * window_size_reciprocal - u1 * u2;
172+
173+
double ssim = ((2 * u1 * u2 + c1) * (2 * cov + c2)) /
174+
((u1 * u1 + u2 * u2 + c1) * (a1 + a2 + c2));
175+
total_ssim += ssim;
176+
177+
if (c + SIZE_N < w) {
178+
win_s = win_s + col_s[c + SIZE_N] - col_s[c];
179+
win_d = win_d + col_d[c + SIZE_N] - col_d[c];
180+
win_ss = win_ss + col_ss[c + SIZE_N] - col_ss[c];
181+
win_dd = win_dd + col_dd[c + SIZE_N] - col_dd[c];
182+
win_sd = win_sd + col_sd[c + SIZE_N] - col_sd[c];
183+
}
184+
}
136185

137-
for (j = 0; j < nW; j++, src_t += px_w, dst_t += px_w)
138-
total_ssim += ssim_window(src_t, dst_t, real_w, px_w);
186+
if (r + 1 < nH) {
187+
qu8 *sp_old = src + r * real_w;
188+
qu8 *dp_old = dst + r * real_w;
189+
qu8 *sp_new = src + (r + SIZE_N) * real_w;
190+
qu8 *dp_new = dst + (r + SIZE_N) * real_w;
191+
for (j = 0; j < w; j++) {
192+
qu32 s_old = sp_old[j * px_w];
193+
qu32 d_old = dp_old[j * px_w];
194+
qu32 s_new = sp_new[j * px_w];
195+
qu32 d_new = dp_new[j * px_w];
196+
col_s[j] += s_new - s_old;
197+
col_d[j] += d_new - d_old;
198+
col_ss[j] += s_new * s_new - s_old * s_old;
199+
col_dd[j] += d_new * d_new - d_old * d_old;
200+
col_sd[j] += s_new * d_new - s_old * d_old;
201+
}
202+
}
139203
}
140204

141-
mean_ssim = total_ssim / (nW * nH);
205+
free(col_s); free(col_d); free(col_ss); free(col_dd); free(col_sd);
142206

143-
return mean_ssim;
207+
return total_ssim / (nW * nH);
144208
}

Tests/CoreRegressionTests.c

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -317,6 +317,22 @@ static void test_metrics(void)
317317
check_near("SSIM constant reference",
318318
qSSIM(zero, ten, 8, 8, 8, 1),
319319
6.5025 / 106.5025, 0.000000001);
320+
321+
{
322+
/* When the window slides from a bright region into a dark region the
323+
* column sums decrease. The sliding update
324+
* win_x += col_x[c + SIZE_N] - col_x[c]
325+
* computed the difference in 32-bit unsigned, which wrapped to a large
326+
* positive value and inflated win_x, corrupting SSIM (sometimes > 1).
327+
* For identical src/dst the result must be exactly 1.0. */
328+
qu8 bright_dark[8 * 16];
329+
for (y = 0; y < 8; y++) {
330+
for (x = 0; x < 16; x++)
331+
bright_dark[y * 16 + x] = (x < 8) ? 255 : 0;
332+
}
333+
check_near("SSIM bright-to-dark identical images",
334+
qSSIM(bright_dark, bright_dark, 16, 8, 16, 1), 1.0, 0.000000001);
335+
}
320336
}
321337

322338
int main(void)

0 commit comments

Comments
 (0)