11#include "qimage_metrics.h"
22
3+ #include <stdlib.h>
4+
5+ #if defined(_M_X64 ) || defined(_M_IX86 ) || defined(__x86_64__ ) || defined(__i386__ )
6+ #include <emmintrin.h>
7+ #define QM_HAS_SSE2 1
8+ #else
9+ #define QM_HAS_SSE2 0
10+ #endif
11+
12+ #if QM_HAS_SSE2
13+ static qu64 psnr_row_sse2 (qu8 * a , qu8 * b , int n )
14+ {
15+ qu64 sum ;
16+ qu32 lane [4 ];
17+ __m128i acc = _mm_setzero_si128 ();
18+ __m128i zero = _mm_setzero_si128 ();
19+ int j = 0 ;
20+
21+ for (; j + 16 <= n ; j += 16 ) {
22+ __m128i va = _mm_loadu_si128 ((const __m128i * )(a + j ));
23+ __m128i vb = _mm_loadu_si128 ((const __m128i * )(b + j ));
24+ __m128i a_lo = _mm_unpacklo_epi8 (va , zero );
25+ __m128i a_hi = _mm_unpackhi_epi8 (va , zero );
26+ __m128i b_lo = _mm_unpacklo_epi8 (vb , zero );
27+ __m128i b_hi = _mm_unpackhi_epi8 (vb , zero );
28+ __m128i d_lo = _mm_sub_epi16 (a_lo , b_lo );
29+ __m128i d_hi = _mm_sub_epi16 (a_hi , b_hi );
30+ // _mm_madd_epi16(x, x) widens int16 to int32 and pairs them.
31+ // Per-row max: (W/16) * 8 lanes * (2 * 255^2) ~ 4.16M * (W/16);
32+ // for W <= 8192 this is well under 2^31, so 32-bit accum is safe.
33+ acc = _mm_add_epi32 (acc , _mm_add_epi32 (
34+ _mm_madd_epi16 (d_lo , d_lo ),
35+ _mm_madd_epi16 (d_hi , d_hi )));
36+ }
37+
38+ _mm_storeu_si128 ((__m128i * )lane , acc );
39+ sum = (qu64 )lane [0 ] + lane [1 ] + lane [2 ] + lane [3 ];
40+
41+ for (; j < n ; j ++ ) {
42+ long d = a [j ] - b [j ];
43+ sum += (qu64 )(d * d );
44+ }
45+ return sum ;
46+ }
47+ #endif
48+
349double qPSNR (qu8 * src , qu8 * dst , int w , int h , int stride , int px_w )
450{
551 int i , j ;
652 qu64 sum_dxd = 0 ;
753 double MSE , PMSE ;
8- const int MAXi = 255 ;
9- int margin = (stride - w ) * px_w ;
54+ const int MAXi = 255 ;
55+
56+ #if QM_HAS_SSE2
57+ if (px_w == 1 ) {
58+ for (i = 0 ; i < h ; i ++ ) {
59+ sum_dxd += psnr_row_sse2 (src , dst , w );
60+ src += stride ;
61+ dst += stride ;
62+ }
63+ MSE = (double )sum_dxd / ((double )w * h );
64+ PMSE = MSE / (MAXi * MAXi );
65+ return -10 * log10 (PMSE );
66+ }
67+ #endif
1068
11- for (i = 0 ; i < h ; i ++ ) {
12- for (j = 0 ; j < w ; j ++ ) {
13- long d = * src - * dst ;
69+ {
70+ int margin = (stride - w ) * px_w ;
1471
15- src += px_w ;
16- dst += px_w ;
72+ for (i = 0 ; i < h ; i ++ ) {
73+ for (j = 0 ; j < w ; j ++ ) {
74+ long d = * src - * dst ;
1775
18- sum_dxd += ( qu64 )( d * d ) ;
19- }
76+ src += px_w ;
77+ dst += px_w ;
2078
21- src += margin ;
22- dst += margin ;
79+ sum_dxd += (qu64 )(d * d );
80+ }
81+
82+ src += margin ;
83+ dst += margin ;
84+ }
2385 }
2486
2587 MSE = (double )sum_dxd / ((double )w * h );
@@ -36,109 +98,111 @@ double qPSNR(qu8 *src, qu8 *dst, int w, int h, int stride, int px_w)
3698
3799static const double window_size_reciprocal = 1 / (double )WINDOW_SIZE ;
38100
39- static double ssim_average (qu8 * src , int real_w , int px_w )
101+ // Integral-image (box-filter) reformulation of the naive sliding-window SSIM.
102+ // The previous implementation rescanned each 8x8 window five times, making
103+ // it O(W*H*64); this version keeps running column sums of s, d, s*s, d*d, s*d
104+ // over the last SIZE_N rows and slides them horizontally to evaluate each
105+ // window in O(1), yielding O(W*H) overall. Algebraically the variance and
106+ // covariance use Sx^2/N - mu^2 instead of Sigma(x - mu)^2 / N: mathematically
107+ // identical, may differ from the old code in the last few ULPs of double
108+ // rounding.
109+ double qSSIM (qu8 * src , qu8 * dst , int w , int h , int stride , int px_w )
40110{
41- int total = 0 ;
42- int i , j ;
43- double average ;
44-
45- for (i = 0 ; i < SIZE_N ; i ++ ) {
46- for (j = 0 ; j < SIZE_N ; j ++ )
47- total += src [j * px_w ];
111+ const double c1 = K1 * L * K1 * L ;
112+ const double c2 = K2 * L * K2 * L ;
48113
49- src += real_w ;
50- }
114+ int real_w = stride * px_w ;
115+ int nW = w - SIZE_N + 1 ;
116+ int nH = h - SIZE_N + 1 ;
117+ int r , c , j ;
51118
52- average = total * window_size_reciprocal ;
119+ qu32 * col_s , * col_d , * col_ss , * col_dd , * col_sd ;
120+ double total_ssim = 0.0 ;
53121
54- return average ;
55- }
122+ if ( nW <= 0 || nH <= 0 )
123+ return 0.0 ;
56124
57- static double ssim_variance (qu8 * src , int real_w , double average , int px_w )
58- {
59- int i , j ;
60- double t , sum_square = 0 , variance ;
125+ col_s = (qu32 * )malloc (sizeof (qu32 ) * w );
126+ col_d = (qu32 * )malloc (sizeof (qu32 ) * w );
127+ col_ss = (qu32 * )malloc (sizeof (qu32 ) * w );
128+ col_dd = (qu32 * )malloc (sizeof (qu32 ) * w );
129+ col_sd = (qu32 * )malloc (sizeof (qu32 ) * w );
130+ if (!col_s || !col_d || !col_ss || !col_dd || !col_sd ) {
131+ free (col_s ); free (col_d ); free (col_ss ); free (col_dd ); free (col_sd );
132+ return 0.0 ;
133+ }
61134
62- for (i = 0 ; i < SIZE_N ; i ++ ) {
63- for (j = 0 ; j < SIZE_N ; j ++ ) {
64- t = src [j * px_w ] - average ;
65- sum_square += t * t ;
135+ // Seed column sums over rows [0, SIZE_N).
136+ for (j = 0 ; j < w ; j ++ ) {
137+ col_s [j ] = 0 ; col_d [j ] = 0 ;
138+ col_ss [j ] = 0 ; col_dd [j ] = 0 ;
139+ col_sd [j ] = 0 ;
140+ }
141+ for (r = 0 ; r < SIZE_N ; r ++ ) {
142+ qu8 * sp = src + r * real_w ;
143+ qu8 * dp = dst + r * real_w ;
144+ for (j = 0 ; j < w ; j ++ ) {
145+ qu32 s = sp [j * px_w ];
146+ qu32 d = dp [j * px_w ];
147+ col_s [j ] += s ;
148+ col_d [j ] += d ;
149+ col_ss [j ] += s * s ;
150+ col_dd [j ] += d * d ;
151+ col_sd [j ] += s * d ;
66152 }
67- src += real_w ;
68153 }
69154
70- variance = sum_square * window_size_reciprocal ;
155+ for (r = 0 ; r < nH ; r ++ ) {
156+ qu64 win_s = 0 , win_d = 0 , win_ss = 0 , win_dd = 0 , win_sd = 0 ;
71157
72- return variance ;
73- }
74-
75- static double ssim_covariance (qu8 * src , qu8 * dst , int real_w ,
76- double s_average , double d_average , int px_w )
77- {
78- int i , j ;
79- double total = 0 , covariance ;
80-
81- for (i = 0 ; i < SIZE_N ; i ++ ) {
82158 for (j = 0 ; j < SIZE_N ; j ++ ) {
83- int idx = j * px_w ;
84-
85- total += (src [idx ] - s_average ) * (dst [idx ] - d_average );
159+ win_s += col_s [j ];
160+ win_d += col_d [j ];
161+ win_ss += col_ss [j ];
162+ win_dd += col_dd [j ];
163+ win_sd += col_sd [j ];
86164 }
87165
88- src += real_w ;
89- dst += real_w ;
90- }
91-
92- covariance = total * window_size_reciprocal ;
93-
94- return covariance ;
95- }
96-
97- static double ssim_window (qu8 * src , qu8 * dst , int real_w , int px_w )
98- {
99- const double c1 = K1 * L * K1 * L ;
100- const double c2 = K2 * L * K2 * L ;
101-
102- double u1 = ssim_average (src , real_w , px_w );
103- double u2 = ssim_average (dst , real_w , px_w );
104-
105- double a1 = ssim_variance (src , real_w , u1 , px_w );
106- double a2 = ssim_variance (dst , real_w , u2 , px_w );
107-
108- double covariance = ssim_covariance (src , dst , real_w , u1 , u2 , px_w );
109-
110- double ssim = ((2 * u1 * u2 + c1 ) * (2 * covariance + c2 )) /
111- ((u1 * u1 + u2 * u2 + c1 ) * (a1 + a2 + c2 ));
112-
113- return ssim ;
114- }
115-
116- double qSSIM (qu8 * src , qu8 * dst , int w , int h , int stride , int px_w )
117- {
118- double mean_ssim , total_ssim = 0 ;
119- int real_w = stride * px_w ;
120- int i , j ;
121- int nW = w - SIZE_N + 1 ;
122- int nH = h - SIZE_N + 1 ;
123-
124- if (nW <= 0 || nH <= 0 )
125- return 0.0 ;
126-
127- // SIZE_N x SIZE_N linear sliding window
128- // My implementation of SSIM is certainly inefficient,
129- // but it gives values and is easy to understand.
130- for (i = 0 ; i < nH ; i ++ ) {
131- qu8 * src_t , * dst_t ;
132- int h_hop = real_w * i ;
133-
134- src_t = src + h_hop ;
135- dst_t = dst + h_hop ;
166+ for (c = 0 ; c < nW ; c ++ ) {
167+ double u1 = (double )win_s * window_size_reciprocal ;
168+ double u2 = (double )win_d * window_size_reciprocal ;
169+ double a1 = (double )win_ss * window_size_reciprocal - u1 * u1 ;
170+ double a2 = (double )win_dd * window_size_reciprocal - u2 * u2 ;
171+ double cov = (double )win_sd * window_size_reciprocal - u1 * u2 ;
172+
173+ double ssim = ((2 * u1 * u2 + c1 ) * (2 * cov + c2 )) /
174+ ((u1 * u1 + u2 * u2 + c1 ) * (a1 + a2 + c2 ));
175+ total_ssim += ssim ;
176+
177+ if (c + SIZE_N < w ) {
178+ win_s += col_s [c + SIZE_N ] - col_s [c ];
179+ win_d += col_d [c + SIZE_N ] - col_d [c ];
180+ win_ss += col_ss [c + SIZE_N ] - col_ss [c ];
181+ win_dd += col_dd [c + SIZE_N ] - col_dd [c ];
182+ win_sd += col_sd [c + SIZE_N ] - col_sd [c ];
183+ }
184+ }
136185
137- for (j = 0 ; j < nW ; j ++ , src_t += px_w , dst_t += px_w )
138- total_ssim += ssim_window (src_t , dst_t , real_w , px_w );
186+ if (r + 1 < nH ) {
187+ qu8 * sp_old = src + r * real_w ;
188+ qu8 * dp_old = dst + r * real_w ;
189+ qu8 * sp_new = src + (r + SIZE_N ) * real_w ;
190+ qu8 * dp_new = dst + (r + SIZE_N ) * real_w ;
191+ for (j = 0 ; j < w ; j ++ ) {
192+ qu32 s_old = sp_old [j * px_w ];
193+ qu32 d_old = dp_old [j * px_w ];
194+ qu32 s_new = sp_new [j * px_w ];
195+ qu32 d_new = dp_new [j * px_w ];
196+ col_s [j ] += s_new - s_old ;
197+ col_d [j ] += d_new - d_old ;
198+ col_ss [j ] += s_new * s_new - s_old * s_old ;
199+ col_dd [j ] += d_new * d_new - d_old * d_old ;
200+ col_sd [j ] += s_new * d_new - s_old * d_old ;
201+ }
202+ }
139203 }
140204
141- mean_ssim = total_ssim / ( nW * nH );
205+ free ( col_s ); free ( col_d ); free ( col_ss ); free ( col_dd ); free ( col_sd );
142206
143- return mean_ssim ;
207+ return total_ssim / ( nW * nH ) ;
144208}
0 commit comments