-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathtile_hash.c
More file actions
185 lines (156 loc) · 6.14 KB
/
tile_hash.c
File metadata and controls
185 lines (156 loc) · 6.14 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
#include "tile_hash.h"
#include "nearest_divisor.h"
#if defined(__ARM_NEON) && defined(__OPTIMIZE__)
#define USE_NEON 1
#else
#define USE_NEON 0
#endif
#if USE_NEON
#include <arm_neon.h>
#endif
#define TILE_W 64
typedef uint64_t unaligned_u64 __attribute__((aligned(1)));
typedef uint32_t unaligned_u32 __attribute__((aligned(1)));
TileHashResult tile_hash_w64_scalar(const unsigned char *rowPtr,
int rows,
int bytesPerRow) {
uint64_t h0 = UINT64_C(0x517cc1b727220a95);
uint64_t h1 = UINT64_C(0x6c62272e07bb0142);
uint64_t h2 = UINT64_C(0x9e3779b97f4a7c15);
uint64_t h3 = UINT64_C(0xbf58476d1ce4e5b9);
for (int y = 0; y < rows; y++) {
const unsigned char *p = rowPtr;
for (int i = 0; i < 8; i++) {
h0 += *(const unaligned_u64 *)(p);
h1 += *(const unaligned_u64 *)(p + 8);
h2 += *(const unaligned_u64 *)(p + 16);
h3 += *(const unaligned_u64 *)(p + 24);
p += 32;
}
h0 ^= h2; h1 ^= h3;
h2 += h0; h3 += h1;
rowPtr += bytesPerRow;
}
h0 ^= h2; h1 ^= h3;
h0 ^= h0 >> 33; h0 *= UINT64_C(0xff51afd7ed558ccd); h0 ^= h0 >> 33;
h1 ^= h1 >> 29; h1 *= UINT64_C(0xc4ceb9fe1a85ec53); h1 ^= h1 >> 29;
TileHashResult result;
result.hashLo = (int64_t)h0;
result.hashHi = (int64_t)h1;
return result;
}
#if USE_NEON
TileHashResult tile_hash_w64_neon(const unsigned char *rowPtr,
int rows,
int bytesPerRow) {
uint64x2_t s0 = vcombine_u64(vcreate_u64(UINT64_C(0x517cc1b727220a95)),
vcreate_u64(UINT64_C(0x6c62272e07bb0142)));
uint64x2_t s1 = vcombine_u64(vcreate_u64(UINT64_C(0x9e3779b97f4a7c15)),
vcreate_u64(UINT64_C(0xbf58476d1ce4e5b9)));
for (int y = 0; y < rows; y++) {
const unsigned char *p = rowPtr;
for (int i = 0; i < 8; i++) {
s0 = vaddq_u64(s0, vld1q_u64((const uint64_t *)p));
s1 = vaddq_u64(s1, vld1q_u64((const uint64_t *)(p + 16)));
p += 32;
}
s0 = veorq_u64(s0, s1);
s1 = vaddq_u64(s1, s0);
rowPtr += bytesPerRow;
}
uint64_t h0 = vgetq_lane_u64(s0, 0);
uint64_t h1 = vgetq_lane_u64(s0, 1);
uint64_t h2 = vgetq_lane_u64(s1, 0);
uint64_t h3 = vgetq_lane_u64(s1, 1);
h0 ^= h2; h1 ^= h3;
h0 ^= h0 >> 33; h0 *= UINT64_C(0xff51afd7ed558ccd); h0 ^= h0 >> 33;
h1 ^= h1 >> 29; h1 *= UINT64_C(0xc4ceb9fe1a85ec53); h1 ^= h1 >> 29;
TileHashResult result;
result.hashLo = (int64_t)h0;
result.hashHi = (int64_t)h1;
return result;
}
#endif
static inline TileHashResult tile_hash_w64(const unsigned char *rowPtr,
int rows,
int bytesPerRow) {
#if USE_NEON
return tile_hash_w64_neon(rowPtr, rows, bytesPerRow);
#else
return tile_hash_w64_scalar(rowPtr, rows, bytesPerRow);
#endif
}
TileHashResult tile_hash(const void *data,
int startX, int startY,
int endX, int endY,
int bytesPerRow) {
const int byteWidth = (endX - startX) * 4;
const int quads = byteWidth >> 5;
const int remBytes = byteWidth & 31;
const int rem8 = remBytes >> 3;
const int tail = remBytes & 4;
const unsigned char *rowPtr = (const unsigned char *)data
+ (size_t)startY * bytesPerRow
+ (size_t)startX * 4;
uint64_t h0 = UINT64_C(0x517cc1b727220a95);
uint64_t h1 = UINT64_C(0x6c62272e07bb0142);
uint64_t h2 = UINT64_C(0x9e3779b97f4a7c15);
uint64_t h3 = UINT64_C(0xbf58476d1ce4e5b9);
for (int y = startY; y < endY; y++) {
const unsigned char *p = rowPtr;
for (int i = 0; i < quads; i++) {
h0 += *(const unaligned_u64 *)(p);
h1 += *(const unaligned_u64 *)(p + 8);
h2 += *(const unaligned_u64 *)(p + 16);
h3 += *(const unaligned_u64 *)(p + 24);
p += 32;
}
if (rem8 >= 1) h0 += *(const unaligned_u64 *)(p);
if (rem8 >= 2) h1 += *(const unaligned_u64 *)(p + 8);
if (rem8 >= 3) h2 += *(const unaligned_u64 *)(p + 16);
if (tail) h3 += (uint64_t)(*(const unaligned_u32 *)(p + rem8 * 8));
h0 ^= h2; h1 ^= h3;
h2 += h0; h3 += h1;
rowPtr += bytesPerRow;
}
h0 ^= h2; h1 ^= h3;
h0 ^= h0 >> 33; h0 *= UINT64_C(0xff51afd7ed558ccd); h0 ^= h0 >> 33;
h1 ^= h1 >> 29; h1 *= UINT64_C(0xc4ceb9fe1a85ec53); h1 ^= h1 >> 29;
TileHashResult result;
result.hashLo = (int64_t)h0;
result.hashHi = (int64_t)h1;
return result;
}
TileLayout tile_compute_layout(int imageWidth, int imageHeight) {
TileLayout layout;
layout.tileWidth = TILE_W;
layout.tileHeight = nearest_divisor(imageHeight, 22, 22, 44);
layout.columns = (imageWidth + TILE_W - 1) / TILE_W;
layout.rows = (imageHeight + layout.tileHeight - 1) / layout.tileHeight;
return layout;
}
void tile_compute_all(const void *data,
int imageWidth, int imageHeight,
int bytesPerRow,
TileLayout layout,
TileHashResult *out) {
const int fullCols = imageWidth / TILE_W;
int idx = 0;
for (int row = 0; row < layout.rows; row++) {
int startY = row * layout.tileHeight;
int tileRows = layout.tileHeight;
if (startY + tileRows > imageHeight) tileRows = imageHeight - startY;
for (int col = 0; col < fullCols; col++) {
const unsigned char *rowPtr = (const unsigned char *)data
+ (size_t)startY * bytesPerRow
+ (size_t)(col * TILE_W) * 4;
out[idx] = tile_hash_w64(rowPtr, tileRows, bytesPerRow);
idx++;
}
if (fullCols < layout.columns) {
int startX = fullCols * TILE_W;
out[idx] = tile_hash(data, startX, startY, imageWidth, startY + tileRows, bytesPerRow);
idx++;
}
}
}