Skip to content

Commit b4bc774

Browse files
committed
Support loading float32 train data and converting it to fp16 in task1
1 parent c50d27f commit b4bc774

9 files changed

Lines changed: 98 additions & 24 deletions

File tree

cpp/deglib/include/distance/fp16.h

Lines changed: 16 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -59,28 +59,32 @@ static inline uint16_t float_to_fp16(float f) {
5959
#endif
6060
}
6161

62-
static inline std::vector<uint16_t> floats_to_fp16(const std::vector<float>& v) {
63-
std::vector<uint16_t> out(v.size());
62+
static inline void floats_to_fp16(const float* src, uint16_t* dst, size_t size) {
6463
#if defined(USE_AVX512) || defined(USE_AVX) || defined(USE_SSE)
6564
size_t i = 0;
66-
for (; i + 4 <= v.size(); i += 4) {
67-
__m128 f4 = _mm_loadu_ps(&v[i]);
65+
for (; i + 4 <= size; i += 4) {
66+
__m128 f4 = _mm_loadu_ps(&src[i]);
6867
__m128i h4 = _mm_cvtps_ph(f4, _MM_FROUND_TO_NEAREST_INT);
6968
alignas(16) uint16_t tmp[8];
7069
_mm_storeu_si128((__m128i*)tmp, h4);
71-
out[i] = tmp[0];
72-
out[i+1] = tmp[1];
73-
out[i+2] = tmp[2];
74-
out[i+3] = tmp[3];
70+
dst[i] = tmp[0];
71+
dst[i+1] = tmp[1];
72+
dst[i+2] = tmp[2];
73+
dst[i+3] = tmp[3];
7574
}
76-
for (; i < v.size(); ++i) {
77-
out[i] = float_to_fp16(v[i]);
75+
for (; i < size; ++i) {
76+
dst[i] = float_to_fp16(src[i]);
7877
}
7978
#else
80-
for (size_t i = 0; i < v.size(); ++i) {
81-
out[i] = float_to_fp16(v[i]);
79+
for (size_t i = 0; i < size; ++i) {
80+
dst[i] = float_to_fp16(src[i]);
8281
}
8382
#endif
83+
}
84+
85+
static inline std::vector<uint16_t> floats_to_fp16(const std::vector<float>& v) {
86+
std::vector<uint16_t> out(v.size());
87+
floats_to_fp16(v.data(), out.data(), v.size());
8488
return out;
8589
}
8690

cpp/deglib/include/quantization/evp_quantize.h

Lines changed: 5 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -298,14 +298,14 @@ inline std::vector<std::byte> quantize_batch(const uint16_t* data, size_t count,
298298
* @return std::vector<std::byte> with count * 2 * dim/8 bytes
299299
* @throws std::invalid_argument if dim % 8 != 0 or non_zeros >= dim
300300
*/
301-
inline std::vector<std::byte> quantize_batch(const std::vector<std::vector<std::byte>>& data, uint32_t dim,
302-
uint32_t non_zeros, size_t numThreads = 0) {
301+
inline std::vector<std::byte> quantize_batch_fp16(const std::vector<std::vector<std::byte>>& data, uint32_t dim,
302+
uint32_t non_zeros, size_t numThreads = 0) {
303303
const size_t count = data.size();
304304
if (dim % 8 != 0) {
305-
throw std::invalid_argument("quantize_batch: dim must be divisible by 8, got " + std::to_string(dim));
305+
throw std::invalid_argument("quantize_batch_fp16: dim must be divisible by 8, got " + std::to_string(dim));
306306
}
307307
if (non_zeros >= dim) {
308-
throw std::invalid_argument("quantize_batch: non_zeros must be < dim");
308+
throw std::invalid_argument("quantize_batch_fp16: non_zeros must be < dim");
309309
}
310310

311311
const size_t mask_bytes = dim / 8;
@@ -375,4 +375,5 @@ inline std::vector<std::byte> quantize_batch(const std::vector<std::vector<std::
375375

376376
return result;
377377
}
378+
378379
} // namespace deglib::quantization

cpp/sisap/task1/mode1.h

Lines changed: 11 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -123,7 +123,6 @@ static int run(const std::filesystem::path& data_path,
123123

124124
auto datasets = hdf5_reader::scan_datasets(h5path);
125125
auto& train_info = hdf5_reader::find_dataset(datasets, "train");
126-
127126
double t_load_start = sisap_common::now_ms();
128127
size_t dims = static_cast<size_t>(train_info.num_cols);
129128
size_t count = static_cast<size_t>(train_info.num_rows);
@@ -193,6 +192,17 @@ static int run(const std::filesystem::path& data_path,
193192
double t_chunk_load = sisap_common::now_ms();
194193
std::vector<std::vector<std::byte>> chunk_vectors =
195194
hdf5_reader::read_matrix_bytes(h5path, train_info, start_row, current_chunk_size);
195+
if (train_info.element_size == 4) {
196+
for (auto& vec : chunk_vectors) {
197+
std::vector<std::byte> fp16_vec(dims * 2);
198+
deglib::distances::floats_to_fp16(
199+
reinterpret_cast<const float*>(vec.data()),
200+
reinterpret_cast<uint16_t*>(fp16_vec.data()),
201+
dims
202+
);
203+
vec = std::move(fp16_vec);
204+
}
205+
}
196206
double chunk_load_ms = sisap_common::now_ms() - t_chunk_load;
197207
total_load_ms += chunk_load_ms;
198208

cpp/sisap/task1/mode2.h

Lines changed: 12 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -64,6 +64,17 @@ static int run(const std::filesystem::path& data_path,
6464
std::vector<std::vector<std::byte>> train_vectors = hdf5_reader::read_matrix_bytes(h5path, train_info);
6565
size_t dims = static_cast<size_t>(train_info.num_cols);
6666
size_t count = static_cast<size_t>(train_info.num_rows);
67+
if (train_info.element_size == 4) {
68+
for (auto& vec : train_vectors) {
69+
std::vector<std::byte> fp16_vec(dims * 2);
70+
deglib::distances::floats_to_fp16(
71+
reinterpret_cast<const float*>(vec.data()),
72+
reinterpret_cast<uint16_t*>(fp16_vec.data()),
73+
dims
74+
);
75+
vec = std::move(fp16_vec);
76+
}
77+
}
6778

6879
std::vector<std::vector<int32_t>> gt_data;
6980
if (compute_recall) {
@@ -82,7 +93,7 @@ static int run(const std::filesystem::path& data_path,
8293
// --------------------------------------------------------------------------
8394
double t1 = sisap_common::now_ms();
8495

85-
auto quantized = deglib::quantization::quantize_batch(train_vectors, static_cast<uint32_t>(dims), non_zeros, threads);
96+
auto quantized = deglib::quantization::quantize_batch_fp16(train_vectors, static_cast<uint32_t>(dims), non_zeros, threads);
8697

8798
double quantize_ms = sisap_common::now_ms() - t1;
8899

cpp/sisap/task1/mode3.h

Lines changed: 12 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -120,7 +120,6 @@ static int run(const std::filesystem::path& data_path,
120120
const std::string h5path = data_path.string();
121121
auto datasets = hdf5_reader::scan_datasets(h5path);
122122
auto& train_info = hdf5_reader::find_dataset(datasets, "train");
123-
124123
double t_load = sisap_common::now_ms();
125124
size_t dims = static_cast<size_t>(train_info.num_cols);
126125
size_t count = static_cast<size_t>(train_info.num_rows);
@@ -194,12 +193,23 @@ static int run(const std::filesystem::path& data_path,
194193
double t_chunk_load = sisap_common::now_ms();
195194
std::vector<std::vector<std::byte>> chunk_vectors =
196195
hdf5_reader::read_matrix_bytes(h5path, train_info, start_row, current_chunk_size);
196+
if (train_info.element_size == 4) {
197+
for (auto& vec : chunk_vectors) {
198+
std::vector<std::byte> fp16_vec(dims * 2);
199+
deglib::distances::floats_to_fp16(
200+
reinterpret_cast<const float*>(vec.data()),
201+
reinterpret_cast<uint16_t*>(fp16_vec.data()),
202+
dims
203+
);
204+
vec = std::move(fp16_vec);
205+
}
206+
}
197207
double chunk_load_ms = sisap_common::now_ms() - t_chunk_load;
198208
load_ms += chunk_load_ms;
199209

200210
// Quantize chunk
201211
double t_chunk_quant = sisap_common::now_ms();
202-
auto quantized = deglib::quantization::quantize_batch(chunk_vectors, static_cast<uint32_t>(dims), non_zeros, threads);
212+
auto quantized = deglib::quantization::quantize_batch_fp16(chunk_vectors, static_cast<uint32_t>(dims), non_zeros, threads);
203213
double chunk_quantize_ms = sisap_common::now_ms() - t_chunk_quant;
204214
quantize_ms += chunk_quantize_ms;
205215

cpp/sisap/task1/mode4.h

Lines changed: 13 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -64,6 +64,7 @@ static ExplorationTimings run_exploration(const deglib::search::SearchGraph& gra
6464
const std::vector<std::vector<int32_t>>& gt_data,
6565
const std::string& output_path,
6666
double build_time_s) {
67+
6768
size_t count = graph.size();
6869
deglib::FloatSpace fp16_rerank_space(static_cast<uint32_t>(dims), deglib::Metric::FP16InnerProduct);
6970
std::vector<std::vector<uint32_t>> results(count);
@@ -193,6 +194,17 @@ static int run(const std::filesystem::path& data_path,
193194
std::vector<std::vector<std::byte>> train_vectors = hdf5_reader::read_matrix_bytes(h5path, train_info);
194195
size_t dims = static_cast<size_t>(train_info.num_cols);
195196
size_t count = static_cast<size_t>(train_info.num_rows);
197+
if (train_info.element_size == 4) {
198+
for (auto& vec : train_vectors) {
199+
std::vector<std::byte> fp16_vec(dims * 2);
200+
deglib::distances::floats_to_fp16(
201+
reinterpret_cast<const float*>(vec.data()),
202+
reinterpret_cast<uint16_t*>(fp16_vec.data()),
203+
dims
204+
);
205+
vec = std::move(fp16_vec);
206+
}
207+
}
196208

197209
std::vector<std::vector<int32_t>> gt_data;
198210
if (compute_recall) {
@@ -245,7 +257,7 @@ static int run(const std::filesystem::path& data_path,
245257

246258
// Quantize all upfront once (highly optimized, parallel, zero raw copy)
247259
double t1 = sisap_common::now_ms();
248-
auto quantized = deglib::quantization::quantize_batch(train_vectors, static_cast<uint32_t>(dims), non_zeros, threads);
260+
auto quantized = deglib::quantization::quantize_batch_fp16(train_vectors, static_cast<uint32_t>(dims), non_zeros, threads);
249261
quantize_ms = sisap_common::now_ms() - t1;
250262

251263
graph_ptr = std::make_unique<deglib::graph::SizeBoundedGraph>(static_cast<uint32_t>(count), k_graph, feature_space);

cpp/sisap/task1/mode5.h

Lines changed: 7 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -135,7 +135,13 @@ static int run(const std::filesystem::path& data_path,
135135
double t_load = sisap_common::now_ms();
136136
size_t dims = static_cast<size_t>(train_info.num_cols);
137137
size_t count = static_cast<size_t>(train_info.num_rows);
138-
std::vector<uint16_t> fp16_data = hdf5_reader::read_flat_uint16(h5path, train_info);
138+
std::vector<uint16_t> fp16_data;
139+
if (train_info.element_size == 4) {
140+
std::vector<float> fp32_data = hdf5_reader::read_flat_fp32(h5path, train_info);
141+
fp16_data = deglib::distances::floats_to_fp16(fp32_data);
142+
} else {
143+
fp16_data = hdf5_reader::read_flat_uint16(h5path, train_info);
144+
}
139145

140146
std::vector<std::vector<int32_t>> gt_data;
141147
if (run_recall) {

cpp/sisap/task1/mode6.h

Lines changed: 12 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -136,6 +136,17 @@ static int run(const std::filesystem::path& data_path,
136136
std::vector<std::vector<std::byte>> train_vectors = hdf5_reader::read_matrix_bytes(h5path, train_info);
137137
size_t dims = static_cast<size_t>(train_info.num_cols);
138138
size_t count = static_cast<size_t>(train_info.num_rows);
139+
if (train_info.element_size == 4) {
140+
for (auto& vec : train_vectors) {
141+
std::vector<std::byte> fp16_vec(dims * 2);
142+
deglib::distances::floats_to_fp16(
143+
reinterpret_cast<const float*>(vec.data()),
144+
reinterpret_cast<uint16_t*>(fp16_vec.data()),
145+
dims
146+
);
147+
vec = std::move(fp16_vec);
148+
}
149+
}
139150

140151
std::vector<std::vector<int32_t>> gt_data;
141152
if (compute_recall) {
@@ -188,7 +199,7 @@ static int run(const std::filesystem::path& data_path,
188199

189200
// Quantize all upfront once (highly optimized, parallel, zero raw copy)
190201
double t1 = sisap_common::now_ms();
191-
auto quantized = deglib::quantization::quantize_batch(train_vectors, static_cast<uint32_t>(dims), non_zeros, threads);
202+
auto quantized = deglib::quantization::quantize_batch_fp16(train_vectors, static_cast<uint32_t>(dims), non_zeros, threads);
192203
quantize_ms = sisap_common::now_ms() - t1;
193204

194205
graph_ptr = std::make_unique<deglib::graph::SizeBoundedGraph>(static_cast<uint32_t>(count), k_graph, feature_space);

cpp/sisap/task1/mode7.h

Lines changed: 10 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -195,6 +195,15 @@ static int run(const std::filesystem::path& data_path,
195195
std::vector<std::vector<std::byte>> train_vectors = hdf5_reader::read_matrix_bytes(h5path, train_info);
196196
size_t dims = static_cast<size_t>(train_info.num_cols);
197197
size_t count = static_cast<size_t>(train_info.num_rows);
198+
if (train_info.element_size == 4) {
199+
for (auto& vec : train_vectors) {
200+
const float* fp32_ptr = reinterpret_cast<const float*>(vec.data());
201+
std::vector<std::byte> fp16_vec(dims * 2);
202+
uint16_t* fp16_ptr = reinterpret_cast<uint16_t*>(fp16_vec.data());
203+
deglib::distances::floats_to_fp16(fp32_ptr, fp16_ptr, dims);
204+
vec = std::move(fp16_vec);
205+
}
206+
}
198207

199208
std::vector<std::vector<int32_t>> gt_data;
200209
if (compute_recall) {
@@ -247,7 +256,7 @@ static int run(const std::filesystem::path& data_path,
247256

248257
// Quantize all upfront once (highly optimized, parallel, zero raw copy)
249258
double t1 = sisap_common::now_ms();
250-
auto quantized = deglib::quantization::quantize_batch(train_vectors, static_cast<uint32_t>(dims), non_zeros, threads);
259+
auto quantized = deglib::quantization::quantize_batch_fp16(train_vectors, static_cast<uint32_t>(dims), non_zeros, threads);
251260
quantize_ms = sisap_common::now_ms() - t1;
252261

253262
graph_ptr = std::make_unique<deglib::graph::SizeBoundedGraph>(static_cast<uint32_t>(count), k_graph, feature_space);

0 commit comments

Comments
 (0)