Skip to content

Commit 1cef7a6

Browse files
refactor(arch): round 4 - injectable metrics, generic dispatch, and shallow cleanup
- Create OperationMetrics seam with RAII Scope to replace global static counters in Buffer and CountingAllocator (test isolation, zero-overhead opt-out via nullptr). - Extract generic resolve_best<Func>() SIMD runtime dispatcher so CPU detection logic is no longer coupled to add_arrays. - Document cache-line allocator vs SIMD-width allocator distinction in CONTEXT.md and cross-link both headers. - Delete untestable prefetch_* wrappers from memory_utils.hpp; use __builtin_prefetch directly in example source. - Mark ranges_utils.hpp as intentionally-shallow teaching module. - Archive completed architecture-deepening change. Generated with [Devin](https://cli.devin.ai/docs) Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com>
1 parent 4bbc71b commit 1cef7a6

12 files changed

Lines changed: 455 additions & 219 deletions

File tree

CONTEXT.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,16 @@ ARM 平台:NEON (128-bit)
7777
### preset-driven build / 预设驱动构建
7878
使用 CMakePresets.json 统一构建配置,确保本地和 CI 环境一致。
7979

80+
### cache-line allocator / 缓存行对齐分配器
81+
对齐到缓存行边界(通常 64 字节)的内存分配器,用于消除伪共享。
82+
参见 `hpc::memory::AlignedAllocator<T, Alignment>`,对齐策略为**编译时常量**。
83+
84+
### SIMD-width allocator / SIMD 宽度对齐分配器
85+
对齐到 SIMD 向量宽度边界(16/32/64 字节)的内存分配器,用于 SIMD 加载/存储操作。
86+
参见 `hpc::simd::AlignedAllocator<T>`,对齐策略为**运行时 CPU 特性检测**。
87+
88+
**关键区别**:缓存行分配器服务于多线程并发优化(伪共享消除),SIMD 宽度分配器服务于单线程向量化(对齐加载)。两者独立存在,不共享实现。
89+
8090
### sanitizer / 消毒器
8191
运行时检测工具,发现未定义行为和内存错误:
8292
- **ASAN (AddressSanitizer)**: 检测内存错误(越界、释放后使用等)

examples/02-memory-cache/include/memory_utils.hpp

Lines changed: 9 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,13 @@
22
* @file memory_utils.hpp
33
* @brief Memory and cache optimization utilities
44
*
5-
* This header provides utilities for memory alignment, cache-friendly
6-
* data structures, and performance measurement helpers.
5+
* This header provides utilities for memory alignment and cache-friendly
6+
* data structures.
77
*
88
* Validates:
99
* - Requirement 2.1: AOS vs SOA Comparison
1010
* - Requirement 2.2: False Sharing Demonstration
1111
* - Requirement 2.3: Memory Alignment for SIMD
12-
* - Requirement 2.4: Prefetch Demonstration
1312
*/
1413

1514
#pragma once
@@ -95,7 +94,13 @@ aligned_unique_ptr<T> make_aligned(std::size_t count,
9594
//------------------------------------------------------------------------------
9695

9796
/**
98-
* @brief STL-compatible allocator with custom alignment
97+
* @brief Cache-line aligned allocator for STL containers
98+
*
99+
* Uses a compile-time constant alignment (default: CACHE_LINE_SIZE).
100+
* Designed for false-sharing elimination in multi-threaded code.
101+
*
102+
* See CONTEXT.md: cache-line allocator for the domain rationale.
103+
* For SIMD-width alignment, see hpc::simd::AlignedAllocator in simd_utils.hpp.
99104
*/
100105
template <typename T, std::size_t Alignment = hpc::core::CACHE_LINE_SIZE>
101106
class AlignedAllocator {
@@ -169,58 +174,4 @@ struct alignas(hpc::core::CACHE_LINE_SIZE) CacheLinePadded {
169174
const T* operator->() const { return &value; }
170175
};
171176

172-
//------------------------------------------------------------------------------
173-
// Prefetch Hints
174-
//------------------------------------------------------------------------------
175-
176-
/**
177-
* @brief Prefetch data for reading
178-
*/
179-
template <typename T>
180-
inline void prefetch_read(const T* ptr) {
181-
#if defined(__GNUC__) || defined(__clang__)
182-
__builtin_prefetch(ptr, 0, 3); // Read, high temporal locality
183-
#elif defined(_MSC_VER)
184-
_mm_prefetch(reinterpret_cast<const char*>(ptr), _MM_HINT_T0);
185-
#endif
186-
}
187-
188-
/**
189-
* @brief Prefetch data for writing
190-
*/
191-
template <typename T>
192-
inline void prefetch_write(T* ptr) {
193-
#if defined(__GNUC__) || defined(__clang__)
194-
__builtin_prefetch(ptr, 1, 3); // Write, high temporal locality
195-
#elif defined(_MSC_VER)
196-
_mm_prefetch(reinterpret_cast<const char*>(ptr), _MM_HINT_T0);
197-
#endif
198-
}
199-
200-
/**
201-
* @brief Prefetch with specified locality hint
202-
* @param locality 0 = non-temporal, 3 = high temporal locality
203-
*/
204-
template <typename T>
205-
inline void prefetch(const T* ptr, int locality = 3) {
206-
#if defined(__GNUC__) || defined(__clang__)
207-
__builtin_prefetch(ptr, 0, locality);
208-
#elif defined(_MSC_VER)
209-
switch (locality) {
210-
case 0:
211-
_mm_prefetch(reinterpret_cast<const char*>(ptr), _MM_HINT_NTA);
212-
break;
213-
case 1:
214-
_mm_prefetch(reinterpret_cast<const char*>(ptr), _MM_HINT_T2);
215-
break;
216-
case 2:
217-
_mm_prefetch(reinterpret_cast<const char*>(ptr), _MM_HINT_T1);
218-
break;
219-
default:
220-
_mm_prefetch(reinterpret_cast<const char*>(ptr), _MM_HINT_T0);
221-
break;
222-
}
223-
#endif
224-
}
225-
226177
} // namespace hpc::memory

examples/02-memory-cache/src/prefetch.cpp

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,6 @@
1818
#include <random>
1919
#include <vector>
2020

21-
#include "memory_utils.hpp"
22-
2321
namespace hpc::memory {
2422

2523
//------------------------------------------------------------------------------
@@ -49,9 +47,11 @@ int64_t sum_with_prefetch(const int64_t* data, size_t n) {
4947

5048
int64_t sum = 0;
5149
for (size_t i = 0; i < n; ++i) {
52-
// Prefetch future data
50+
// Prefetch future data (read, high temporal locality)
5351
if (i + PREFETCH_DISTANCE < n) {
54-
prefetch_read(&data[i + PREFETCH_DISTANCE]);
52+
#if defined(__GNUC__) || defined(__clang__)
53+
__builtin_prefetch(&data[i + PREFETCH_DISTANCE], 0, 3);
54+
#endif
5555
}
5656
sum += data[i];
5757
}
@@ -85,7 +85,9 @@ int64_t sum_random_with_prefetch(const int64_t* data, const size_t* indices, siz
8585
for (size_t i = 0; i < n; ++i) {
8686
// Prefetch data for future iterations
8787
if (i + PREFETCH_DISTANCE < n) {
88-
prefetch_read(&data[indices[i + PREFETCH_DISTANCE]]);
88+
#if defined(__GNUC__) || defined(__clang__)
89+
__builtin_prefetch(&data[indices[i + PREFETCH_DISTANCE]], 0, 3);
90+
#endif
8991
}
9092
sum += data[indices[i]];
9193
}
@@ -122,7 +124,9 @@ int64_t sum_list_with_prefetch(const Node* head) {
122124
for (const Node* node = head; node != nullptr; node = node->next) {
123125
// Prefetch next node
124126
if (node->next != nullptr) {
125-
prefetch_read(node->next);
127+
#if defined(__GNUC__) || defined(__clang__)
128+
__builtin_prefetch(node->next, 0, 3);
129+
#endif
126130
}
127131
sum += node->value;
128132
}

examples/03-modern-cpp/include/buffer.hpp

Lines changed: 25 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@
1616
#include <cstring>
1717
#include <utility>
1818

19+
#include "instrumentation.hpp"
20+
1921
namespace hpc::move_semantics {
2022

2123
/**
@@ -29,51 +31,56 @@ namespace hpc::move_semantics {
2931
*/
3032
class Buffer {
3133
public:
32-
Buffer() : data_(nullptr), size_(0) {}
34+
Buffer() : data_(nullptr), size_(0), metrics_(nullptr) {}
3335

34-
explicit Buffer(size_t size) : data_(new char[size]), size_(size) {
36+
explicit Buffer(size_t size, instrumentation::OperationMetrics* metrics = nullptr)
37+
: data_(new char[size]), size_(size), metrics_(metrics) {
3538
std::memset(data_, 0, size_);
3639
}
3740

3841
~Buffer() { delete[] data_; }
3942

40-
Buffer(const Buffer& other) : data_(nullptr), size_(other.size_) {
43+
Buffer(const Buffer& other, instrumentation::OperationMetrics* metrics = nullptr)
44+
: data_(nullptr), size_(other.size_), metrics_(metrics ? metrics : other.metrics_) {
4145
if (size_ > 0) {
4246
data_ = new char[size_];
4347
std::memcpy(data_, other.data_, size_);
4448
}
45-
++copy_count_;
49+
notify_copy();
4650
}
4751

4852
Buffer& operator=(const Buffer& other) {
4953
if (this != &other) {
5054
delete[] data_;
5155
size_ = other.size_;
56+
metrics_ = other.metrics_;
5257
if (size_ > 0) {
5358
data_ = new char[size_];
5459
std::memcpy(data_, other.data_, size_);
5560
} else {
5661
data_ = nullptr;
5762
}
58-
++copy_count_;
63+
notify_copy();
5964
}
6065
return *this;
6166
}
6267

63-
Buffer(Buffer&& other) noexcept : data_(other.data_), size_(other.size_) {
68+
Buffer(Buffer&& other) noexcept
69+
: data_(other.data_), size_(other.size_), metrics_(other.metrics_) {
6470
other.data_ = nullptr;
6571
other.size_ = 0;
66-
++move_count_;
72+
notify_move();
6773
}
6874

6975
Buffer& operator=(Buffer&& other) noexcept {
7076
if (this != &other) {
7177
delete[] data_;
7278
data_ = other.data_;
7379
size_ = other.size_;
80+
metrics_ = other.metrics_;
7481
other.data_ = nullptr;
7582
other.size_ = 0;
76-
++move_count_;
83+
notify_move();
7784
}
7885
return *this;
7986
}
@@ -82,21 +89,20 @@ class Buffer {
8289
char* data() { return data_; }
8390
const char* data() const { return data_; }
8491

85-
static size_t copy_count_;
86-
static size_t move_count_;
87-
88-
static void reset_counts() {
89-
copy_count_ = 0;
90-
move_count_ = 0;
91-
}
92-
9392
private:
9493
char* data_;
9594
size_t size_;
96-
};
95+
instrumentation::OperationMetrics* metrics_;
9796

98-
inline size_t Buffer::copy_count_ = 0;
99-
inline size_t Buffer::move_count_ = 0;
97+
void notify_copy() const {
98+
if (metrics_)
99+
metrics_->record_copy();
100+
}
101+
void notify_move() const {
102+
if (metrics_)
103+
metrics_->record_move();
104+
}
105+
};
100106

101107
//------------------------------------------------------------------------------
102108
// Functions demonstrating copy vs move
Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
/**
2+
* @file instrumentation.hpp
3+
* @brief Operation metrics collection with RAII scope management
4+
*
5+
* Replaces global static counters (e.g., Buffer::copy_count_,
6+
* CountingAllocator::allocation_count_) with an injectable, non-static
7+
* seam. Each test or demo creates a local OperationMetrics instance,
8+
* passes it to the module under observation, and uses the Scope helper
9+
* for automatic lifecycle management.
10+
*
11+
* Key concepts:
12+
* - Injectability: metrics are passed in by pointer instead of living in static class members
13+
* - RAII scope: OperationMetrics::Scope resets on construction, no manual cleanup
14+
* - Near-zero overhead when disabled: nullptr means "do not observe" (one null check per event)
15+
* - No shared global state: each OperationMetrics instance holds its own counters (plain size_t, not synchronized across threads)
16+
*
17+
* @example
18+
* hpc::instrumentation::OperationMetrics metrics;
19+
* hpc::instrumentation::OperationMetrics::Scope scope(metrics);
20+
* Buffer buf(128, &metrics);
21+
* Buffer copy(buf); // metrics.copy_count == 1
22+
*/
23+
24+
#pragma once
25+
26+
#include <cstddef>
27+
#include <cstdint>
28+
29+
namespace hpc::instrumentation {
30+
31+
/**
32+
* @brief Generic operation counter for copy/move/allocation events.
33+
*
34+
* A single seam that replaces the scattered static counters previously
35+
* embedded in Buffer and CountingAllocator. By making the counter
36+
* external and injectable, tests no longer suffer from order-dependent
37+
* state pollution, and benchmarks pay only a null-pointer check when metrics is
38+
* nullptr.
39+
*/
40+
class OperationMetrics {
41+
public:
42+
size_t copy_count = 0;
43+
size_t move_count = 0;
44+
size_t allocation_count = 0;
45+
size_t deallocation_count = 0;
46+
size_t total_bytes_allocated = 0;
47+
size_t total_bytes_deallocated = 0;
48+
49+
/**
50+
* @brief Reset all counters to zero.
51+
*/
52+
void reset() noexcept {
53+
copy_count = 0;
54+
move_count = 0;
55+
allocation_count = 0;
56+
deallocation_count = 0;
57+
total_bytes_allocated = 0;
58+
total_bytes_deallocated = 0;
59+
}
60+
61+
/**
62+
* @brief RAII scope that resets metrics on entry.
63+
*
64+
* Guarantees each test block starts from a clean slate without
65+
* manual reset_counts() calls.
66+
*/
67+
class Scope {
68+
public:
69+
explicit Scope(OperationMetrics& m) : metrics_(m) { metrics_.reset(); }
70+
~Scope() = default;
71+
72+
Scope(const Scope&) = delete;
73+
Scope& operator=(const Scope&) = delete;
74+
Scope(Scope&&) = delete;
75+
Scope& operator=(Scope&&) = delete;
76+
77+
OperationMetrics& metrics() noexcept { return metrics_; }
78+
const OperationMetrics& metrics() const noexcept { return metrics_; }
79+
80+
private:
81+
OperationMetrics& metrics_;
82+
};
83+
84+
// Named event recorders for type safety and locality
85+
void record_copy() noexcept { ++copy_count; }
86+
void record_move() noexcept { ++move_count; }
87+
void record_allocation(size_t bytes) noexcept {
88+
++allocation_count;
89+
total_bytes_allocated += bytes;
90+
}
91+
void record_deallocation(size_t bytes) noexcept {
92+
++deallocation_count;
93+
total_bytes_deallocated += bytes;
94+
}
95+
};
96+
97+
} // namespace hpc::instrumentation

examples/03-modern-cpp/include/ranges_utils.hpp

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
11
/**
22
* @file ranges_utils.hpp
3-
* @brief C++20 Ranges utility functions
3+
* @brief C++20 Ranges comparison utilities (teaching module)
44
*
5-
* This header provides utility functions comparing C++20 ranges
6-
* with traditional raw loops for performance analysis.
5+
* **Note: This is a teaching example module, not production-ready code.**
6+
* Each function is intentionally shallow — its interface exposes the
7+
* implementation strategy (raw_loop / algorithm / ranges). The value
8+
* is in side-by-side comparison, not in abstraction depth.
79
*
8-
* Key concepts:
10+
* Key concepts demonstrated:
911
* - std::ranges algorithms
1012
* - Range views (lazy evaluation)
1113
* - Compiler optimization of ranges

0 commit comments

Comments
 (0)