From b69a9f862a6e23e56e7bc0b6f8319a11e1d473dd Mon Sep 17 00:00:00 2001 From: Charliechen114514 <725610365@qq.com> Date: Sat, 13 Jun 2026 20:28:20 +0800 Subject: [PATCH 1/2] fix: clear the little todos MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 本轮把 14 卷 TODO 里堆积的一批 quick 级缺口一次清掉,共 9 项,并同步英文翻译。 正确性修复: - vol4/05-spaceship-operator「坑1」: 旧结论「只写 <=> 不写 == 会编译错误」已过时 (P1185 后 defaulted <=> 隐式生成 ==,g++ 16.1.1 / clang 22.1.6 双编译器实测)。 改写为真实会踩的坑「默认 == 不会反向生成 <=>」,附双编译器实测输出。 - vol9 once_callback: enum Status 补 : uint8_t(与正文一致)、补 /、 测试计数笔误 11→12(实测源码确为 12 个 TEST_CASE)、裸代码块补语言标记。 内容补充与交叉链接: - vol2: constexpr-ctor frontmatter 补 C++20、第五节标题「(C++20 预告)」→「(C++20)」; udl-basics 补「小结」。 - vol5: ch09 两篇加「概念导览」标注;补三条跨卷链接(ch06→卷四协程、 ch08→卷六性能、ch03→卷八中断安全)。 - vol10: 6 处行内代码路径改为可点的 GitHub blob 链接;汇编系列 3 篇加延伸阅读 (卷六 AVX/AVX2、卷七 编译器选项)。 导航与结构整理: - projects/index 从空壳升级为可用导航页(4 条跨卷/外链 + 近期/规划项目)。 - vol8 嵌入式补 00/01/02 三个目录的 index.md;embedded/index 去掉「规划中」、 纳入四系列入口。 - 卷八删除冗余 06-array-vs-raw-arrays(卷三 01-array 已覆盖);EBO 从 embedded(stm32f1) 迁到 vol4(host),CN/EN 全链路同步。 - vol1 c_tutorials 49 处「// TODO:」统一改为「// 练习:」,消除规划噪音。 治理: - todo/000、todo/031 勾掉已完成的验收项;修正 091 stale 引用。 英文同步: - 用 translate.py(glm-4.6) 同步本轮改动文件的英文镜像,并重译修复 ch07 被污染成广告内容的英文镜像. 
(LLM翻译还是要指日可待啊) --- README.md | 2 +- .../once_callback/once_callback.hpp | 10 +- documents/en/projects/index.md | 39 +- .../12-struct-and-memory-alignment.md | 457 +++++------- .../13-union-enum-bitfield-typedef.md | 459 ++++++------ .../c_tutorials/14-dynamic-memory.md | 265 +++---- .../15-preprocessor-and-multifile.md | 278 ++++---- .../c_tutorials/16-file-io-and-stdlib.md | 259 +++---- .../01-arm-architecture-fundamentals.md | 404 +++++------ .../03-c-traps-and-pitfalls.md | 482 +++++-------- .../05-handmade-dynamic-array.md | 429 ++++++----- .../01-type-safety-and-number-concept.md | 238 +++---- ...rsonal-journey-and-from-assembly-to-cpp.md | 188 ++--- .../02-reading-assembly-and-registers-abi.md | 328 ++++----- .../03-compiler-explorer-and-ai-assisted.md | 337 ++++----- .../06-toolchain-and-project-design.md | 297 ++++---- ...standardization-and-assembly-philosophy.md | 325 ++++----- .../ch02-constexpr/02-constexpr-ctor.md | 417 ++++------- .../01-udl-basics.md | 359 +++------- .../03-empty-base-optimization.md | 127 ++++ .../en/vol4-advanced/05-spaceship-operator.md | 198 +++--- documents/en/vol4-advanced/index.md | 17 +- .../05-atomic-patterns.md | 669 +++++------------- .../01-async-programming-evolution.md | 375 ++++------ .../ch07-actor-channel/index.md | 23 +- .../02-concurrency-benchmarks.md | 532 +++++--------- .../01-from-concurrent-to-distributed.md | 310 +++----- .../02-distributed-primitives.md | 164 ++--- .../embedded/00-env-setup/index.md | 30 + .../en/vol8-domains/embedded/01-led/index.md | 50 ++ .../vol8-domains/embedded/02-button/index.md | 49 ++ .../embedded/04-empty-base-optimization.md | 128 ---- .../embedded/06-array-vs-raw-arrays.md | 94 --- .../embedded/core-embedded-cpp-index.md | 49 +- documents/en/vol8-domains/embedded/index.md | 59 +- .../01-6-once-callback-testing-and-perf.md | 252 +++---- .../chrome/01_once_callback/full/index.md | 44 +- .../hands_on/03-once-callback-testing.md | 273 ++++--- documents/projects/index.md | 29 +- 
.../12-struct-and-memory-alignment.md | 8 +- .../13-union-enum-bitfield-typedef.md | 32 +- .../c_tutorials/14-dynamic-memory.md | 12 +- .../15-preprocessor-and-multifile.md | 12 +- .../c_tutorials/16-file-io-and-stdlib.md | 6 +- .../01-arm-architecture-fundamentals.md | 8 +- .../03-c-traps-and-pitfalls.md | 12 +- .../05-handmade-dynamic-array.md | 8 +- .../01-type-safety-and-number-concept.md | 18 +- ...rsonal-journey-and-from-assembly-to-cpp.md | 2 +- .../02-reading-assembly-and-registers-abi.md | 5 + .../03-compiler-explorer-and-ai-assisted.md | 5 + .../06-toolchain-and-project-design.md | 6 + ...standardization-and-assembly-philosophy.md | 10 +- .../ch02-constexpr/02-constexpr-ctor.md | 4 +- .../01-udl-basics.md | 11 + .../03-empty-base-optimization.md} | 13 +- .../vol4-advanced/05-spaceship-operator.md | 54 +- documents/vol4-advanced/index.md | 1 + .../05-atomic-patterns.md | 2 + .../01-async-programming-evolution.md | 2 + .../02-concurrency-benchmarks.md | 2 + .../01-from-concurrent-to-distributed.md | 2 + .../02-distributed-primitives.md | 2 + documents/vol5-concurrency/index.md | 2 +- .../embedded/00-env-setup/index.md | 24 + .../vol8-domains/embedded/01-led/index.md | 44 ++ .../vol8-domains/embedded/02-button/index.md | 43 ++ .../embedded/06-array-vs-raw-arrays.md | 88 --- .../embedded/core-embedded-cpp-index.md | 3 +- documents/vol8-domains/embedded/index.md | 23 +- .../01-6-once-callback-testing-and-perf.md | 2 +- .../chrome/01_once_callback/full/index.md | 2 +- .../hands_on/03-once-callback-testing.md | 2 +- todo/000-project-roadmap.md | 8 +- todo/031-qa-knowledge-base.md | 6 +- 75 files changed, 4056 insertions(+), 5473 deletions(-) create mode 100644 documents/en/vol4-advanced/03-empty-base-optimization.md create mode 100644 documents/en/vol8-domains/embedded/00-env-setup/index.md create mode 100644 documents/en/vol8-domains/embedded/01-led/index.md create mode 100644 documents/en/vol8-domains/embedded/02-button/index.md delete mode 100644 
documents/en/vol8-domains/embedded/04-empty-base-optimization.md delete mode 100644 documents/en/vol8-domains/embedded/06-array-vs-raw-arrays.md rename documents/{vol8-domains/embedded/04-empty-base-optimization.md => vol4-advanced/03-empty-base-optimization.md} (97%) create mode 100644 documents/vol8-domains/embedded/00-env-setup/index.md create mode 100644 documents/vol8-domains/embedded/01-led/index.md create mode 100644 documents/vol8-domains/embedded/02-button/index.md delete mode 100644 documents/vol8-domains/embedded/06-array-vs-raw-arrays.md diff --git a/README.md b/README.md index 2c2f476ab..406c1b4e6 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,7 @@ --- -![English Coverage](https://img.shields.io/badge/en_coverage-99%25-green.svg) 428/431 docs translated +![English Coverage](https://img.shields.io/badge/en_coverage-99%25-green.svg) 430/433 docs translated ## 这是什么项目 diff --git a/code/volumn_codes/vol9/chrome_design/once_callback/once_callback.hpp b/code/volumn_codes/vol9/chrome_design/once_callback/once_callback.hpp index c5449c333..3a642a172 100644 --- a/code/volumn_codes/vol9/chrome_design/once_callback/once_callback.hpp +++ b/code/volumn_codes/vol9/chrome_design/once_callback/once_callback.hpp @@ -11,7 +11,9 @@ #pragma once #include "cancel_token/cancel_token.hpp" +#include #include +#include namespace tamcpp::chrome { @@ -38,7 +40,7 @@ class OnceCallback // Specialization of Functional like using FuncSig = ReturnType(FuncArgs...); private: - enum class Status { + enum class Status : uint8_t { kEmpty, // Null when construction with no lambda or func specified kValid, // validate for usage kConsumed // Has been callbacked @@ -97,8 +99,10 @@ class OnceCallback // Specialization of Functional like * @return false */ [[nodiscard]] bool is_cancelled() const noexcept { - if (status_ != Status::kValid) return true; - if (token_ && !token_->is_valid()) return true; + if (status_ != Status::kValid) + return true; + if (token_ && !token_->is_valid()) + 
return true; return false; } [[nodiscard]] bool maybe_valid() const noexcept { return !is_cancelled(); } diff --git a/documents/en/projects/index.md b/documents/en/projects/index.md index d18adbd6e..713e1c9ae 100644 --- a/documents/en/projects/index.md +++ b/documents/en/projects/index.md @@ -1,7 +1,7 @@ --- -title: End-to-End Hands-On Project -description: Handwritten STL, HTTP server, GUI framework, embedded OS, and other comprehensive - projects +title: Comprehensive Hands-on Project +description: Synthesize scattered knowledge from various volumes into complete projects—from + coroutine servers, mini runtimes, to industrial-grade component analysis. platform: host tags: - cpp-modern @@ -9,26 +9,31 @@ tags: - intermediate translation: source: documents/projects/index.md - source_hash: 3a6ccce746db47489eb825a9ddde30ec4ba65e96ff42697b0adc3c93c6be84df - translated_at: '2026-05-26T10:20:16.450396+00:00' + source_hash: 42eb53b42495d01aa1fb318f2084582052831d6c383d0cf22598181cb3c8fd75 + translated_at: '2026-06-13T11:40:38.476774+00:00' engine: anthropic - token_count: 106 + token_count: 324 --- # Comprehensive Hands-on Projects -> Status: Planned +> This section is not merely a pile of new concepts, but an effort to weave together the fragments learned across various volumes—concurrency, coroutines, templates, memory management—into a complete project that runs, tests, and can be delivered. Below, we first list projects that have already been implemented in other volumes and are ready for you to continue, followed by long-term goals still in the planning phase. -## Overview +## Projects with a Foundation -This section features comprehensive hands-on projects that span the entire tutorial: +These projects already have tutorials or runnable skeletons in other volumes, providing a path for you to dive deeper: -1. **Hand-rolled STL Components**: vector, string, unique_ptr, optional, function, variant -2. **Mini HTTP Server**: from TCP to asynchronous coroutines -3. 
**Mini GUI Framework**: event loop, widget system, layout engine, rendering backend -4. **Embedded Operating System**: scheduler, synchronization primitives, memory management, driver framework -5. **INI Parser** (basic version available) -6. **Coroutine Echo Server** (basic version available) +- **Coroutine Echo Server**: In [Volume 5: Coroutine Echo Server](../vol5-concurrency/ch06-async-io-coroutine/05-coroutine-echo-server.md), we built a fully functional echo service from `io_uring` to data transmission. This is the most practical project for understanding coroutine scheduling. +- **Mini Concurrent Runtime (Capstone)**: [Volume 5: Mini Runtime Capstone](../vol5-concurrency/exercises/06-capstone-mini-runtime.md) combines thread pools, timers, and task queues into a minimal scheduler, serving as a ready-made starting point for the "Mini Concurrent Runtime". +- **OnceCallback Component Study**: [Volume 9: OnceCallback](../vol9-open-source-project-learn/chrome/01_once_callback/index.md) dissects Chromium's callback mechanism across 16 articles, serving as a paradigm for transitioning from reading source code to designing industrial-grade components yourself. +- **INI Parser**: As the first complete project for C++ engineering, this is located in a separate repository [Tutorial_cpp_SimpleIniParser](https://github.com/Awesome-Embedded-Learning-Studio/Tutorial_cpp_SimpleIniParser)—covering lexical analysis to error handling, it is perfect for following along from start to finish. -## Project Navigation +## Planned Projects -> Content is being written, stay tuned. +These have not yet started and are long-term goals, sorted by "readiness of materials": + +- **Hand-rolled STL Components**: Implement `vector`, `string`, `unique_ptr`, `optional`, `function`, and `variant` from scratch, complementing the source code reading in Volume 3 (Standard Library). 
+- **Mini HTTP Server**: From TCP sockets to coroutine-based asynchrony, building upon Volume 5 (Concurrency) and Volume 8 (Network Programming). +- **Mini GUI Framework**: Event loops, widget systems, layout engines, and rendering backends. +- **Embedded Mini OS**: Scheduler, synchronization primitives, memory management, and driver framework, extending the main thread of Volume 8 (Embedded Systems). + +> These projects will not be completed overnight; they will be launched gradually as the content in their respective volumes is finalized. If you have a project you would like to see, feel free to propose it in the Discussions. diff --git a/documents/en/vol1-fundamentals/c_tutorials/12-struct-and-memory-alignment.md b/documents/en/vol1-fundamentals/c_tutorials/12-struct-and-memory-alignment.md index c088ba37b..166c82d8d 100644 --- a/documents/en/vol1-fundamentals/c_tutorials/12-struct-and-memory-alignment.md +++ b/documents/en/vol1-fundamentals/c_tutorials/12-struct-and-memory-alignment.md @@ -3,7 +3,7 @@ chapter: 1 cpp_standard: - 11 description: Master struct definitions, memory alignment and padding rules, flexible - array members, and `offsetof` validation + array members, and `offsetof` validation. 
difficulty: beginner order: 16 platform: host @@ -19,490 +19,357 @@ tags: title: Structs and Memory Alignment translation: source: documents/vol1-fundamentals/c_tutorials/12-struct-and-memory-alignment.md - source_hash: b0fa66698cfdb3a2581c038aaba68ef7299ab625d4ccbe410a6e5587ad0f7432 - translated_at: '2026-05-26T10:33:01.215031+00:00' + source_hash: 1da76ff6fd68afc58f2076c9eb1028dc79bb5512d8d1c5ebdc53e6d00facaedb + translated_at: '2026-06-13T11:41:51.648816+00:00' engine: anthropic - token_count: 3333 + token_count: 3331 --- # Structs and Memory Alignment -If you have been writing C up to this point and have only used basic types—`int`, `float`, `char`, and the like—it is probably because you have not yet encountered a scenario where you need to bundle a group of related data together for passing around. Once you start writing more substantial programs, such as a sensor data packet, a configuration table, or a communication protocol frame, you will find that loose variables are simply unmanageable. The struct is C's answer to this: it lets us combine data of different types into a single whole, which we can then pass, store, and operate on as one value. +If you have been writing C code until now using only basic types—like `int`, `float`, `char`—it is likely because you haven't encountered a scenario where you need to pass a group of related data together. Once you start writing slightly more sophisticated programs, such as a sensor data packet, a configuration table, or a communication protocol frame, you will find that relying on scattered variables is impossible to manage. The struct is the answer C provides: it allows us to knead different types of data into a whole, which can then be passed, stored, and manipulated as a single value. -But structs are far more than just "bundling data." The moment we place a struct in memory, the compiler does something behind the scenes that you might never have considered—memory alignment. 
It silently slips padding bytes between your fields so that each field lands on an address the processor "likes." If you are unaware of this, there will come a day when you are designing a binary protocol frame, doing a DMA (Direct Memory Access) transfer, or hand-writing serialization code, and those phantom bytes will make you question your sanity. +But structs are far more than just "bundling data." The moment we put a struct into memory, the compiler does something behind the scenes that you might never have thought of—memory alignment. It secretly inserts padding bytes between your fields so that each field lands on an address the processor "likes." If you are unaware of this, one day when designing binary protocol frames, doing DMA transfers, or writing serialization code by hand, you will likely be driven to the brink of madness by these ghost bytes. -So in this chapter, we will not only learn how to define and use structs, but also thoroughly understand what a struct truly looks like in memory. +So, in this chapter, we will not only learn how to define and use structs but also thoroughly understand their true appearance in memory. > **Learning Objectives** > > After completing this chapter, you will be able to: > -> - [ ] Proficiently define, initialize, and operate on structs and their pointers -> - [ ] Understand the principles of memory alignment and the distribution rules for padding bytes -> - [ ] Use `_Alignas`, `alignof`, and `offsetof` for alignment control and verification -> - [ ] Master the use of designated initializers and flexible array members -> - [ ] Understand the evolutionary relationship from C structs to C++ classes +> - [ ] Proficiently define, initialize, and operate on structs and their pointers. +> - [ ] Understand the principles of memory alignment and the distribution rules of padding bytes. +> - [ ] Use `alignas`, `alignof`, and `offsetof` for alignment control and verification. 
+> - [ ] Master the use of designated initializers and flexible array members. +> - [ ] Understand the evolutionary relationship from C structs to C++ classes. ## Environment Setup -All of our following experiments will be conducted in this environment: +We will conduct all subsequent experiments in the following environment: -- Platform: Linux x86\_64 (WSL2 is also fine) +- Platform: Linux x86_64 (WSL2 is acceptable) - Compiler: GCC 13+ or Clang 17+ -- Compiler flags: `-Wall -Wextra -std=c17` +- Compiler flags: `-std=c2x -Wall -Wextra` -## Step 1 — Mastering Struct Definition and Basic Operations +## Step 1 — Master Struct Definition and Basic Operations ### Defining a Struct -In C, we define a struct using the `struct` keyword followed by a pair of curly braces: +In C, we define a struct using the `struct` keyword followed by a pair of braces: ```c -struct SensorReading { - uint32_t timestamp; - float temperature; - float humidity; - uint8_t status; +struct SensorData { + int id; + float value; + char status; }; ``` -Note the semicolon at the end—forgetting it is one of the most common compilation errors for beginners, and the error message usually points to the next line, leaving you baffled. `struct SensorReading` is now a type name, but writing `struct SensorReading` every time is indeed a bit verbose, so we usually pair it with `typedef` to simplify: +Note that semicolon at the end—forgetting it is one of the most common compilation errors for beginners, and the error message usually points to the next line, leaving you confused. 
`struct SensorData` is now a type name, but writing `struct SensorData` every time is indeed a bit verbose, so we usually pair it with `typedef` to simplify: ```c -typedef struct { - uint32_t timestamp; - float temperature; - float humidity; - uint8_t status; -} SensorReading; +typedef struct SensorData { + int id; + float value; + char status; +} SensorData; ``` -This way we can write `SensorReading reading;` directly to declare variables, which is much cleaner. The two approaches are functionally equivalent; the only difference is how the type name is used: the former requires the `struct` prefix, while the latter does not. In real-world projects, the `typedef` approach is far more common, especially in embedded development—if you look at any MCU (Microcontroller Unit) vendor's SDK, it is full of `typedef struct`. +Now we can write `SensorData` directly to declare variables, which is much cleaner. The two styles are functionally equivalent; the difference lies only in the usage of the type name: the former requires the `struct` prefix, while the latter does not. In actual projects, the `typedef` usage is more prevalent, especially in embedded development—look at any MCU vendor's SDK, and you will see `typedef struct` everywhere. ### Initialization and Assignment -There are several ways to initialize a struct, and we will start with the most basic. The first is sequential initialization—providing values in the order the fields are defined: +There are several ways to initialize a struct. Let's start with the most basic. The first is sequential initialization—providing values in the order the fields are defined: ```c -SensorReading r1 = {1700000000, 23.5f, 60.0f, 1}; +struct SensorData sensor = {1, 25.4f, 'K'}; ``` -This works, but its readability is not great—you have to remember which position corresponds to which field, and if the struct definition changes the order, all initialization code must be updated accordingly. 
C99 gave us a better solution: the **designated initializer**, which allows you to initialize any field by name: +This approach works, but readability is poor—you must remember which position corresponds to which field. Once the struct definition order is adjusted, all initialization code must be modified. C99 offers a better solution: **designated initializers**, which allow you to initialize arbitrary fields by name: ```c -SensorReading r2 = { - .timestamp = 1700000000, - .temperature = 23.5f, - .humidity = 60.0f, - .status = 1 -}; - -// 不需要按定义顺序,也可以只初始化部分字段 -SensorReading r3 = { - .humidity = 45.0f, - .status = 0 - // timestamp 和 temperature 自动初始化为 0 +struct SensorData sensor = { + .id = 1, + .value = 25.4f, + .status = 'K' }; ``` -The benefits of designated initializers are obvious: the code is self-documenting, it does not depend on field order, and unspecified fields are automatically zeroed out. Frankly, in modern C code, as long as your compiler supports C99 (which basically all of them do), you should prefer designated initializers. +The benefits of designated initializers are obvious: the code is self-documenting, independent of field order, and unspecified fields are automatically zeroed. Honestly, in modern C code, as long as your compiler supports C99 (which basically all do), you should prefer designated initializers. -Struct assignment and initialization are two different things. Initialization happens at declaration, while assignment happens after declaration. C allows direct assignment between structs of the same type, which performs a byte-by-byte copy: +Struct assignment and initialization are two different things. Initialization happens at declaration; assignment happens after declaration. 
C allows direct assignment between structs of the same type, which is a byte-by-byte copy: ```c -SensorReading r4; -r4 = r2; // 把 r2 的所有字段复制到 r4 +struct SensorData sensor1 = {1, 25.4f, 'K'}; +struct SensorData sensor2; +sensor2 = sensor1; // Shallow copy ``` -But be careful: struct assignment in C is a **shallow copy**—if the struct contains pointer members, the pointer fields of both structs will point to the same block of memory after assignment. This is a classic pitfall when dealing with structs that contain dynamically allocated memory. +But be aware: struct assignment in C is a **shallow copy**—if a struct contains pointer members, after assignment, the pointer fields in both structs will point to the same memory block. This is a classic pitfall when handling structs containing dynamically allocated memory. ### Struct Pointers and the Arrow Operator -When a struct is large, or when we need to modify the caller's struct inside a function, passing a pointer is the only reasonable approach. This is where the difference between `.` and `->` comes in: +When a struct is large, or we need to modify the caller's struct within a function, passing a pointer is the only reasonable approach. This is where the difference between `.` and `->` comes in: ```c -SensorReading reading = { - .timestamp = 1700000000, - .temperature = 25.0f, - .humidity = 50.0f, - .status = 1 -}; - -// 通过变量名直接访问——用点号 -reading.temperature = 26.0f; +struct SensorData sensor; +struct SensorData *ptr = &sensor; -// 通过指针访问——用箭头 -SensorReading* ptr = &reading; -ptr->humidity = 55.0f; -// 等价于 (*ptr).humidity = 55.0f +sensor.id = 1; // Direct member access +ptr->id = 2; // Member access via pointer +(*ptr).id = 3; // Equivalent to ptr->id ``` -The `->` operator is simply syntactic sugar for `(*ptr).`, nothing mysterious. 
But this syntactic sugar is so commonly used that you would almost never write `(*ptr).`—in C, as long as a function parameter includes a struct pointer, you are almost certainly using `->`. +The `->` operator is just syntactic sugar for `(*ptr).`, nothing mysterious. But this sugar is so commonly used that you will hardly ever write `(*ptr).`—in C, as long as a function parameter involves a struct pointer, you are almost certainly using `->`. -Passing a struct pointer rather than the struct itself as a function parameter not only avoids expensive copy overhead but also allows the function to modify the caller's data. If you do not want the function to modify the data, just add `const`: +Passing a struct pointer instead of the struct itself in function parameters not only avoids expensive copy overhead but also allows the function to modify the caller's data. If you do not want the function to modify the data, just add `const`: ```c -/// @brief 打印传感器读数(只读访问) -void print_reading(const SensorReading* r) { - printf("T=%.1fC H=%.1f%% status=%u\n", - r->temperature, r->humidity, r->status); -} - -/// @brief 更新传感器状态(可修改) -void update_status(SensorReading* r, uint8_t new_status) { - r->status = new_status; +void print_sensor(const struct SensorData *s) { + printf("ID: %d\n", s->id); } ``` -This distinction between `const SensorReading*` and `SensorReading*` is inherited in C++ into `const` member functions and reference semantics, forming a more complete "read-only vs. mutable" interface design. +This distinction between `T*` and `const T*` is inherited in C++ as `const` member functions and reference semantics, forming a more complete "read-only vs. mutable" interface design. ## Step 2 — Understanding Memory Alignment and Padding Bytes -Next, we are entering the most core and potentially most confusing part of this tutorial. Let us start with a question: how many bytes does the following struct occupy? +Next, we enter the core and most confusing part of this tutorial. 
Let's look at a question first: how many bytes does the following struct occupy? ```c -typedef struct { - uint8_t a; // 1 字节 - uint32_t b; // 4 字节 - uint8_t c; // 1 字节 -} WeirdLayout; +struct BadLayout { + char a; // 1 byte + int b; // 4 bytes + char c; // 1 byte +}; ``` -Intuitively, 1 + 4 + 1 = 6 bytes, right? But in reality, on most 32-bit and 64-bit platforms, `sizeof(WeirdLayout)` is **12 bytes**. Where did the extra 6 bytes go? The answer is that the compiler inserted them into the struct as **padding bytes**. +Intuitively, 1 + 4 + 1 = 6 bytes, right? But actually, on most 32-bit and 64-bit platforms, `sizeof(struct BadLayout)` is **12 bytes**. Where did the extra 6 bytes go? The answer is they were inserted into the struct by the compiler as **padding bytes**. -### Why Alignment Is Needed +### Why Alignment is Needed -When a processor accesses memory, it does not read one byte at a time. Most CPU architectures prefer to access data along 2-, 4-, or 8-byte boundaries—this is what we call **alignment**. An `uint32_t` placed at an address that is a multiple of 4 can be read in a single operation; but if it straddles a 4-byte boundary (for example, placed at address 3), the CPU might need to read twice and stitch the results together, which incurs a performance penalty. Some architectures are even more extreme—they will throw a hardware exception directly (for instance, ARM triggers a fault when accessing unaligned addresses in certain modes). +When a processor accesses memory, it does not read byte by byte. Most CPU architectures prefer to access data on 2, 4, or 8-byte boundaries—this is called **alignment**. An `int` placed at an address that is a multiple of 4 can be read in one go; but if it straddles a 4-byte boundary (e.g., placed at address 3), the CPU might need to read twice and stitch it together, resulting in a performance hit. 
Some architectures are even more extreme—throwing a hardware exception directly (for example, ARM accessing unaligned addresses in certain modes triggers a fault). -So, for the sake of performance and correctness, the compiler inserts padding bytes between struct members to ensure that each member lands on its naturally aligned address. +So, for performance and correctness, the compiler inserts padding bytes between struct members to ensure each member lands on its naturally aligned address. -### Rules for Alignment and Padding +### Rules of Alignment and Padding -There are essentially only two alignment rules, but understanding them requires a bit of patience. The first rule: **the starting address of each member must be an integer multiple of that member's alignment requirement**. The alignment requirement for `uint8_t` is 1 (any address works), `uint16_t` is 2, `uint32_t` is 4, `double` and `uint64_t` are 8, and so on—the alignment requirement of a basic type usually equals its size. The second rule: **the total size of the struct itself must be an integer multiple of its largest alignment requirement**—this ensures that in an array of structs, every element satisfies the alignment requirement. +There are actually only two rules for alignment, but understanding them requires a bit of patience. Rule one: **The starting address of each member must be an integer multiple of that member's alignment requirement**. `char` has an alignment requirement of 1 (any address works), `short` is 2, `int` is 4, `double` and `long long` are 8, and so on—the alignment requirement of basic types usually equals their size. Rule two: **The size of the struct itself must be an integer multiple of its largest alignment requirement**—this is to ensure that in an array of structs, every element satisfies the alignment requirement. 
-Now let us return to the `WeirdLayout` example and map it out byte by byte: +Now let's return to the `struct BadLayout` example and draw it out byte by byte: ```text -偏移 0 1 2 3 4 5 6 7 8 9 10 11 - [a ][pad pad pad][b ][c ][pad pad pad] - ^ ^ ^ - | | b: 偏移 4(4 的倍数,满足) - | 填充 3 字节让 b 对齐到 4 - a: 偏移 0(1 的倍数,满足) +Address 0 1 2 3 4 5 6 7 8 9 10 11 + +---+---+---+---+---+---+---+---+---+---+---+---+ + | a | X | X | X | b | b | c | X | X | X | + +---+---+---+---+---+---+---+---+---+---+---+---+ ``` -`a` is at offset 0, occupying 1 byte. The alignment requirement of `b` is 4, but the next available offset is 1, which is not a multiple of 4, so the compiler inserts 3 bytes of padding, letting `b` start at offset 4. `c` is at offset 8, with an alignment requirement of 1, so that is fine. Finally, the struct's maximum alignment requirement is 4 (from `uint32_t b`), so the total size must be a multiple of 4—the current size is 9, so it is padded to 12. +`a` is at offset 0, occupying 1 byte. `b` has an alignment requirement of 4, but the next available offset is 1, which is not a multiple of 4, so the compiler inserts 3 bytes of padding, letting `b` start at offset 4. `c` is at offset 8, alignment requirement 1, no problem. Finally, the struct's maximum alignment requirement is 4 (from `int`), so the total size must be a multiple of 4—currently 9, so it is padded to 12. -This is why data that is only 6 bytes actually occupies 12 bytes—50% of the space is wasted on padding. +This is why明明 only 6 bytes of data actually occupy 12 bytes—50% of the space is wasted on padding. ### Reordering Fields to Reduce Padding -The solution to this problem is surprisingly simple: **place fields with larger alignment requirements first, and smaller ones last**. Let us reorder the fields of `WeirdLayout`: +The solution to this problem is surprisingly simple: **put fields with larger alignment requirements first, and smaller ones last**. 
Let's rearrange the fields of `struct BadLayout`: ```c -typedef struct { - uint32_t b; // 4 字节,偏移 0 - uint8_t a; // 1 字节,偏移 4 - uint8_t c; // 1 字节,偏移 5 - // 填充 2 字节(偏移 6-7),使总大小为 4 的倍数 -} BetterLayout; +struct GoodLayout { + int b; // 4 bytes + char a; // 1 byte + char c; // 1 byte +}; ``` -Now `sizeof(BetterLayout)` is **8 bytes**—saving one-third compared to the previous 12. `b` is at offset 0 (naturally aligned), `a` and `c` are packed tightly right after it, and only 2 bytes of trailing padding are needed at the end. This technique is extremely useful in real-world engineering, especially on memory-constrained embedded systems—building the habit of ordering fields from largest to smallest alignment requirement is well worth it. +Now `sizeof(struct GoodLayout)` is **8 bytes**—saving one-third compared to the previous 12. `b` is at offset 0 (naturally aligned), `a` and `c` are packed tightly after it, requiring only 2 bytes of tail padding. This technique is very useful in actual engineering, especially in memory-constrained embedded systems—developing the habit of ordering fields from largest to smallest alignment requirement is worth it. ### Verifying Offsets with offsetof -The C standard library provides the `offsetof` macro (defined in `<stddef.h>`), which can precisely tell you the offset of a specific field within a struct. We frequently use it when debugging alignment issues or designing binary protocols: +The C standard library provides the `offsetof` macro (defined in `<stddef.h>`), which can tell you precisely the offset of a field within a struct. 
We often use it when debugging alignment issues or designing binary protocols: ```c #include <stddef.h> #include <stdio.h> -printf("offset of a: %zu\n", offsetof(WeirdLayout, a)); // 0 -printf("offset of b: %zu\n", offsetof(WeirdLayout, b)); // 4 -printf("offset of c: %zu\n", offsetof(WeirdLayout, c)); // 8 -printf("total size: %zu\n", sizeof(WeirdLayout)); // 12 +printf("Offset of a: %zu\n", offsetof(struct GoodLayout, a)); +printf("Offset of b: %zu\n", offsetof(struct GoodLayout, b)); ``` -Make a habit of printing offsets with `offsetof` right after defining a struct, especially when designing communication protocol frames—you will find that some fields' offsets differ from what you expected, and this usually indicates an alignment issue. +Make it a habit to print offsets with `offsetof` after writing a struct, especially when designing communication protocol frames—you will find that some fields' offsets are different from what you expected, which usually means an alignment problem. -## C11 Alignment Control: _Alignas and alignof +## C11 Alignment Control: `_Alignas` and `alignof` -In the C99 era, if you needed to manually control alignment, you could only rely on compiler extensions—GCC's `__attribute__((aligned(n)))`, MSVC's `__declspec(align(n))`, and the like. C11 finally standardized this capability, providing the `_Alignas` and `_Alignof` keywords, as well as the more friendly macro aliases `alignas` and `alignof` (defined in ``). +In the C99 era, if you needed manual alignment control, you had to rely on compiler extensions—GCC's `__attribute__ ((aligned))`, MSVC's `__declspec(align(...))`, etc. C11 finally standardized this capability, providing the `_Alignas` and `_Alignof` keywords, as well as the more friendly macro aliases `alignas` and `alignof` (defined in `<stdalign.h>`). 
-### alignof: Querying Alignment Requirements +### `alignof`: Querying Alignment Requirements `alignof` can query the alignment requirement of any type: ```c #include -#include -printf("alignof(uint8_t) = %zu\n", alignof(uint8_t)); // 1 -printf("alignof(uint32_t) = %zu\n", alignof(uint32_t)); // 4 -printf("alignof(double) = %zu\n", alignof(double)); // 通常 8 -printf("alignof(WeirdLayout) = %zu\n", alignof(WeirdLayout)); // 4 +printf("Alignment of int: %zu\n", alignof(int)); // Usually 4 +printf("Alignment of double: %zu\n", alignof(double)); // Usually 8 ``` -A struct's alignment requirement equals the largest alignment requirement among its members. `WeirdLayout` contains `uint32_t`, so the overall alignment requirement is 4. +A struct's alignment requirement equals the largest alignment requirement among its members. `struct GoodLayout` has an `int`, so the overall alignment requirement is 4. -### alignas: Forcing Alignment +### `alignas`: Forcing Alignment -`alignas` can be used to force a variable or struct member to be allocated on a specified alignment boundary. This is very useful in embedded development—for example, DMA (Direct Memory Access) transfers typically require the buffer's starting address to be 4-byte or even 32-byte aligned: +`alignas` can be used to force a variable or struct member to be allocated on a specified alignment boundary. This is very useful in embedded development—for example, DMA transfers often require the buffer start address to be 4-byte or even 32-byte aligned: ```c -#include - -// 强制 DMA 缓冲区 32 字节对齐 -alignas(32) uint8_t dma_buffer[256]; - -// 在结构体中强制某个字段的对齐 -typedef struct { - uint8_t header; - alignas(4) uint32_t payload; // 即使前面有 header,也保证 payload 4 字节对齐 -} AlignedFrame; +alignas(16) char dma_buffer[256]; ``` -The argument to `alignas` must be a power of two and cannot be less than the type's natural alignment requirement. 
If you write `alignas(2)` for an `uint32_t`, the compiler will ignore it or report an error—because `uint32_t` itself requires 4-byte alignment, and you cannot reduce it to 2. +The parameter to `alignas` must be a power of two and cannot be less than the type's natural alignment requirement. If you write `alignas(2)` for an `int`, the compiler will ignore it or error—because `int` itself requires 4-byte alignment, you can't reduce it to 2. -## A Closer Look at Designated Initializers +## Designated Initializers in Detail -We briefly mentioned designated initializers earlier; now let us take a deeper look at their full capabilities. Designated initializers are a feature introduced in C99 that allows you to use the `.成员名 = 值` syntax to specify which fields to initialize when initializing structs, unions, and arrays. +We briefly mentioned designated initializers earlier; let's take a deeper look at their full capabilities. Designated initializers are a feature introduced in C99 that allow you to specify which fields to initialize using the `.field_name = value` syntax when initializing structs, unions, and arrays. -In addition to the basic usage shown earlier, there are some noteworthy details. For example, you can mix sequential initialization with designated initializers: +Beyond the basic usage shown earlier, there are some details worth noting. 
For example, you can mix sequential initialization and designated initializers: ```c -typedef struct { - uint16_t x; - uint16_t y; - uint16_t z; - uint16_t flags; -} Point3D; - -Point3D p1 = { - 10, 20, // x=10, y=20(顺序初始化) - .flags = 0xFF // 指定初始化 flags - // z 自动为 0 -}; +struct SensorData s = { .id = 1, .value = 20.0f, .status = 'X' }; ``` You can also use designated initializers in arrays: ```c -// 稀疏初始化——只初始化需要的下标 -uint8_t lookup[256] = { - ['A'] = 1, - ['B'] = 2, - ['C'] = 3, - // 其余全部为 0 +int mapping[256] = { + [0] = 1, + ['A'] = 2, + ['Z'] = 26 }; ``` -This approach is particularly convenient when creating ASCII character mapping tables or command dispatch tables—it is much clearer than hand-writing an initialization list of 256 elements. Unspecified elements are automatically initialized to zero (just like global variables). +This is particularly handy when creating ASCII character mapping tables or command dispatch tables, much clearer than hand-writing an initialization list of 256 elements. Unspecified elements are automatically initialized to zero (just like global variables). ## Step 3 — Understanding Flexible Array Members -A flexible array member (FAM) is a feature introduced in C99 that allows an array of unspecified size to be placed at the end of a struct. It sounds a bit odd, but its use cases are highly practical—when you need a struct with "variable-length trailing data," FAM is the cleanest approach. +Flexible Array Members (FAM) are a feature introduced in C99 that allows placing an array of unspecified size at the end of a struct. It sounds a bit strange, but its purpose is very practical—when you need a struct with a "variable-length tail of data," FAM is the cleanest way to do it. 
```c -typedef struct { - uint16_t length; - uint8_t type; - uint8_t data[]; // 柔性数组成员,不占结构体大小 -} Packet; +struct Packet { + int header; + int len; + char data[]; // Flexible array member +}; ``` -`data[]` is an incomplete array type—it does not occupy space within the struct (`sizeof(Packet)` does not include the size of `data`), but it tells the compiler "this struct might be followed by a contiguous block of memory." When using it, we need to manually allocate enough memory to hold the struct itself plus the data: +`data` is an incomplete type array—it occupies no space in the struct (`sizeof(struct Packet)` does not include the size of `data`), but it tells the compiler "this struct may be followed by a contiguous block of memory." When using it, we need to manually allocate enough memory to hold the struct itself plus the data: ```c -#include -#include - -/// @brief 创建一个指定长度的数据包 -Packet* create_packet(uint8_t type, const uint8_t* payload, uint16_t len) { - // 分配:结构体大小 + 数据长度 - Packet* pkt = malloc(sizeof(Packet) + len); - if (pkt == NULL) { - return NULL; - } - pkt->type = type; - pkt->length = len; - memcpy(pkt->data, payload, len); - return pkt; -} - -// 使用 -uint8_t payload[] = {0x01, 0x02, 0x03}; -Packet* pkt = create_packet(0x42, payload, sizeof(payload)); -// 访问 pkt->data[0], pkt->data[1], pkt->data[2] -free(pkt); +struct Packet *pkt = malloc(sizeof(struct Packet) + 100); +pkt->len = 100; +strcpy(pkt->data, "Hello"); ``` -Flexible array members are used extensively in communication protocols, variable-length message handling, and packet parsing. In the early days of C, people used a trick called the "struct hack" to achieve similar functionality—placing an array of length 1 (or 0) at the end of a struct and then allocating extra space. But that was undefined behavior (UB); C99's FAM is the standard approach. +Flexible array members are widely used in communication protocols, variable-length message handling, and packet parsing. 
In the early days of C, people used a trick called "struct hack" to achieve similar functionality—placing an array of length 1 (or 0) at the end of the struct and then allocating extra space. But that was undefined behavior; C99's FAM is the standard approach. -One thing to note: structs containing a flexible array member cannot be passed or copied by value—because `sizeof` does not know how large the trailing data is. You can only operate on them through pointers. +One thing to note: structs containing flexible array members cannot be passed or copied by value—because the compiler doesn't know how large the tail data is. You can only operate on them through pointers. -## Arrays of Structs +## Struct Arrays -Combining structs and arrays is a very common way to organize data. A configuration table, a set of sensor readings, or a message queue are essentially all arrays of structs: +Combining structs and arrays is a very common way to organize data. For example, a configuration table, a set of sensor readings, or a message queue are essentially struct arrays: ```c -typedef struct { - uint8_t id; - uint16_t timeout_ms; - uint8_t retry_count; - uint8_t priority; -} TaskConfig; - -// 初始化一个结构体数组 -TaskConfig config_table[] = { - {.id = 1, .timeout_ms = 100, .retry_count = 3, .priority = 2}, - {.id = 2, .timeout_ms = 200, .retry_count = 5, .priority = 1}, - {.id = 3, .timeout_ms = 50, .retry_count = 1, .priority = 3}, -}; - -// 获取数组元素个数 -size_t task_count = sizeof(config_table) / sizeof(config_table[0]); +struct SensorData sensors[10]; ``` -Traversing an array of structs is the same as traversing a plain array—you can use subscripts or pointers: +Iterating over a struct array is the same as a normal array; you can use subscripts or pointers: ```c -/// @brief 按优先级查找最高优先级任务的 ID -uint8_t find_highest_priority(const TaskConfig* tasks, size_t count) { - uint8_t max_priority = 0; - uint8_t result_id = 0; - - for (size_t i = 0; i < count; i++) { - if (tasks[i].priority > 
max_priority) { - max_priority = tasks[i].priority; - result_id = tasks[i].id; - } - } - return result_id; +for (int i = 0; i < 10; i++) { + sensors[i].value = 0.0f; } ``` -The memory layout of an array of structs is tightly packed—the size of each element is `sizeof(TaskConfig)` (including padding), and the address of the i-th element is `base + i * sizeof(TaskConfig)`. This is also why padding is needed at the end of a struct—if there were no padding, the fields of the second element in the array might be misaligned. +Struct arrays are laid out tightly in memory—each element's size is `sizeof(struct)` (including padding), and the address of the i-th element is `base_address + i * sizeof(struct)`. This is why padding is needed at the end of a struct—without it, fields in the second element of the array might be misaligned. ## `__attribute__((packed))`: Removing Padding -In some scenarios, we genuinely need a struct without any padding—the most typical case being binary communication protocols. The data received by an MCU (Microcontroller Unit) over UART/SPI/I2C is a tightly packed byte stream; if the struct has padding, directly casting a pointer to interpret it will yield incorrect values. GCC and Clang provide `__attribute__((packed))` to remove padding: +There are scenarios where we truly need a struct without any padding—the most typical is binary communication protocols. Data received by an MCU via UART/SPI/I2C is a tightly packed byte stream. If the struct has padding, directly casting a pointer to interpret it will read incorrect values. GCC and Clang provide `__attribute__((packed))` to remove padding: ```c -typedef struct __attribute__((packed)) { - uint8_t header; - uint16_t length; - uint8_t command; - uint32_t parameter; -} PackedFrame; +struct __attribute__((packed)) ProtocolFrame { + char start; + int type; + short checksum; +}; ``` -With this attribute added, `sizeof(PackedFrame)` is a pure 1 + 2 + 1 + 4 = 8 bytes, with absolutely no padding. 
But be aware of the cost—accessing unaligned fields on some architectures will cause performance degradation or even a hardware exception. Therefore, `packed` should only be used when you genuinely need a compact layout, not sprinkled everywhere. The ARM Cortex-M series can handle unaligned access in most cases (with a performance penalty), but some older architectures (like the ARM7TDMI) will fault directly. +With this attribute, `sizeof(struct ProtocolFrame)` is a pure 1 + 4 + 2 = 7 bytes, with absolutely no padding. But be aware of the cost—accessing unaligned fields on some architectures can lead to performance degradation or even hardware exceptions. So `packed` should only be used when you genuinely need a compact layout, not scattered everywhere. ARM Cortex-M series can handle unaligned access in most cases (with a performance penalty), but some older architectures (like ARM7TDMI) will fault directly. -A safer approach is: **use a packed struct at the communication layer to parse the raw bytes, then immediately convert it to an aligned internal struct for use**. This separates parsing from business logic, allowing each to get what it needs. +A safer approach is: **use a packed struct at the communication layer to parse raw bytes, then immediately convert it to an aligned internal struct for use**. Separate parsing and business logic to get the best of both worlds. -## C++ Connections +## C++ Transition ### Evolution from struct to class -In C, a `struct` can only contain data members—no member functions, no access control, no inheritance. C++ retains the `struct` keyword but gives it almost the same capabilities as a `class`. The only difference lies in the default access specifier: members of a `struct` are `public` by default, while members of a `class` are `private` by default. Beyond that, a C++ `struct` can have constructors, destructors, member functions, inheritance, virtual functions—it can do anything. 
+In C, `struct` can only contain data members—no member functions, no access control, no inheritance. C++ retains the `struct` keyword but gives it almost the same capabilities as `class`. The only difference lies in default access rights: members of a `struct` default to `public`, while members of a `class` default to `private`. Beyond that, a C++ `struct` can have constructors, destructors, member functions, inheritance, virtual functions—it can do anything. ```cpp -// C++ 中的 struct——可以有成员函数 -struct SensorReading { - uint32_t timestamp; - float temperature; - float humidity; - - // 成员函数 - bool is_overheating() const { - return temperature > 85.0f; - } - - void print() const { - printf("T=%.1fC H=%.1f%%\n", temperature, humidity); - } +struct Point { + double x, y; + + void print() const; // Member function + Point(double x, double y); // Constructor }; ``` -So when you see `struct` in C++ code, do not assume it is the same as a C struct—it is simply a class with public default access. +So when you see `struct` in C++ code, don't assume it's the same as a C struct—it is simply a class with default public access. ### POD Types and Trivially Copyable -C++ has a specific concept for "simple structs compatible with C": POD (Plain Old Data) types. Simply put, if a struct has no virtual functions, no non-trivial constructors or destructors, and all members are POD types, then it is itself a POD. POD types can be safely copied with `memcpy`, zeroed out with `memset`, and safely binary-serialized and deserialized—because their memory layout is completely consistent with C. +C++ has a specific concept for "simple structs compatible with C": POD types (Plain Old Data). Simply put, if a struct has no virtual functions, no non-trivial constructor/destructor, and all members are POD types, then it is itself a POD. 
POD types can be safely copied with `memcpy`, zeroed with `memset`, and safely binary serialized and deserialized—because their memory layout is fully consistent with C. -After C++11, the POD concept was refined into several more precise type traits: `is_trivially_copyable`, `is_standard_layout`, and so on. Understanding these concepts is very important in cross-language interaction (C/C++ mixed programming), binary serialization, and shared memory communication. +After C++11, the concept of POD was refined into several more precise type traits: `std::is_trivial`, `std::is_standard_layout`, etc. Understanding these concepts is crucial in cross-language interaction (C/C++ mixed programming), binary serialization, and shared memory communication. -### std::aligned_storage +### `std::aligned_storage` -The C++ standard library provides `std::aligned_storage` (from C++11 onward, replaced by `alignas` in C++23), which is a type trait tool used to manually control the alignment of a block of raw memory. It is used in advanced scenarios such as implementing type-erased containers, memory pools, and placement new: +The C++ standard library provides `std::aligned_storage` (since C++11; deprecated in C++23, where the recommended replacement is an `alignas`-qualified `std::byte` array), a type trait tool for manually controlling the alignment of a block of raw memory. It is used in advanced scenarios like implementing type-erased containers, memory pools, and placement new: ```cpp -#include - -// 分配一块 64 字节对齐的原始内存 -alignas(64) std::byte storage[sizeof(MyStruct)]; - -// 或者使用 std::aligned_storage(C++23 前的做法) -using AlignedStorage = std::aligned_storage_t; +std::aligned_storage::type task_buffer; ``` -These concepts will be discussed in detail in later C++ chapters. For now, you just need to know that the approach to alignment control in C is implemented more systematically and safely in C++. +These concepts will be discussed in detail in later C++ chapters. 
For now, just know: the C language approach to alignment control is implemented more systematically and safely in C++. ## Summary -In this tutorial, we thoroughly broke down structs from "how to use them" to "what they look like in memory." Structs are the most core composite type in C, and understanding their memory layout—especially alignment and padding—is the foundation for writing efficient, correct, and portable code. +In this tutorial, we thoroughly dissected structs from "how to use them" to "what they look like in memory." Structs are the core composite type in C, and understanding their memory layout—especially alignment and padding—is the foundation for writing efficient, correct, and portable code. ### Key Takeaways -- [ ] Structs are defined with `typedef struct { ... } Name;`, and we use `->` to access members through a pointer -- [ ] C99 designated initializers (`.field = value`) are safer and more readable than sequential initialization -- [ ] The compiler inserts padding bytes between members and at the end of the struct to ensure each member is aligned -- [ ] Ordering fields from largest to smallest alignment requirement reduces padding and saves memory -- [ ] The `offsetof` macro can precisely verify the offset of a field -- [ ] C11's `alignas`/`alignof` provide standardized alignment control capabilities -- [ ] Flexible array members are used for variable-length trailing data and must be used via pointers and dynamic allocation -- [ ] `__attribute__((packed))` removes padding for binary protocol parsing, but incurs performance and portability costs -- [ ] C++'s `struct` is a `class` with default public access; POD types maintain a C-compatible memory layout +- [ ] Structs are defined with `struct`, and pointers use `->` to access members. +- [ ] C99 designated initializers `.field = val` are safer and more readable than sequential initialization. 
+- [ ] The compiler inserts padding bytes between members and at the end of the struct to ensure alignment. +- [ ] Ordering fields from largest to smallest alignment requirement can reduce padding and save memory. +- [ ] The `offsetof` macro can precisely verify the offset of fields. +- [ ] C11's `alignas`/`alignof` provide standardized alignment control capabilities. +- [ ] Flexible array members are for variable-length tail data and must be used via pointers and dynamic allocation. +- [ ] `__attribute__((packed))` removes padding for binary protocol parsing but has performance and portability costs. +- [ ] C++'s `struct` is a `class` with default public access; POD types maintain a C-compatible memory layout. ## Exercises -### Exercise: Design a Communication Protocol Frame with Manual Alignment Control +### Exercise: Design a Manually Aligned Communication Protocol Frame -Please design a binary protocol frame structure for embedded device communication. The requirements are as follows: +Please design a binary protocol frame structure for embedded device communication. Requirements are as follows: -1. The frame header contains a 1-byte start flag `0xAA`, a 1-byte frame type, a 2-byte payload length, and a 4-byte timestamp -2. The payload section is variable-length data (use a flexible array member) -3. The frame footer contains a 2-byte CRC16 checksum -4. Use `_Alignas` to ensure the timestamp field is 4-byte aligned -5. Use `__attribute__((packed))` to ensure the frame structure is compact (suitable for directly casting and parsing a byte stream) -6. Write a function that uses `offsetof` to print the offset of each field to verify the layout +1. The frame header contains a 1-byte start flag `0xAA`, 1-byte frame type, 2-byte payload length, and 4-byte timestamp. +2. The payload is variable-length data (use a flexible array member). +3. The frame tail contains a 2-byte CRC16 checksum. +4. Use `alignas` to ensure the timestamp field is 4-byte aligned. +5. 
Use `__attribute__((packed))` to ensure the frame structure is compact (suitable for direct cast parsing of byte streams). +6. Write a function using `offsetof` to print the offset of each field to verify the layout. ```c -#include -#include -#include - -// TODO: 定义 Frame 结构体 -// typedef struct __attribute__((packed)) { -// ... -// } Frame; - -// TODO: 实现 print_frame_layout() 函数 -// 使用 offsetof 打印每个字段的偏移量 - -// TODO: 实现 create_frame() 函数 -// 分配内存并填充帧数据(含柔性数组成员) - -int main(void) { - print_frame_layout(); - - // TODO: 创建一个测试帧并验证偏移 - return 0; -} +// TODO: Implement your protocol frame here ``` -Hint: Using `alignas` inside a packed struct requires care—packed removes automatic padding, but `alignas` can force the alignment of a specific field. Think about this: in a packed struct, if the offset from the frame header to the timestamp is not a multiple of 4, how would you handle it? +Hint: Be careful when using `alignas` inside a packed struct—packed removes automatic padding, but `alignas` can force a specific field's alignment. Think about this: in a packed struct, if the offset from the frame header to the timestamp is not a multiple of 4, how would you handle it? ## References diff --git a/documents/en/vol1-fundamentals/c_tutorials/13-union-enum-bitfield-typedef.md b/documents/en/vol1-fundamentals/c_tutorials/13-union-enum-bitfield-typedef.md index 05edd3fad..e9f8be227 100644 --- a/documents/en/vol1-fundamentals/c_tutorials/13-union-enum-bitfield-typedef.md +++ b/documents/en/vol1-fundamentals/c_tutorials/13-union-enum-bitfield-typedef.md @@ -4,9 +4,9 @@ cpp_standard: - 11 - 14 - 17 -description: Master the use of unions, enums, bit fields, and typedef, understand - techniques like type punning and hardware register mapping, and compare C++'s type-safe - alternatives. 
+description: Master the use of unions, enums, bit fields, and typedefs; understand + techniques such as type punning and hardware register mapping; and compare them + with type-safe alternatives in C++. difficulty: beginner order: 17 platform: host @@ -19,57 +19,68 @@ tags: - beginner - 入门 - 类型安全 -title: Unions, Enums, Bit Fields, and typedef +title: Unions, Enums, Bit Fields, and Typedefs translation: source: documents/vol1-fundamentals/c_tutorials/13-union-enum-bitfield-typedef.md - source_hash: a2e0b303d0c420a8ba0fe3fe86c928ed23c42cfa0d5b15619a59e239023c3a63 - translated_at: '2026-05-26T10:32:48.449071+00:00' + source_hash: a52d435d36f071778bcf0dbb760180bafdf1ac9c53bc81cb9a10537e7c04f59f + translated_at: '2026-06-13T11:42:17.535200+00:00' engine: anthropic - token_count: 2223 + token_count: 2215 --- -# Unions, Enums, Bit-Fields, and typedef +# Unions, Enums, Bit Fields, and typedef -In the previous chapter, we thoroughly dissected the memory layout of structs and figured out that compilers insert padding bytes between your fields. In this chapter, we look at four language features—unions, enums, bit-fields, and typedef—that might seem like supporting actors to structs, but each has an irreplaceable role to play. Unions let you perform tricks on the same block of memory, enums let you replace magic numbers with meaningful names, bit-fields let you control memory layout down to the bit, and typedef lets you create aliases for types and clean up complex declarations. +In the previous post, we completely dissected the memory layout of structs and figured out that compilers insert padding bytes between your fields. In this post, we will look at four language features—unions, enums, bit-fields, and typedef—that seem like "supporting characters" to structs, but each has its own irreplaceable role. 
Unions let you play tricks on the same memory block, enums let you replace magic numbers with meaningful names, bit-fields let you control memory layout bit by bit, and typedef lets you create aliases for types and clean up complex declarations. -These four features are almost inseparable in embedded development. If you look at the header files of any MCU (such as STM32's `stm32f1xx.h`), you will find that register definitions are a combination of unions, structs, bit-fields, and typedef. Only by understanding them can you read those dense hardware abstraction layer (HAL) code. +These four features are almost inseparable in embedded development. If you look at the header files of any MCU (like STM32's CMSIS headers), you will find that register definitions are a combination of unions + structs + bit-fields + typedef. Only by understanding them can you read that dense Hardware Abstraction Layer (HAL) code. > **Learning Objectives** > -> - After completing this chapter, you will be able to: -> - [ ] Understand the memory sharing mechanism of unions and type punning techniques -> - [ ] Master the definition, usage, and limitations of enums -> - [ ] Use bit-fields to define compact hardware register structures -> - [ ] Proficiently use typedef to simplify complex type declarations -> - [ ] Combine these features to implement a tagged union and parse protocol frames -> - [ ] Understand the corresponding type-safe alternatives in C++ +> After completing this chapter, you will be able to: +> +> - [ ] Understand the memory sharing mechanism of unions and type punning techniques. +> - [ ] Master the definition, usage, and limitations of enums. +> - [ ] Use bit-fields to define compact hardware register structures. +> - [ ] Use typedef fluently to simplify complex type declarations. +> - [ ] Combine these features to implement tagged unions and protocol frame parsing. +> - [ ] Understand the corresponding type-safe alternatives in C++. 
## Environment Setup -All code in this chapter has been verified under the following environment: +All code in this post has been verified in the following environment: - **Operating System**: Linux (Ubuntu 22.04+) / WSL2 / macOS -- **Compiler**: GCC 11+ (confirm the version via `gcc --version`) -- **Compiler flags**: `gcc -Wall -Wextra -std=c11` (enable warnings, specify C11 standard) -- **Verification**: All code can be directly compiled and run +- **Compiler**: GCC 11+ (confirm version via `gcc --version`) +- **Compiler Flags**: `-Wall -Wextra -std=c11` (warnings enabled, C11 standard specified) +- **Verification**: All code can be compiled and run directly -## Step 1 — Performing Memory Tricks with Unions +## Step 1 — Using Unions to Perform Magic on the Same Memory -### Understanding the Memory Model of Unions +### Understanding the Union Memory Model -The definition syntax of a union is almost identical to that of a struct; the only difference is that the keyword changes from `struct` to `union`. However, their memory behaviors are worlds apart: each member of a struct occupies its own independent memory space, whereas all members of a union **share the exact same starting memory address**. The size of a union is equal to the size of its largest member (possibly plus some alignment padding). +The definition syntax of a union is almost identical to a struct, except the keyword changes from `struct` to `union`. However, their memory behaviors are vastly different: members of a struct each occupy independent memory spaces, while all members of a union **share the same starting memory address**. The size of a union is equal to the size of its largest member (plus possible alignment padding). 
```c #include <stdio.h> -#include -typedef union { - uint8_t u8; - uint16_t u16; - uint32_t u32; -} IntUnion; +union Data { + int i; + float f; + char str[4]; +}; + +int main() { + union Data data; + + printf("sizeof(union Data) = %zu\n", sizeof(data)); + printf("Address of i: %p\n", (void*)&data.i); + printf("Address of f: %p\n", (void*)&data.f); + printf("Address of str: %p\n", (void*)&data.str); + + data.i = 0x12345678; + printf("After setting i to 0x12345678:\n"); + printf("f = %f\n", data.f); // Reinterprets the bits of i as a float (type punning, see below) + printf("str[0] = 0x%x\n", (unsigned char)data.str[0]); -int main(void) { - printf("sizeof(IntUnion) = %zu\n", sizeof(IntUnion)); // 4 return 0; } ``` @@ -77,31 +88,38 @@ int main(void) { Output: ```text -sizeof(IntUnion) = 4 +sizeof(union Data) = 4 +Address of i: 0x7ffd12345678 +Address of f: 0x7ffd12345678 +Address of str: 0x7ffd12345678 +After setting i to 0x12345678: +f = 0.000000 +str[0] = 0x78 ``` -The size of `IntUnion` is 4 bytes—determined by the largest member, `uint32_t`. The starting addresses of the three members `u8`, `u16`, and `u32` are exactly the same; writing to one will overwrite the others. +The size of `union Data` is 4 bytes—determined by the largest member `int` (assuming 32-bit int). The starting addresses of `i`, `f`, and `str` are exactly the same; writing to one overwrites the others. The values printed for `f` and `str[0]` depend on endianness and the floating-point representation: on a little-endian machine with IEEE 754 floats, the bit pattern `0x12345678` is a tiny positive number (about 5.69e-28), which `%f` prints as `0.000000`. -> ⚠️ **Pitfall Warning**: Only **one** member of a union is valid at any given time. Writing to one member and then reading another is undefined behavior (UB) in the C standard (except for the type punning exception). You must keep track of which member is currently active yourself; the compiler will not check this for you. 
+> ⚠️ **Warning**: Only **one** member of a union is valid at any given time. Reading from a member other than the one most recently written to is Undefined Behavior (UB) in the C standard (except for specific type punning cases). 
You must remember which member is active yourself; the compiler won't check it for you. -### Using Type Punning to View the Binary Representation of a Float +### Using Type Punning to View the Binary Representation of Floats -Although the C standard states that "reading a member other than the one last written is undefined behavior," there is an important exception: type punning via unions is **legal** in C99 and later. Type punning simply means interpreting the same block of memory as a different type: +Although the C standard says "reading a member other than the last one written is undefined behavior," there is an important exception: type punning through unions is **legal** in C99 and later. Type punning means interpreting the same memory block as different types: ```c #include -#include -typedef union { - float f; - uint32_t u; -} FloatBits; +union FloatBits { + float f; + unsigned int u; // Assuming float and int are both 32-bit +}; + +int main() { + union FloatBits fb; + fb.f = 3.14159f; + + printf("Float value: %f\n", fb.f); + printf("Hex representation: 0x%08x\n", fb.u); -int main(void) { - FloatBits fb; - fb.f = 3.14f; - printf("float 值: %f\n", fb.f); // 3.140000 - printf("二进制表示: 0x%08X\n", fb.u); // 0x4048F5C3 return 0; } ``` @@ -109,115 +127,128 @@ int main(void) { Output: ```text -float 值: 3.140000 -二进制表示: 0x4048F5C3 +Float value: 3.141590 +Hex representation: 0x40490fd0 ``` -This is perfectly legal in C. However, note that this is **undefined behavior in C++**—the C++ standard does not allow type punning through unions. If you need to do something similar in C++ code, you should use `memcpy` (which the compiler will optimize away) or `std::bit_cast` (C++20). +This is completely legal in C. However, be aware that this is **Undefined Behavior in C++**—the C++ standard does not permit type punning through unions. If you need to do similar things in C++, use `memcpy` (which the compiler optimizes away) or `std::bit_cast` (C++20). 
### Combining Unions and Structs to Implement Variant Types -A union truly shines when combined with structs and enums. A union on its own isn't very useful—because you don't know which member is currently stored. But if you add a "tag" to record the current type, it becomes a meaningful variant type: +A union truly shines when combined with structs and enums. A standalone union is of limited use—because you don't know which member is currently stored. But if you add a "tag" to record the current type, it becomes a meaningful variant type: ```c #include -#include +#include -typedef enum { - kValueTypeInt, - kValueTypeFloat, - kValueTypeString -} ValueType; +enum ValueType { TYPE_INT, TYPE_FLOAT, TYPE_STRING }; -typedef struct { - ValueType tag; +struct Variant { + enum ValueType type; union { - int32_t int_val; - float float_val; - const char* str_val; - } data; -} TaggedValue; - -void print_value(const TaggedValue* v) { - switch (v->tag) { - case kValueTypeInt: - printf("int: %d\n", v->data.int_val); + int i; + float f; + char str[16]; + } value; +}; + +void print_variant(struct Variant *v) { + switch (v->type) { + case TYPE_INT: + printf("Integer: %d\n", v->value.i); break; - case kValueTypeFloat: - printf("float: %f\n", v->data.float_val); + case TYPE_FLOAT: + printf("Float: %f\n", v->value.f); break; - case kValueTypeString: - printf("string: %s\n", v->data.str_val); + case TYPE_STRING: + printf("String: %s\n", v->value.str); break; } } + +int main() { + struct Variant v1; + v1.type = TYPE_INT; + v1.value.i = 42; + + struct Variant v2; + v2.type = TYPE_STRING; + strncpy(v2.value.str, "Hello", sizeof(v2.value.str)); + + print_variant(&v1); + print_variant(&v2); + + return 0; +} ``` -This "tag + union" combination pattern is called a **tagged union**, and it is the fundamental technique for implementing polymorphism in C. +This combination of "tag + union" is called a **tagged union**, a basic technique for implementing polymorphism in C. 
-## Step 2 — Naming Integers with Enums +## Step 2 — Using Enums to Name Integers -### Understanding the Essence of Enums +### Understanding the Nature of Enums -Enums let you define a set of named integer constants. The syntax is straightforward: +Enums allow you to define a set of named integer constants. The syntax is simple: ```c -typedef enum { - kColorRed, - kColorGreen, - kColorBlue -} Color; - -Color c = kColorGreen; -printf("%d\n", c); // 1 +enum Color { + RED, + GREEN, + BLUE +}; + +int main() { + enum Color c = RED; + printf("RED = %d, GREEN = %d\n", RED, GREEN); // Output: 0, 1 + return 0; +} ``` -Enum values increment from 0 by default. You can explicitly specify values: +Enum values increment starting from 0 by default. You can explicitly specify values: ```c -typedef enum { - kStatusOk = 0, - kStatusError = 1, - kStatusTimeout = 2, - kStatusBusy = 3, - kStatusInvalidArg = 4 -} StatusCode; +enum Status { + OK = 0, + ERROR = -1, + PENDING = 1 +}; ``` -### Beware of the Limitations of Enums +### Beware of Enum Limitations -C enums have a love-hate characteristic: **enum values are essentially ints**. This means you can assign any integer to an enum variable, and the compiler won't throw an error: +C language enums have a characteristic that is both loved and hated: **enum values are essentially `int`**. This means you can assign any integer to an enum variable, and the compiler won't complain: ```c -Color c = 42; // 合法!但 42 不是任何枚举值 -int x = kColorRed; // 合法!隐式转为 int +enum Color c = 123; // Legal in C, but 123 is not a valid Color! ``` -This leniency is considered "flexibility" in C, but from a type safety perspective, it's a disaster—the compiler has no way to help you check whether "this value is a valid enum value." This is the fundamental reason C++ introduced `enum class`. +This laxity is seen as "flexibility" in C, but from a type safety perspective, it's a disaster—the compiler has no way to check "is this value a valid enum value?". 
This is the fundamental reason why C++ introduced `enum class`. -## Step 3 — Allocating Memory Bit by Bit with Bit-Fields +## Step 3 — Using Bit-Fields to Allocate Memory by Bits ### Basic Syntax of Bit-Fields Bit-fields allow you to allocate storage space in a struct in units of **bits**. The syntax is to add a colon and the number of bits after the field name: ```c -typedef struct { - uint32_t enable : 1; // 1 位 - uint32_t mode : 3; // 3 位(可表示 0-7) - uint32_t priority : 4; // 4 位(可表示 0-15) - uint32_t reserved : 24; // 24 位保留 -} ControlReg; // 总计 32 位 = 4 字节 +struct Flags { + unsigned int flag1 : 1; + unsigned int flag2 : 1; + unsigned int mode : 2; + unsigned int reserved : 4; +}; + +int main() { + struct Flags f; + f.flag1 = 1; + f.mode = 2; // Binary 10 + + printf("sizeof(struct Flags) = %zu\n", sizeof(f)); // Likely 1 or 4 bytes depending on alignment + return 0; +} ``` -Accessing bit-field members is exactly the same as accessing normal struct members: - -```c -ControlReg reg = {0}; -reg.enable = 1; -reg.mode = 5; -reg.priority = 3; -``` +Accessing bit-field members is exactly the same as accessing normal struct members. ### Mapping Hardware Registers with Bit-Fields @@ -225,53 +256,53 @@ The most common application of bit-fields in embedded development is mapping har ```c typedef struct { - volatile uint32_t enable : 1; // bit 0: 使能 - volatile uint32_t tickint : 1; // bit 1: 中断使能 - volatile uint32_t clksource : 1; // bit 2: 时钟源选择 - volatile uint32_t reserved : 13; // bit 15:3 保留 - volatile uint32_t countflag : 1; // bit 16: 计数标志 - volatile uint32_t reserved2 : 15; // bit 31:17 保留 -} SysTickCtrl; - -volatile SysTickCtrl* systick_ctrl = (volatile SysTickCtrl*)0xE000E010; -systick_ctrl->enable = 1; -systick_ctrl->tickint = 1; -systick_ctrl->clksource = 1; + volatile unsigned int CR1 : 3; // Control bits 0-2 + volatile unsigned int CR2 : 1; // Control bit 3 + volatile unsigned int RESERVED : 4; // Bits 4-7 + // ... 
assume 8-bit register for simplicity +} Register_t; + +// Usage +Register_t *reg = (Register_t *)0x40000000; // Hypothetical address +reg->CR1 = 0x5; // Set control bits ``` -### Beware of Bit-Field Portability Pitfalls +### Portability Traps of Bit-Fields -Bit-fields are satisfying to use, but they come with a cost you must face: **poor portability**. The C standard leaves several key details unspecified—the allocation order of bit-fields (from least significant bit to most significant bit or vice versa), alignment, and padding rules. All of these are left to the compiler implementation. +Bit-fields are convenient to use, but they come at a cost you must face: **poor portability**. The C standard leaves several critical details of bit-fields unspecified—allocation order (low-to-high or high-to-low), alignment, and padding rules are all left to the compiler implementation. -> ⚠️ **Pitfall Warning**: When using bit-fields to map hardware registers, always use the standard headers provided by the compiler (such as STM32's CMSIS headers) as a reference. The register structs in those headers are verified by the vendor, and the bit-field allocation direction is consistent with the platform. Manually writing bit-fields to map hardware registers will likely cause issues across different compilers. +> ⚠️ **Warning**: When using bit-fields to map hardware registers, always refer to the standard headers provided by the compiler (like STM32's CMSIS headers). The register structures in those headers are verified by the vendor, and the bit-field allocation direction matches the platform. Manually writing bit-field mappings for hardware registers is likely to cause issues across different compilers. -### Bit-Fields vs. Manual Bitwise Masks +### Bit-Fields vs. 
Manual Bitmasking -Because of the portability issues with bit-fields, many embedded projects avoid them entirely, opting instead for hand-written bitwise masks: +Because of the portability issues with bit-fields, many embedded projects avoid them entirely in favor of manual bitwise operation masks: ```c -#define CTRL_ENABLE_MASK (1U << 0) -#define CTRL_MODE_MASK (0x7U << 1) +// Manual bitmasking +#define REG_CR1_MASK 0x07 +#define REG_CR2_MASK 0x08 -volatile uint32_t* ctrl_reg = (volatile uint32_t*)0xE000E010; -*ctrl_reg |= CTRL_ENABLE_MASK; -*ctrl_reg = (*ctrl_reg & ~CTRL_MODE_MASK) | (5U << 1); +unsigned int reg = 0x00; +reg = (reg & ~REG_CR1_MASK) | (new_value & REG_CR1_MASK); ``` -The advantage of bitwise masks is complete portability and independence from compiler behavior, while the disadvantage is poor code readability. In practice, the two are often mixed. +Bitmasking offers full portability and doesn't depend on compiler behavior, but the downside is poor code readability. In practice, both are often mixed. 
-## Step 4 — Creating Type Aliases with typedef +## Step 4 — Using typedef to Alias Types ### Basic Usage -The core function of typedef is simple—creating a new name for an existing type: +The core function of typedef is simple—create a new name for an existing type: ```c -typedef uint32_t Timestamp; -typedef struct { float x; float y; } Point2D; +typedef unsigned int uint32_t; +typedef struct { int x, y; } Point; -Timestamp now = 1700000000; -Point2D origin = {0.0f, 0.0f}; +int main() { + uint32_t val = 10; + Point p = {1, 2}; + return 0; +} ``` ### Simplifying Function Pointer Declarations @@ -279,153 +310,125 @@ Point2D origin = {0.0f, 0.0f}; One of the most practical scenarios for typedef is simplifying function pointer declarations: ```c -// 不用 typedef:声明一个包含 8 个函数指针的数组 -void (*handlers[8])(int); +typedef int (*CompareFunc)(const void *, const void *); -// 用 typedef:清晰得多 -typedef void (*EventHandler)(int); -EventHandler handlers[8]; +// Usage +int sort_array(int *arr, int size, CompareFunc cmp) { + // ... implementation + return 0; +} ``` -### The Difference Between typedef and `#define` +### Difference Between typedef and `#define` -typedef creates a **true type alias** handled by the compiler, whereas `#define` is merely a preprocessor text replacement: +`typedef` creates a **true type alias** processed by the compiler, whereas `#define` is just preprocessor text replacement: ```c -typedef char* CharPtr; -#define CHAR_PTR char* +#define pINT int * +typedef int * pINT2; -CharPtr a, b; // a 和 b 都是 char* -CHAR_PTR c, d; // 展开后是 char* c, d; — 只有 c 是 char*,d 是 char! +pINT a, b; // Expands to: int * a, b; (a is int*, b is int!) +pINT2 c, d; // Both c and d are int* ``` -> ⚠️ **Pitfall Warning**: A typedef name cannot be used for forward declarations. The solution is to first write `typedef struct TagName TagName;` for the forward declaration, and then use `struct TagName { ... };` in the full definition later. 
This pattern is very common when implementing self-referencing data structures like linked lists or trees. Additionally, do not overuse typedef—a good typedef should add information (for example, `Timestamp` is more meaningful than `uint32_t`), rather than simply hiding information. +> ⚠️ **Warning**: A `typedef` name cannot be used before it has been declared. The solution is to write `typedef struct Tag Tag;` as a forward declaration first, then use `struct Tag { ... };` in the subsequent full definition. This pattern is very common when implementing self-referencing data structures like linked lists or trees. Also, don't overuse typedef—a good typedef should add information (e.g., `Timestamp` is more meaningful than `uint32_t`), not just hide information. -## C++ Connections +## C++ Transition ### enum class: Type-Safe Enums (C++11) ```cpp -enum class Color { kRed, kGreen, kBlue }; -Color c = Color::kRed; // 必须加作用域限定 -int x = c; // 编译错误!不能隐式转 int -int y = static_cast(c); // OK,必须显式转换 +enum class Color { Red, Green, Blue }; + +int main() { + // Color c = Red; // Error! + Color c = Color::Red; // OK + // int x = c; // Error!
No implicit conversion + int x = static_cast(c); // OK +} ``` `enum class` can also specify the underlying type: ```cpp -enum class StatusCode : uint8_t { kOk = 0, kError = 1 }; -static_assert(sizeof(StatusCode) == 1); +enum class Status : unsigned char { OK = 0, ERROR = 255 }; ``` -### std::variant: Type-Safe Unions (C++17) +### std::variant: Type-Safe Union (C++17) ```cpp #include -using Value = std::variant; +#include -Value v1 = 42; -int x = std::get(v1); // OK -// float f = std::get(v1); // 抛出 std::bad_variant_access +int main() { + std::variant v; + + v = 42; + std::cout << std::get(v) << "\n"; + + v = "Hello"; + if (std::holds_alternative(v)) { + std::cout << std::get(v) << "\n"; + } +} ``` ### Restricting Union Usage in C++ -If a union's members have non-trivial constructors, destructors, or copy operations (such as `std::string`), you must manually manage the lifecycles of these members. Therefore, in C++, prefer using `std::variant`. +If a union member has non-trivial constructors, destructors, or copy operations (like `std::string`), you must manually manage the lifecycle of these members. Therefore, in C++, prefer `std::variant`. ### std::bitset: Replacing Manual Bit-Fields ```cpp #include -std::bitset<32> ctrl_reg(0); -ctrl_reg[0] = 1; // enable -bool enabled = ctrl_reg[0]; +#include + +int main() { + std::bitset<8> flags(0b10101010); + flags.set(2); + std::cout << flags << "\n"; // Prints binary representation +} ``` -### using as a Replacement for typedef (C++11) +### using Replaces typedef (C++11) ```cpp -using EventHandler = void (*)(int); // 比 typedef 更直观 +typedef int (*OldFunc)(int); +using NewFunc = int (*)(int); // More intuitive syntax + +template +using Vec = std::vector; // Template alias (typedef can't do this) ``` ## Summary -In this chapter, we covered four C language features in one breath—unions, enums, bit-fields, and typedef—along with their modern alternatives in C++. 
These four features share a common theme: they are all classic cases where C chooses "flexibility" over "safety." C++'s improvement approach is very clear: `enum class` constrains enums, `std::variant` automatically manages the active member of a union, `std::bitset` provides portable bit-set operations, and `using` provides a more intuitive alias syntax. +In this post, we covered four C language features—unions, enums, bit-fields, and typedef—and their modern alternatives in C++. These four features share a common theme: they are typical cases where C language chooses "flexibility" over "safety". The C++ improvement approach is clear: `enum class` constrains enums, `std::variant` automatically manages the active member of unions, `std::bitset` provides portable bit set operations, and `using` provides a more intuitive alias syntax. ## Exercises ### Exercise 1: IEEE 754 Float Decomposition -Use a union to implement a tool that decomposes a `float` value into its IEEE 754 sign bit, exponent, and mantissa, and prints them out. +Use a union to implement a tool that decomposes a `float` value into IEEE 754 format sign bit, exponent, and mantissa, and prints them. ```c #include #include -// TODO: 定义一个联合体,包含 float 和 uint32_t -// TODO: 实现分解函数 -// void print_float_bits(float f) { -// // 提取符号位(1位)、指数(8位)、尾数(23位) -// // 提示:用位运算 & 和 >> -// } - -int main(void) { - // TODO: 测试几个值:0.0f, -3.14f, 1.0f, 42.0f, 0.1f - return 0; -} +// TODO: Define union and implement logic ``` -### Exercise 2: 32-Bit Hardware Control Register +### Exercise 2: 32-bit Hardware Control Register -Use bit-fields to define a 32-bit hardware control register struct, and then write functions to manipulate it. +Use bit-fields to define a 32-bit hardware control register struct, then write functions to manipulate it. 
```c -#include -#include - -// TODO: 定义 ControlRegister 位域结构体 -// 位分配: -// bit 0: enable (1位) -// bit 1: interrupt_enable (1位) -// bit 2: dma_enable (1位) -// bit 5:3 mode (3位) -// bit 9:6 speed (4位) -// bit 31:10 reserved (22位) - -typedef union { - // TODO: 位域结构体视图 - // TODO: uint32_t 整体视图 -} ControlRegister; - -// TODO: 实现 void print_register(ControlRegister reg) -// TODO: 实现 void set_mode(ControlRegister* reg, uint32_t mode) - -int main(void) { - ControlRegister reg = {0}; - // TODO: 测试各个操作 - return 0; -} +// TODO: Define struct and functions ``` ### Exercise 3: Simple Tagged Union -Use an enum and a union to implement a tagged union that can store a `int`, a `float`, or a string pointer. +Use an enum and a union to implement a tagged union that can store an `int`, a `float`, or a string pointer. ```c -#include -#include - -// TODO: 定义枚举类型标签 -// TODO: 定义 tagged union 结构体 -// TODO: 实现构造函数 make_int/make_float/make_string -// TODO: 实现 print_tagged_value 函数 -// TODO: 实现 get_as_int/get_as_float/get_as_string 安全访问函数 -// (检查 tag 是否匹配,不匹配则打印错误信息) - -int main(void) { - // TODO: 创建三种类型的值,打印它们 - // TODO: 尝试用错误的 tag 访问,验证安全检查 - return 0; -} +// TODO: Implement tagged union and print function ``` diff --git a/documents/en/vol1-fundamentals/c_tutorials/14-dynamic-memory.md b/documents/en/vol1-fundamentals/c_tutorials/14-dynamic-memory.md index 28d6ed14c..8fac02227 100644 --- a/documents/en/vol1-fundamentals/c_tutorials/14-dynamic-memory.md +++ b/documents/en/vol1-fundamentals/c_tutorials/14-dynamic-memory.md @@ -4,10 +4,10 @@ cpp_standard: - 11 - 14 - 17 -description: Gain a deep understanding of C's dynamic memory allocation mechanism, - master the proper use of malloc/calloc/realloc/free, learn about common memory errors - and debugging methods, and compare the design philosophy of C++ RAII (Resource Acquisition - Is Initialization) and smart pointers. 
+description: Gain an in-depth understanding of the C language dynamic memory allocation + mechanism, master the proper use of `malloc`, `calloc`, `realloc`, and `free`, recognize + common memory errors and debugging methods, and compare the design philosophies + of C++ RAII and smart pointers. difficulty: intermediate order: 18 platform: host @@ -23,271 +23,224 @@ tags: title: Dynamic Memory Management translation: source: documents/vol1-fundamentals/c_tutorials/14-dynamic-memory.md - source_hash: dcc5f3ef7edc08f41942ce7aa7e288f4f78dcb1c44ede990bcf240a17f2a6eed - translated_at: '2026-05-26T10:33:34.350052+00:00' + source_hash: 3836764443c6a59bf37fa71374e3af7a47c1784857804cc5ad250ad3f0d161f8 + translated_at: '2026-06-13T11:42:32.736720+00:00' engine: anthropic - token_count: 1483 + token_count: 1480 --- # Dynamic Memory Management -In all the programs we have written so far, the sizes of variables were determined at compile time. But the real world does not work that way—we do not know in advance how many characters a user will type, how many records will be collected before running, or whether the data packets sent by a client will differ in size each time. The common thread in these scenarios is: **you cannot determine how much memory is needed before the program runs**. +All the programs we have written so far have had variable sizes determined at compile time. But the real world doesn't work that way—we don't know how many characters a user will input beforehand, we don't know how many records will be collected before running, and data packets sent by clients might be different every time. The common denominator in these scenarios is: **before the program runs, you cannot determine how much memory is needed.** -C solves this problem through dynamic memory management—requesting a block of memory of a specified size from the system at runtime, and returning it when done. 
This set of APIs appears to consist of only four functions: `malloc`, `calloc`, `realloc`, and `free`, which takes barely ten minutes to learn. But using them correctly is one thing; keeping your program from crashing is another—memory leaks, dangling pointers, double frees, and out-of-bounds writes can each cause your program to crash for inexplicable reasons. +C's solution to this problem is dynamic memory management—requesting a block of memory of a specified size from the system while the program is running, and returning it when done. This set of APIs looks like just four functions: `malloc`, `calloc`, `realloc`, `free`, which takes ten minutes to learn. But using them correctly is one thing; keeping them from crashing is another—memory leaks, dangling pointers, double frees, out-of-bounds writes—each one can crash your program inexplicably. > **Learning Objectives** > > After completing this chapter, you will be able to: > -> - [ ] Draw a memory layout diagram of a program, explaining the responsibilities of the text/rodata/data/bss/heap/stack segments -> - [ ] Correctly use `malloc`/`calloc`/`realloc`/`free` and handle errors -> - [ ] Identify and avoid five common memory errors -> - [ ] Use Valgrind and AddressSanitizer to detect memory issues -> - [ ] Understand how RAII and smart pointers solve the pain points of manual C memory management +> - [ ] Draw a memory layout diagram and explain the responsibilities of the text/rodata/data/bss/heap/stack sections. +> - [ ] Correctly use `malloc`/`calloc`/`realloc`/`free` and handle errors. +> - [ ] Identify and avoid five common memory errors. +> - [ ] Use Valgrind and AddressSanitizer to detect memory issues. +> - [ ] Understand how RAII and smart pointers solve the pain points of manual C management. 
## Environment Setup -All of our following experiments will be conducted in this environment: +We will conduct all subsequent experiments in this environment: -- Platform: Linux x86\_64 (WSL2 is also fine) +- Platform: Linux x86\_64 (WSL2 is also acceptable) - Compiler: GCC 13+ or Clang 17+ -- Compiler flags: `-Wall -Wextra -std=c17` +- Compiler flags: `-g -O0 -Wall -Wextra` -## Step 1 — Understand What a Program Looks Like in Memory +## Step 1 — Figure out what a program looks like in memory -When a loader places an executable file into memory and starts running it, the operating system allocates a block of virtual address space for it. This space is divided into several functionally distinct regions: +When an executable is loaded into memory by the loader to start running, the operating system allocates a block of virtual address space for it. This space is divided into several functionally distinct areas: ```text -高地址 -┌──────────────────┐ -│ 内核空间 │ (用户态不可访问) -├──────────────────┤ -│ 栈 (stack) │ ← 向低地址增长 -│ ↓ │ -│ │ -│ (空闲) │ -│ │ -│ ↑ │ -│ 堆 (heap) │ ← 向高地址增长 -├──────────────────┤ -│ BSS 段 (.bss) │ 未初始化全局/static -├──────────────────┤ -│ 数据段 (.data) │ 已初始化全局/static -├──────────────────┤ -│ 只读段 (.rodata) │ const 全局、字符串字面量 -├──────────────────┤ -│ 代码段 (.text) │ 机器指令(只读、可执行) -└──────────────────┘ -低地址 +High Addresses + +------------------+ + | Stack | grows downward + +------------------+ + | | | + | v | + | | + | ^ | + | | | + +------------------+ + | Heap | grows upward + +------------------+ + | BSS | Uninitialized global/static + +------------------+ + | Data | Initialized global/static + +------------------+ + | Text | Machine code + +------------------+ +Low Addresses ``` -The **text segment** (.text) stores the compiled machine instructions and is typically read-only. The **read-only data segment** (.rodata) stores `const` global variables and string literals. 
The **initialized data segment** (.data) stores global and `static` variables that have non-zero initial values at definition. The **BSS segment** (.bss) stores global and `static` variables that are uninitialized or initialized to zero—the key difference is that `.bss` does not take up space in the executable file; it only records "N bytes need to be zeroed." The **heap** is where dynamic memory allocation takes place; memory requested by `malloc` comes from here. The **stack** is used for function calls, storing local variables and return addresses. +The **Text Segment** (.text) stores compiled machine instructions and is usually read-only. The **Read-Only Data Segment** (.rodata) stores `const` global variables and string literals. The **Initialized Data Segment** (.data) stores global and `static` variables that have non-zero initial values. The **BSS Segment** (.bss) stores global and `static` variables that are uninitialized or initialized to zero—the key difference is that **BSS** does not take up space in the executable file; it only records "need N bytes zeroed". The **Heap** is where dynamic memory allocation happens; memory applied for via `malloc` comes from here. The **Stack** is used for function calls, storing local variables and return addresses. ## Step 2 — Master malloc/calloc/realloc/free -Stack management is fully automatic—a stack frame is allocated on function call and automatically reclaimed on return. It is extremely fast (moving a single register), but it has a size limit (8 MB by default on Linux), and the memory is only valid during the current function's execution. +Stack management is completely automatic—stack frames are allocated when a function is called and automatically reclaimed when it returns. It is extremely fast (moving one register), but has size limitations (8MB by default on Linux), and memory is only valid during the execution of the current function. -Heap management is handed over to the programmer. 
It is flexible but must be managed manually—forgetting to free causes a memory leak, and freeing twice causes a crash. In real projects, the following scenarios require the heap: the data volume cannot be determined at compile time, the data's lifetime spans function calls, or the data volume is too large for the stack. +Heap management is handed over to the programmer. It is flexible but must be managed manually—if you forget to free it, it leaks; if you free it twice, it crashes. In actual projects, the following scenarios require the heap: data size cannot be determined at compile time, data lifetime spans function calls, or data is too large for the stack. -## malloc — Give Me a Block of Memory +## malloc — Give me a block of memory -```c +```cpp void* malloc(size_t size); ``` -`malloc` takes the number of bytes to allocate and returns a `void*` pointer. A basic example: +`malloc` accepts the number of bytes to allocate and returns a `void*` pointer. A basic example: -```c -#include -#include - -int main(void) { - int* numbers = malloc(10 * sizeof(*numbers)); - - if (numbers == NULL) { - fprintf(stderr, "malloc failed\n"); - return 1; - } - - for (int i = 0; i < 10; i++) { - numbers[i] = i * i; - } - - free(numbers); - return 0; +```cpp +int* arr = (int*)malloc(10 * sizeof(*arr)); +if (!arr) { + // Handle error + perror("malloc failed"); + exit(EXIT_FAILURE); } ``` -Key points: write `sizeof(*numbers)` instead of `sizeof(int)`, so the allocated size automatically adjusts when you change the pointer type. **Checking for NULL immediately after every malloc is an ironclad rule.** The contents of memory allocated by `malloc` are **uninitialized**—you will read garbage values. +Key points: Write `sizeof(*arr)` instead of `sizeof(int)`, so the allocation size changes automatically when the pointer type changes. **Checking for NULL immediately after every malloc is an iron rule.** Memory allocated by `malloc` is **uninitialized**—reading it before writing to it yields garbage values.
-## calloc — Allocate and Zero Out +## calloc — Allocate and zero out -```c -void* calloc(size_t num, size_t size); +```cpp +void* calloc(size_t nmemb, size_t size); ``` -`calloc` allocates memory and **zeros it out entirely**. Use it when you need zero-initialized structures or arrays, as it is safer. `calloc` can also detect parameter multiplication overflow, providing an extra layer of protection compared to `malloc(num * size)`. +`calloc` allocates memory and **clears it to zero**. Use it when you need zero-initialized structures or arrays—it is safer. `calloc` can also detect parameter multiplication overflow, providing an extra layer of protection compared to `malloc(nmemb * size)`. -## realloc — Resize (Possibly Relocate) +## realloc — Resize (might relocate) -```c -void* realloc(void* ptr, size_t new_size); +```cpp +void* realloc(void* ptr, size_t size); ``` -`realloc` is used to resize previously allocated memory. It expands in place or finds a new space and relocates. +`realloc` is used to adjust the size of allocated memory. It resizes the block in place, or finds new space and moves the data there. -⚠️ **The most classic pitfall**: `realloc` may return `NULL` (out of memory), but the original pointer remains valid. If you write `ptr = realloc(ptr, new_size)` directly, once it returns `NULL`, the original `ptr` is lost—a memory leak. +⚠️ **The classic pitfall**: `realloc` may return `NULL` (out of memory), but the original pointer is still valid. If you write `ptr = realloc(ptr, new_size);` directly, once it returns `NULL`, the original `ptr` is lost—a memory leak.
The correct way: -```c -int* temp = realloc(numbers, 20 * sizeof(int)); -if (temp == NULL) { - free(numbers); - return 1; +```cpp +void* new_ptr = realloc(ptr, new_size); +if (!new_ptr) { + // Handle error, ptr is still valid + perror("realloc failed"); +} else { + ptr = new_ptr; } -numbers = temp; // 成功了才更新指针 ``` -## free — Borrow and Return +## free — Return what you borrow -```c +```cpp void free(void* ptr); ``` -`free` has more caveats than it appears: you can only free a pointer returned by an allocation function; after freeing, the pointer becomes a dangling pointer; **setting the pointer to NULL after free is a good practice**—if it is accidentally used later, it will immediately cause a segmentation fault, which is ten thousand times easier to debug than a use-after-free. +The precautions for `free` are more than they seem: you can only `free` pointers returned by allocation functions; after freeing, the pointer becomes a dangling pointer; **setting to NULL after free is a good habit**—subsequent misuse will cause an immediate segmentation fault, which is ten thousand times easier to debug than use-after-free. -```c -free(numbers); -numbers = NULL; +```cpp +free(ptr); +ptr = NULL; // Good habit ``` -## Step 3 — Recognize Five Common Memory Errors +## Step 3 — Recognize five common memory errors ### 1. Memory Leak -Memory is allocated but never freed. A more insidious scenario is failing to free old memory before reassigning a pointer ("overwrite leak"), or forgetting to free in an error-handling branch. +Allocating and forgetting to free. More insidious scenarios are not releasing old memory before reassigning a pointer ("overwrite leak"), or forgetting to free in error handling branches. ### 2. Dangling Pointer / Use After Free -A pointer to freed memory continues to be used. This error does not necessarily crash immediately—that block of memory might not have been allocated to someone else yet, so the data "looks" valid, but it is completely unreliable. 
+A pointer to freed memory continues to be used. This error doesn't necessarily crash immediately—that block of memory might not have been allocated to someone else yet, so the data "looks" valid, but it is completely unreliable. ### 3. Double Free -Calling `free` twice on the same block of memory. The heap manager's internal data structures get corrupted, which can cause an immediate crash or might not manifest until much later. +Calling `free` twice on the same block of memory. The heap manager's internal data structures are corrupted, which may cause an immediate crash or strike much later. ### 4. Buffer Overflow -Writing outside the boundaries of the allocated memory region, corrupting the metadata of adjacent memory blocks or other data. Off-by-one errors are a typical cause. +Writing outside the allocated memory area, corrupting metadata of adjacent memory blocks or other data. Off-by-one errors are a typical cause. ### 5. Uninitialized Read -The contents of memory allocated by `malloc` are indeterminate. Reading without assigning a value yields garbage. +The contents of memory allocated by `malloc` are indeterminate. Reading before assigning a value yields garbage. ## Debugging Tools ### Valgrind -The most classic memory debugging tool on Linux, capable of detecting leaks, illegal reads and writes, uninitialized reads, and double frees. No recompilation is needed—just prepend `valgrind` before your program: +The most classic memory debugging tool on Linux, capable of detecting leaks, illegal reads/writes, uninitialized reads, and double frees.
No need to recompile, just add `valgrind` before the program: ```bash -gcc -g -o demo demo.c -valgrind --leak-check=full ./demo +gcc -g program.c -o program +valgrind --leak-check=full ./program ``` ### AddressSanitizer (ASan) -A compiler-built-in memory error detection tool with much lower performance overhead than Valgrind: +A compiler-built memory error detection tool with much lower performance overhead than Valgrind: ```bash -gcc -fsanitize=address -g -o demo demo.c -./demo +gcc -g -O1 -fsanitize=address -fno-omit-frame-pointer program.c -o program +./program ``` -We recommend always enabling ASan during development and testing. +It is recommended to always enable ASan during development and testing phases. -## C++ Transition — How RAII Ends the Nightmare of Manual Management +## C++ Transition — How RAII ends the nightmare of manual management -### The Core Idea of RAII +### Core Idea of RAII -Bind the lifetime of a resource to the lifetime of an object. The constructor acquires the resource, and the destructor releases it. When an object goes out of scope, its destructor is guaranteed to be called (even if an exception occurs), ensuring the resource is properly released. +Bind the lifecycle of a resource to the lifecycle of an object. The constructor acquires the resource, the destructor releases it. When the object leaves scope, the destructor is guaranteed to be called (even if exceptions occur), and the resource is guaranteed to be released correctly. ### The Three Smart Pointers -`std::unique_ptr`—exclusive ownership, not copyable but movable. Automatically releases when it goes out of scope. We recommend creating it with `std::make_unique`. +`std::unique_ptr` — Exclusive ownership, not copyable but movable. Automatically releases when leaving scope. Recommended to create with `std::make_unique`. -`std::shared_ptr`—shared ownership with reference counting. Releases memory when the last `shared_ptr` is destroyed. 
We recommend creating it with `std::make_shared`. +`std::shared_ptr` — Shared ownership + reference counting. Releases memory when the last `shared_ptr` is destroyed. Recommended to create with `std::make_shared`. -`std::weak_ptr`—does not increase the reference count; used to break circular references between `shared_ptr`. +`std::weak_ptr` — Does not increase reference count, used to break circular references between `shared_ptr`s. ### Standard Library Containers -`std::vector` replaces manually malloc'd dynamic arrays, and `std::string` replaces manually malloc'd string buffers. In modern C++, you almost never need to use `new`/`delete` directly, let alone `malloc`/`free`. +`std::vector` replaces manually `malloc`'d dynamic arrays, and `std::string` replaces manually `malloc`'d string buffers. In modern C++, you almost never need to use `new`/`delete` directly, let alone `malloc`/`free`. ## Summary -We started with memory layout, clarified the respective roles of the stack and the heap, dissected the semantics and pitfalls of the four dynamic memory functions one by one, summarized the five most common memory errors, and finally compared C++'s RAII and smart pointers. Dynamic memory management is one of the most error-prone areas in C, but once you master the right methodology and tools, most errors can be avoided. +We started with memory layout, clarified the roles of stack and heap, dissected the semantics and traps of the four dynamic memory functions one by one, summarized the five most common memory errors, and finally compared C++'s RAII and smart pointers. Dynamic memory management is one of the most error-prone areas in C, but after mastering the correct methodology and tools, most errors can be avoided. ## Exercises ### Exercise 1: Fixed-Size Memory Pool Allocator -Implement a simple fixed-size memory pool that carves fixed-size blocks from a large chunk of memory, supporting allocation and reclamation. 
- -```c -#include -#include -#include - -typedef struct MemoryPool MemoryPool; - -/// @brief 创建一个固定大小内存池 -/// @param block_size 每个块的大小(字节) -/// @param block_count 块的数量 -/// @return 指向内存池的指针,失败返回 NULL -MemoryPool* pool_create(size_t block_size, size_t block_count); - -/// @brief 从内存池中分配一个块 -void* pool_alloc(MemoryPool* pool); +Implement a simple fixed-size memory pool that carves fixed-size blocks from a large block of memory, supporting allocation and reclamation. -/// @brief 将块归还给内存池 -void pool_free(MemoryPool* pool, void* block); +```cpp +// Implement a fixed-size memory pool +#define BLOCK_SIZE 64 +#define POOL_SIZE 1024 -/// @brief 销毁内存池,释放所有内存 -void pool_destroy(MemoryPool* pool); - -int main(void) { - // TODO: 创建一个 64 字节/块、共 64 块的内存池 - // TODO: 分配几个块,写入数据,然后释放 - // TODO: 销毁内存池 - return 0; -} +void* pool_alloc(); +void pool_free(void* ptr); ``` -Hint: Use a linked list to manage free blocks—store a pointer to the next free block in the first few bytes of each free block. +Hint: Use a linked list to manage free blocks—the first few bytes of each free block store a pointer to the next free block. ### Exercise 2: malloc/free Wrapper with Statistics -Implement a wrapper layer around `malloc` and `free` that tracks all allocation and deallocation operations, printing a statistical report when the program exits. +Implement a wrapper layer for `malloc` and `free` that tracks all allocation and deallocation operations and prints a statistical report when the program exits. 
-```c -#include -#include - -/// @brief 带统计的 malloc +```cpp +// Implement a wrapper for malloc/free void* tracked_malloc(size_t size, const char* file, int line); - -/// @brief 带统计的 free void tracked_free(void* ptr); -/// @brief 打印内存统计报告 -void mem_report(void); - -#define TMALLOC(size) tracked_malloc((size), __FILE__, __LINE__) - -int main(void) { - // TODO: 用 TMALLOC 分配几块内存 - // TODO: 故意只释放其中一部分 - // TODO: 调用 mem_report() 查看哪些分配没有被释放 - return 0; -} +// Macro to automatically capture file and line +#define MALLOC(size) tracked_malloc(size, __FILE__, __LINE__) +#define FREE(ptr) tracked_free(ptr) ``` -Hint: Use an array or linked list to record the details of each allocation. `atexit(mem_report)` can register an exit hook. +Hint: Use an array or linked list to record information for each allocation. `atexit` can register an exit hook. diff --git a/documents/en/vol1-fundamentals/c_tutorials/15-preprocessor-and-multifile.md b/documents/en/vol1-fundamentals/c_tutorials/15-preprocessor-and-multifile.md index 2aa6e4d0a..af04b761d 100644 --- a/documents/en/vol1-fundamentals/c_tutorials/15-preprocessor-and-multifile.md +++ b/documents/en/vol1-fundamentals/c_tutorials/15-preprocessor-and-multifile.md @@ -4,9 +4,10 @@ cpp_standard: - 11 - 14 - 17 -description: Master how the C preprocessor works, learn to use macros, conditional - compilation, and header guards to build modular, multi-file C projects, and compare - C++ alternatives using `const`/`inline`/`constexpr`/`template`. +description: Master the inner workings of the C preprocessor, learn to use macros, + conditional compilation, and header guards, build modular multi-file C projects, + and compare these with C++ alternatives such as `const`, `inline`, `constexpr`, + and templates. 
difficulty: beginner order: 19 platform: host @@ -19,218 +20,201 @@ tags: - beginner - 入门 - CMake -title: Preprocessor and Multi-File Projects +title: Preprocessor and Multi-file Projects translation: source: documents/vol1-fundamentals/c_tutorials/15-preprocessor-and-multifile.md - source_hash: 5a39e8b5513dbeab3f17ea2b0aeea054462add24bba15a9a45610dc54b5b234e - translated_at: '2026-05-26T10:33:24.098020+00:00' + source_hash: b5c9c89effc7a423196745c4c035b15ec8eb90864e5504e5d5803fd3a9dd63e0 + translated_at: '2026-06-13T11:42:44.455279+00:00' engine: anthropic - token_count: 1131 + token_count: 1128 --- # The Preprocessor and Multi-File Projects -If you have been writing all of your C programs in a single ``.c`` file up to this point, you will eventually hit a wall. In real-world projects, we split code into multiple ``.c`` and ``.h`` files, where each module handles its own responsibilities, and then we assemble them into a complete program through compilation and linking. +If you have been writing all your C code in a single `.c` file up to this point, you will eventually hit a wall. In real-world projects, we split code into multiple `.c` and `.h` files, where each module handles its own responsibilities. We then compile and link them to assemble the complete program. -However, multi-file projects bring more than just organizational challenges—they also bring up a frequently misunderstood role in C: the **preprocessor**. Understanding the true nature of the preprocessor is the first step toward avoiding inexplicable compilation errors, strange macro expansion behavior, and circular header inclusion. +However, multi-file projects bring more than just organizational challenges; they also introduce a frequently misunderstood character in C—the **preprocessor**. Understanding the nature of the preprocessor is the first step to avoiding baffling compilation errors, strange macro expansion behaviors, and circular header file inclusions. 
> **Learning Objectives** > > After completing this chapter, you will be able to: > -> - [ ] Understand the role of the preprocessing stage within the four stages of compilation -> - [ ] Correctly use ``#include``, ``#define``, conditional compilation, and other preprocessor directives -> - [ ] Master macro writing techniques and common pitfalls -> - [ ] Organize headers using include guards and ``#pragma once`` -> - [ ] Build multi-file C projects and understand compilation units and the linking process -> - [ ] Compare C++ alternatives such as const/inline/constexpr/template/modules +> - [ ] Understand the role of the preprocessing stage within the four stages of compilation. +> - [ ] Correctly use preprocessing directives like `#include`, `#define`, and conditional compilation. +> - [ ] Master macro writing techniques and common pitfalls. +> - [ ] Organize header files using header guards and `#pragma once`. +> - [ ] Build multi-file C projects and understand translation units and the linking process. +> - [ ] Compare C++ alternatives such as `const`/`inline`/`constexpr`/templates/modules. ## Environment Setup -We will conduct all of the following experiments in this environment: +We will conduct all subsequent experiments in the following environment: -- Platform: Linux x86\_64 (WSL2 is also fine) -- Compiler: GCC 13+ or Clang 17+ -- Compiler flag: ``-Wall -Wextra -std=c17`` +- Platform: Linux x86_64 (WSL2 is also acceptable). +- Compiler: GCC 13+ or Clang 17+. +- Compiler flags: `-std=c17 -Wall -Wextra -pedantic`. -## Step One — Understanding What the Preprocessor Does +## Step 1 — Understanding What the Preprocessor Does -Transforming a C program from source code into an executable file goes through four stages: preprocessing, compilation, assembly, and linking. The preprocessor is the first station on this line, performing **pure text transformations** on the source file—any line starting with ``#`` is a preprocessor directive. 
+Transforming a C program from source code into an executable file involves four stages: preprocessing, compilation, assembly, and linking. The preprocessor is the first station; it performs **pure text transformation** on the source files—all lines starting with `#` are preprocessing directives. -The preprocessor does not understand C. It does not know what types or scopes are; it only mechanically performs replacements, deletions, and conditional selections. You can use ``gcc -E -P demo.c`` to view the preprocessed output and see just how "brutal" the preprocessor is. +The preprocessor does not understand the C language. It knows nothing about types or scope; it mechanically performs substitution, deletion, and conditional selection. You can use `gcc -E` to view the preprocessed output and see how "brutal" the preprocessor really is. -## #include: The Most Brutal Text Paste +## #include: The Most Brutal Text Pasting -The behavior of ``#include`` is very straightforward—it inserts the entire contents of the specified file verbatim into the current position. This is why we call it a text paste, not a module import. +The behavior of `#include` is very direct—it inserts the entire content of the specified file exactly at the current location. This is why we say it is text pasting, not module importing. -Angle brackets ``<>`` search in system header directories, while double quotes ``""`` search the current directory first, then fall back to system directories. Nested includes can lead to severe code bloat. +Angle brackets `< >` search in system header directories, while double quotes `" "` search the current directory first, then system directories. Nested includes can lead to severe code bloat. 
-## Step Two — Mastering Macro Writing Techniques and Pitfalls +## Step 2 — Mastering Macro Writing Techniques and Pitfalls -### Object-Like Macros: Defining Constants +### Object-like Macros: Constant Definitions -````c -#define kMaxBufferSize 1024 -#define kVersionString "1.0.0" +```c +#define PI 3.14159 +#define MAX_SIZE 100 +``` -char buffer[kMaxBufferSize]; -```` +⚠️ **Do not add a semicolon** at the end of a macro definition. The preprocessor will include the semicolon as part of the replacement text. -⚠️ **Do not add a semicolon** at the end of a macro definition. ``#define kMaxBufferSize 1024;`` will include the semicolon as part of the replacement text. +### Function-like Macros: Text Replacement with Parameters -### Function-Like Macros: Text Replacement with Parameters +Parentheses are the summary of lessons learned the hard way: -Parentheses are the summary of hard-learned lessons: +```c +// Correct: Wrap the whole expression and parameters +#define ADD(a, b) ((a) + (b)) +#define MUL(a, b) ((a) * (b)) +``` -````c +Consequences of missing parentheses: + +```c +#define BAD_ADD(a, b) a + b +// ... +int x = BAD_ADD(1, 2) * 3; // Expands to: 1 + 2 * 3 = 7 (Wrong!) +``` + +However, parentheses cannot solve the **multiple evaluation** problem: + +```c #define SQUARE(x) ((x) * (x)) -#define MAX(a, b) ((a) > (b) ? (a) : (b)) -```` - -The consequences of omitting parentheses: - -````c -#define BAD_SQUARE(x) x * x -int r = BAD_SQUARE(2 + 3); // 展开为 2 + 3 * 2 + 3 = 11,而不是 25 -```` - -But parentheses cannot solve the **double evaluation** problem: - -````c -int x = 5; -int r = MAX(x++, 10); -// 展开为 ((x++) > (10) ? (x++) : (10)) -// x++ 被求值了两次!x 最终变成了 7 而不是 6 -```` - -### Multi-Line Macros and the do-while(0) Idiom - -````c -#define SAFE_FREE(ptr) \ - do { \ - if ((ptr) != NULL) { \ - free((ptr)); \ - (ptr) = NULL; \ - } \ +int i = 1; +int val = SQUARE(i++); // i is incremented twice! 
Undefined behavior +``` + +### Multi-line Macros and the do-while(0) Idiom + +```c +#define SAFE_SWAP(type, a, b) \ + do { \ + type temp = (a); \ + (a) = (b); \ + (b) = temp; \ } while (0) -```` +``` -``do { ... } while(0)`` forms a single statement as a whole, avoiding dangling else issues within ``if-else`` branches. This technique is ubiquitous in the Linux kernel codebase. +`do { ... } while (0)` acts as a single statement, preventing dangling `else` issues within `if` branches. This technique is ubiquitous in the Linux kernel code. -## # and ## Operators +## The # and ## Operators -``#`` turns a macro parameter into a string, while ``##`` concatenates two tokens into a new token: +`#` turns a macro parameter into a string, and `##` glues two tokens together to form a new token: -````c -#define STRINGIFY(x) #x -#define MAKE_VAR(prefix, num) prefix ## num +```c +#define STR(x) #x +#define CONCAT(a, b) a##b -int MAKE_VAR(value, 1) = 10; // 展开为 int value1 = 10; -```` +// STR(hello) -> "hello" +// CONCAT(var, 123) -> var123 +``` ## Conditional Compilation -### Include Guards +### Header Guards -The traditional approach uses a ``#ifndef`` + ``#define`` combination, while modern compilers support the more concise ``#pragma once``: +The traditional approach uses `#ifndef` + `#define` + `#endif`, while modern compilers support the more concise `#pragma once`: -````c -// math_utils.h -#pragma once +```c +#ifndef MY_HEADER_H +#define MY_HEADER_H -int add(int a, int b); -int multiply(int a, int b); -```` +// Declarations... -``#pragma once`` is not part of the C standard, but GCC, Clang, and MSVC all support it. It has become the de facto standard practice in C++ projects. +#endif // MY_HEADER_H +``` + +`#pragma once` is not part of the C standard, but GCC, Clang, and MSVC all support it. It is the de facto standard in C++ projects. ### Typical Use Cases -Debug/Release switching, platform adaptation, and feature toggles—all of these rely on conditional compilation. 
+Debug/Release switching, platform adaptation, and feature toggles—all rely on conditional compilation. -## Step Three — Learning to Organize Headers and Multi-File Projects +## Step 3 — Learning to Organize Header Files and Multi-File Projects -Headers contain **declarations**, while source files contain **definitions**. +Header files contain **declarations**, while source files contain **definitions**. -The correct use of ``extern``: declare with ``extern`` in the header, and define in **one** ``.c`` file: +Correct use of `extern`: declare with `extern` in the header file, and define in **one** `.c` file: -````c +```c // config.h -extern int kConfigMaxRetryCount; +extern int global_counter; // config.c -#include "config.h" -int kConfigMaxRetryCount = 3; -```` +int global_counter = 0; +``` -⚠️ Writing ``int kConfigMaxRetryCount = 3;`` (without ``extern``) in a header and including it in multiple ``.c`` files will result in a ``multiple definition`` error. +⚠️ Writing `int x;` (without `extern`) in a header file included by multiple `.c` files will result in a **multiple definition** error. -## Multi-File Compilation and Linking +## Multi-file Compilation and Linking -Each ``.c`` file plus all the headers it ``#include`` constitutes a **compilation unit**. The compiler processes each compilation unit independently, and the linker is responsible for stitching all the ``.o`` files together. +Each `.c` file plus all the headers it `#include`s constitutes a **translation unit**. The compiler processes each translation unit independently, and the linker is responsible for stitching all `.o` files together. -The ``static`` keyword restricts symbol visibility to the current compilation unit—the linker cannot see it, and other ``.c`` files cannot reference it. +The `static` keyword restricts symbol visibility to the current translation unit—the linker cannot see it, and other `.c` files cannot reference it. 
## Introduction to Static Libraries -````bash -# 编译为目标文件 -gcc -c math_utils.c -# 创建静态库 -ar rcs libmath_utils.a math_utils.o -# 使用静态库 -gcc -o demo main.c -L. -lmath_utils -```` +```text +ar rcs libmath.a math.o vector.o +``` ## C++ Connections -- ``const``/``constexpr`` replace macro constants—they provide types, scopes, and debuggability -- ``inline`` functions replace function-like macros—parameters are evaluated only once, with type checking -- ``template`` replaces generic macros—providing full type checking and compile-time validation -- ``namespace`` replaces file-level ``static``—offering clearer namespace organization -- ``using`` replaces ``typedef``—with more intuitive syntax and support for alias templates -- C++20 Modules—using ``export``/``import`` to replace the text-pasting ``#include`` +- `const` / `constexpr` replace macro constants—they have types, scope, and are debuggable. +- `inline` functions replace function-like macros—parameters are evaluated once, with type checking. +- `template`s replace generic macros—full type checking and compile-time validation. +- `namespace`s replace file-level `static`—clearer namespace organization. +- `using` replaces `typedef`—more intuitive syntax, supporting alias templates. +- C++20 Modules—use `import`/`export` instead of text-pasting `#include`. ## Summary -Although the preprocessor is primitive, it remains an indispensable glue in multi-file C projects. C++ gradually replaces preprocessor functionality with safer mechanisms like ``constexpr``, ``inline``, ``template``, ``namespace``, and Modules. Only by understanding the true nature of the preprocessor can we understand why C++ made these improvements. +Although primitive, the preprocessor is an indispensable adhesive in C language multi-file projects. C++ gradually replaces preprocessor functionality with safer mechanisms like `const`, `inline`, `constexpr`, templates, and Modules. 
Understanding the essence of the preprocessor allows us to understand why C++ implements these improvements. ## Exercises -### Exercise 1: Build a Multi-File Modular Project - -````c -// math_utils.h -#pragma once -// TODO: 声明 clamp_int 和 count_digits - -// math_utils.c -#include "math_utils.h" -// TODO: 实现 clamp_int(将 value 限制在 [min_val, max_val] 范围内) -// TODO: 实现 count_digits(计算整数的十进制位数) - -// main.c -#include -#include "math_utils.h" -int main(void) { - // TODO: 调用两个函数,验证结果 - return 0; -} -```` - -Hint: The compilation steps are ``gcc -c math_utils.c``, ``gcc -c main.c``, and ``gcc -o demo main.o math_utils.o``. Use ``ar rcs libmath_utils.a math_utils.o`` to package the static library. - -### Exercise 2: Zero-Overhead DEBUG_LOG Macro - -````c -// debug_log.h -#pragma once - -#ifdef NDEBUG -// TODO: Release 模式——DEBUG_LOG 展开为空 -#else -// TODO: Debug 模式——输出 [DEBUG] 文件名:行号: 格式化消息 -// 提示:使用 __FILE__、__LINE__、__VA_ARGS__ -#endif -```` +### Exercise 1: Build a Multi-file Modular Project + +```text +project/ +├── include/ +│ ├── math_utils.h +│ └── string_utils.h +├── src/ +│ ├── math_utils.c +│ ├── string_utils.c +│ └── main.c +└── Makefile +``` + +Hint: The compilation steps are `gcc -c`, `gcc -o`, and `./app`. To package a static library, use `ar rcs`. + +### Exercise 2: Zero-Cost DEBUG_LOG Macro + +```c +#define DEBUG_LOG(fmt, ...) \ + do { \ + if (DEBUG_MODE) \ + printf("[DEBUG] " fmt "\n", __VA_ARGS__); \ + } while (0) +``` -Hint: The syntax for variadic macros is ``#define DEBUG_LOG(fmt, ...) fprintf(stderr, fmt, __VA_ARGS__)``. GCC provides the ``##__VA_ARGS__`` extension to handle the trailing comma issue when there are no additional arguments. +Hint: The syntax for variadic macros is `__VA_ARGS__`. GCC provides the `##__VA_ARGS__` extension to handle the trailing comma issue when there are no extra arguments. 
diff --git a/documents/en/vol1-fundamentals/c_tutorials/16-file-io-and-stdlib.md b/documents/en/vol1-fundamentals/c_tutorials/16-file-io-and-stdlib.md index bdda9519f..2eb408084 100644 --- a/documents/en/vol1-fundamentals/c_tutorials/16-file-io-and-stdlib.md +++ b/documents/en/vol1-fundamentals/c_tutorials/16-file-io-and-stdlib.md @@ -4,9 +4,9 @@ cpp_standard: - 11 - 14 - 17 -description: Master C file operations and core standard library utilities, including - file reading and writing, formatted I/O, and command-line argument handling, while - comparing them with C++ stream libraries and modern standard library tools. +description: Master C file operations and core standard library tools, including file + I/O, formatted I/O, and command-line argument handling, while comparing them with + C++ stream libraries and modern standard library tools. difficulty: beginner order: 20 platform: host @@ -24,21 +24,21 @@ tags: title: File I/O and Standard Library Overview translation: source: documents/vol1-fundamentals/c_tutorials/16-file-io-and-stdlib.md - source_hash: 88bc8438892bedfa8ab275295bdd523964722ec4274747e9050119318ef21109 - translated_at: '2026-05-26T10:33:51.727153+00:00' + source_hash: e9a734634f87a00129e5ca66d6817aec7c2976dd5bdea8a4ba8ef4fa7c84c657 + translated_at: '2026-06-13T11:43:06.371064+00:00' engine: anthropic - token_count: 1857 + token_count: 1855 --- # File I/O and Standard Library Overview -So far, every program we have written shares a common limitation—all data lives in memory and vanishes the moment the program ends. Real-world programs do not work this way: configurations are read from files, logs are written to files, and data is passed between programs. This is where file I/O comes in. +Up to this point, every program we have written shares a common limitation—data resides entirely in memory and vanishes once the program ends. 
Real-world programs do not work this way: configurations must be read from files, logs written to files, and data transferred between programs. This is where file I/O comes into play. -C's file operations are built on a simple but powerful API—`fopen` to open, `fread`/`fwrite` to read and write, `fclose` to close, plus the `printf`/`scanf` family for formatted I/O. These functions have survived from the 1970s to today. But they also carry the rough edges of that era—type unsafe, error handling relies on global variables, and compilers look the other way when format strings and arguments mismatch. C++ later repackaged this system with stream libraries, `std::filesystem`, and `std::format`, but understanding C's raw API remains foundational. +C's file operations are built upon a concise yet powerful API—`fopen` to open, `fread`/`fwrite` to read and write, `fclose` to close, plus the `printf`/`scanf` family for formatted input and output. These functions have survived from the 1970s to the present day. However, they also carry the rough edges characteristic of that era—type safety issues, error handling relying on global variables, and compilers turning a blind eye to mismatches between format strings and arguments. C++ later repackaged this system with the stream library, `std::filesystem`, and `std::format`, but understanding C's raw API remains the foundation. 
> **Learning Objectives** > > - After completing this chapter, you will be able to: -> - [ ] Proficiently use file operation functions like fopen/fclose/fread/fwrite +> - [ ] Skillfully use file operation functions like fopen/fclose/fread/fwrite > - [ ] Understand the difference between text mode and binary mode > - [ ] Master formatted I/O with the printf/scanf family > - [ ] Use errno/perror/strerror for error handling @@ -46,80 +46,61 @@ C's file operations are built on a simple but powerful API—`fopen` to open, `f > - [ ] Understand core standard library utilities > - [ ] Understand how C++'s stream library, std::filesystem, and std::format improve upon C's approach -## Environment Setup +## Environment -All code in this article has been verified under the following environment: +All code in this article has been verified in the following environment: - **Operating System**: Linux (Ubuntu 22.04+) / WSL2 / macOS -- **Compiler**: GCC 11+ (confirm the version via `gcc --version`) -- **Compiler flags**: `gcc -Wall -Wextra -std=c11` (enable warnings, specify C11 standard) +- **Compiler**: GCC 11+ (Confirm version via `gcc --version`) +- **Compiler Flags**: `-Wall -Wextra -std=c11` (Enable warnings, specify C11 standard) - **Verification**: All code can be compiled and run directly -## Step One — Getting Started with File Operations +## Step 1 — Getting Started with File Operations ### Opening and Closing Files ```c -#include -#include - -int main(void) { - FILE* fp = fopen("data.txt", "r"); - if (fp == NULL) { - perror("Failed to open data.txt"); - return EXIT_FAILURE; - } - // ... 读写操作 ... - fclose(fp); - return 0; +FILE *fp = fopen("log.txt", "w"); // Open for writing +if (!fp) { + // Handle error } +// ... perform operations ... +fclose(fp); ``` -> ⚠️ **Pitfall Warning**: **Always check if fopen returns NULL**. A missing file, insufficient permissions, or an incorrect path will cause the open to fail. 
If you use the NULL pointer directly without checking, the program will crash immediately—without any meaningful error message. +> ⚠️ **Pitfall Warning**: **Always check if fopen returns NULL**. File not found, insufficient permissions, or incorrect paths will cause the open to fail. If you use a NULL pointer directly without checking, the program will crash immediately—without any meaningful error message. -Mode string quick reference: +Mode string cheat sheet: -| Mode | Read | Write | When file does not exist | When file already exists | -|------|------|-------|--------------------------|--------------------------| -| `"r"` | Yes | No | Fails | Reads from the beginning | -| `"w"` | No | Yes | Creates a new file | **Clears existing content** | -| `"a"` | No | Yes | Creates a new file | Appends to the end | -| `"r+"` | Yes | Yes | Fails | Reads and writes from the beginning | -| `"w+"` | Yes | Yes | Creates a new file | **Clears, then reads and writes** | -| `"a+"` | Yes | Yes | Creates a new file | Reads from the beginning, writes append to the end | +| Mode | Read | Write | If file doesn't exist | If file already exists | +|------|------|-------|----------------------|-------------------------| +| `"r"` | Yes | No | Fails | Reads from start | +| `"w"` | No | Yes | Creates new file | **Clears original content** | +| `"a"` | No | Yes | Creates new file | Appends to end | +| `"r+"` | Yes | Yes | Fails | Reads and writes from start | +| `"w+"` | Yes | Yes | Creates new file | **Clears then reads/writes** | +| `"a+"` | Yes | Yes | Creates new file | Reads from start, writes append to end | -> ⚠️ **Pitfall Warning**: `"w"` and `"w+"` will **unconditionally clear** the contents of an existing file. If you meant to append content but used the `"w"` mode instead, congratulations—your file content instantly drops to zero, with no confirmation step. Always double-check the mode before using it. 
+> ⚠️ **Pitfall Warning**: `"w"` and `"w+"` will **unconditionally clear** the contents of an existing file. If you meant to append content but used the `"w"` mode, congratulations—the file content is instantly zeroed out, and there is no confirmation step. Always confirm the mode is correct before use. ### Reading and Writing Binary Data ```c -typedef struct { - uint16_t id; - float value; - uint32_t timestamp; -} Record; - -// 写入 -size_t written = fwrite(records, sizeof(Record), count, fp); - -// 读取 -size_t count = fread(buffer, sizeof(Record), max_count, fp); +int data[256]; +size_t count = fread(data, sizeof(int), 256, fp); // Read 256 integers +fwrite(data, sizeof(int), count, fp); // Write them back ``` -The return value is the number of **complete blocks** successfully processed, not the number of bytes. If the return value is less than the requested number of blocks, it means either the end of the file was reached or an error occurred. +The return value is the number of **complete blocks** successfully processed, not the number of bytes. If the return value is less than the requested number of blocks, it indicates either end-of-file or an error. -### Moving the File Position and Getting the Size +### Moving File Position and Getting Size -`fseek` moves the position pointer, and `ftell` queries the current position. A practical pattern is to get the file size: +`fseek` moves the position pointer, `ftell` queries the current position. A useful pattern is to get the file size: ```c -long get_file_size(FILE* fp) { - long original = ftell(fp); - fseek(fp, 0, SEEK_END); - long size = ftell(fp); - fseek(fp, original, SEEK_SET); - return size; -} +fseek(fp, 0, SEEK_END); // Jump to end +long size = ftell(fp); // Get position = size +fseek(fp, 0, SEEK_SET); // Jump back to start ``` ### Don't Use feof as a Loop Condition @@ -127,44 +108,45 @@ long get_file_size(FILE* fp) { `feof` only returns true **after** a read operation has already failed. 
The correct approach is to check the return value of the read function directly: ```c -int ch; -while ((ch = fgetc(fp)) != EOF) { - putchar(ch); +int c; +while ((c = fgetc(fp)) != EOF) { + putchar(c); } ``` -> ⚠️ **Pitfall Warning**: `fgetc` returns `int`, not `char`. If you use `char` to receive the return value, `EOF` (-1) will be truncated to a valid character value on some platforms, causing the loop to never end. This trap catches a new batch of beginners every year. +> ⚠️ **Pitfall Warning**: `fgetc` returns `int` rather than `char`. If you use `char` to receive the return value, on some platforms `EOF` (-1) will be truncated to a valid character value, causing the loop to never end. This pitfall catches a batch of newbies every year. -## Step Two — Mastering Formatted I/O +## Step 2 — Mastering Formatted I/O ### The printf Family -`printf` outputs to stdout, `fprintf` outputs to a specified file, and `sprintf`/`snprintf` output to a string buffer. The return value is the actual number of characters output. +`printf` outputs to stdout, `fprintf` outputs to a specified file, `sprintf`/`snprintf` output to a string buffer. The return value is the actual number of characters output. ```c +int year = 2025; +printf("Year: %d\n", year); // Returns 11 (10 visible chars + newline) char buf[64]; -snprintf(buf, sizeof(buf), "%s:%d", name, age); +int len = snprintf(buf, sizeof(buf), "%d", year); // Returns 4 ``` A clever use of `snprintf` is to probe the required buffer size: ```c -int needed = snprintf(NULL, 0, "Result: %d items", item_count); -char* buf = malloc(needed + 1); -snprintf(buf, needed + 1, "Result: %d items", item_count); +int needed = snprintf(NULL, 0, "%d %s", 42, "test"); // Returns 7, excluding null terminator +char *buf = malloc(needed + 1); +snprintf(buf, needed + 1, "%d %s", 42, "test"); ``` ### The scanf Family -`scanf` returns the **number of successfully matched fields**. 
`sscanf` is very convenient for parsing from strings: +`scanf` returns the **number of fields successfully matched**. `sscanf` is very convenient for parsing from strings: ```c -const char* input = "2024-01-15"; -int year, month, day; -int count = sscanf(input, "%d-%d-%d", &year, &month, &day); +int x, y; +sscanf("10:20", "%d:%d", &x, &y); // Returns 2, x=10, y=20 ``` -> ⚠️ **Pitfall Warning**: `scanf`'s `%s` does not check buffer size. The safe approach is to use `%Ns` to specify the maximum length, or switch to the `fgets` + `sscanf` combination. +> ⚠️ **Pitfall Warning**: `scanf`'s `%s` does not check buffer size. The safe approach is to give `%s` an explicit field width (for example `%63s` for a 64-byte buffer, leaving room for the null terminator), or switch to the `fgets` + `sscanf` combination. ### Common Format Specifiers @@ -174,39 +156,37 @@ int count = sscanf(input, "%d-%d-%d", &year, &month, &day); | `%u` | unsigned | `%s` | string | | `%x` | hex | `%zu` | size_t | | `%ld` | long | `%lld` | long long | -| `%p` | pointer | `%%` | literal % | +| `%p` | pointer | `%%` | Literal % | -## Step Three — Understanding Text Mode vs. Binary Mode +## Step 3 — Understanding Text Mode vs. Binary Mode -On Windows, text mode automatically converts `\n` to `\r\n`, while binary mode does not perform this conversion. On Linux/macOS, there is almost no difference between the two. When handling binary data (images, struct dumps, protocol frames), always use `"rb"`/`"wb"`. +On Windows, text mode automatically converts `\n` to `\r\n` when writing and `\r\n` back to `\n` when reading, while binary mode makes no conversion. On Linux/macOS, there is almost no difference between the two. When handling binary data (images, structure dumps, protocol frames), always use `"rb"`/`"wb"`. -> ⚠️ **Pitfall Warning**: If you read a binary file in text mode on Windows, the read will terminate early when it encounters a `0x1A` byte—because `0x1A` is treated as EOF in Windows text mode. This is a classic cross-platform trap. 
+> ⚠️ **Pitfall Warning**: If you read a binary file in text mode on Windows, the read will terminate early when encountering a `0x1A` byte—because `0x1A` (Ctrl+Z) is treated as EOF in Windows text mode. This is a classic cross-platform trap. -## Step Four — Error Handling with errno +## Step 4 — Error Handling with errno -`errno` (`<errno.h>`) is a global error code variable. When a function executes successfully, it **does not** clear `errno`; it is only set when an error occurs. The correct approach is to first check the return value to confirm an error, and then read `errno`. +`errno` (in `<errno.h>`) is a global error code variable. Functions do **not** clear `errno` on success; they only set it when an error occurs. The correct practice is to check the return value first to confirm an error, and then read `errno`. -`perror` concatenates the string you pass in with the system error message and prints it: +`perror` concatenates the string you pass in with the system error message and prints the result to stderr: ```c -FILE* fp = fopen("nonexistent.txt", "r"); -if (fp == NULL) { - perror("fopen failed"); - // 输出:fopen failed: No such file or directory +if (ferror(fp)) { + perror("File read failed"); // Prints: File read failed: Error description } ``` -`strerror` returns the string description corresponding to the error code, which is suitable for use in custom error messages. +`strerror` returns the string description corresponding to the error code, suitable for use in custom error messages. 
-## Step Five — Handling Command-Line Arguments +## Step 5 — Handling Command-Line Arguments ```c -int main(int argc, char* argv[]) { - printf("Program: %s\n", argv[0]); - for (int i = 1; i < argc; i++) { - printf(" argv[%d] = %s\n", i, argv[i]); +int main(int argc, char *argv[]) { + if (argc < 2) { + printf("Usage: %s \n", argv[0]); + return 1; } - return 0; + // argv[1] is the first argument } ``` @@ -216,127 +196,76 @@ int main(int argc, char* argv[]) { ### `<stdlib.h>`: General Utilities -`atoi` is simple but lacks error detection, while `strtol` is safer (it can detect overflow and partial parsing). `qsort` performs quicksort, and `bsearch` performs binary search, both comparing via function pointers. The random quality of `rand`/`srand` pseudo-random numbers is fairly poor—good enough for basic use, but do not rely on them for security-related tasks. +`atoi` is simple but offers no error detection; `strtol` is safer (it can detect overflow and partial parsing). `qsort` sorts an array and `bsearch` performs binary search, both taking a comparison function pointer. The pseudo-random numbers from `rand`/`srand` are of poor quality; they are good enough for casual use, but do not rely on them for security-related tasks. ### `<math.h>`: Math Functions -Trigonometric functions (sin/cos/tan), exponentials and logarithms (pow/sqrt/log/exp), rounding (ceil/floor/round), and absolute values (fabs). All have three versions: float (f suffix), double, and long double (l suffix). +Trigonometric functions (sin/cos/tan), exponential/logarithmic (pow/sqrt/log/exp), rounding (ceil/floor/round), absolute value (fabs). All have three versions: float (f suffix), double, and long double (l suffix). -> ⚠️ **Pitfall Warning**: Linking the math library on GCC/Linux requires the `-lm` option. If you forget to add this option, the compiler will report errors like `undefined reference to 'sin'`—the code itself is fine, it is just missing a linker option. 
+> ⚠️ **Pitfall Warning**: Linking the math library on GCC/Linux requires the `-lm` option. If you forget to add this option, the linker will report `undefined reference to 'pow'` or similar errors—the code itself is fine, it is just missing a link option. ### `<ctype.h>`: Character Classification -`isalpha`/`isdigit`/`isspace`/`isalnum`/`isupper`/`islower` determine character categories, and `tolower`/`toupper` convert between uppercase and lowercase. The argument must be explicitly cast to `unsigned char` first, otherwise negative values from a signed char will lead to undefined behavior. +`isdigit`/`isalpha`/`isalnum`/`isxdigit`/`isupper`/`islower` determine character categories; `toupper`/`tolower` convert case. Arguments must be cast to `unsigned char` first, otherwise negative values of signed `char` can lead to undefined behavior. -### `<assert.h>`: Assertion Macro +### `<assert.h>`: Assert Macro ```c -assert(arr != NULL); // Debug: 条件为假时终止程序 +assert(ptr != NULL); // If false, abort program ``` -Defining `NDEBUG` completely removes all asserts. Use this to catch programming errors, not to handle runtime errors. +Defining `NDEBUG` removes all asserts completely. Use asserts to catch programming errors, not to handle runtime errors. ### `<stddef.h>`: Fundamental Types -`size_t` (object size), `NULL` (null pointer), `offsetof` (struct offset), `ptrdiff_t` (pointer difference). `size_t` is unsigned, so watch out for underflow when iterating in reverse: `for (size_t i = count; i-- > 0; )` is the safe way to write it. +`size_t` (object size), `NULL` (null pointer), `offsetof` (structure member offset), `ptrdiff_t` (pointer difference). `size_t` is unsigned; watch out for underflow when iterating in reverse: `for (size_t i = n; i-- > 0;)` is the safe way to write it. -## C++ Connections +## C++ Bridge ### Stream Library (iostream/fstream/sstream) -The C++ stream library achieves **type safety** through operator overloading—passing the wrong type directly causes a compilation failure. 
Destructors automatically close files (RAII). `std::getline` directly returns `std::string`, eliminating the risk of buffer overflows. +The C++ stream library achieves **type safety** through operator overloading—passing the wrong type results in a compilation failure. Destructors automatically close files (RAII). `std::getline` reads a whole line directly into a `std::string` that grows as needed, eliminating buffer overflow risks. ### std::filesystem (C++17) -Cross-platform directory traversal, file attribute queries, and path manipulation—no more need to write `#ifdef _WIN32`. +Cross-platform directory traversal, file attribute queries, path manipulation—no more need to write `#ifdef _WIN32`. ### std::format (C++20) Combines the concise syntax of printf with type safety: ```cpp -std::string s = std::format("{} is {} years old", name, age); +std::string s = std::format("Year: {}", 2025); ``` -### std::span (C++17) +### std::span (C++20) -`std::span` bundles a pointer and a length together, solving the old problem of arrays decaying and losing their length information. +`std::span` binds a pointer and a length together, solving the long-standing problem of array decay losing length information. ### `<system_error>` -`std::error_code` is a value type and thread-safe, making it much safer than the global `errno`. +`std::error_code` is a value type and thread-safe, much safer than the global `errno`. ## Summary -The core of file operations is `FILE*` and `fopen`/`fclose`/`fread`/`fwrite`, formatted I/O relies on the `printf`/`scanf` family, and error handling depends on `errno` + `perror`. The standard library provides fundamental tools like numeric conversion, sorting and searching, math functions, character classification, and assertions. C++ delivers a comprehensive type-safe upgrade to these tools with stream libraries, `std::filesystem`, `std::format`, and `std::error_code`. +The core of file operations lies in `fopen` and `fread`/`fwrite`/`fseek`/`ftell`. 
Formatted I/O relies on the `printf`/`scanf` family, and error handling depends on `errno` + `perror`. The standard library provides fundamental tools like numeric conversion, sorting/searching, math functions, character classification, and assertions. C++ has comprehensively upgraded these tools for type safety using the stream library, `std::filesystem`, `std::format`, and `std::span`. ## Exercises ### Exercise 1: Configuration File Parser -Parse a configuration file in `key=value` format, ignoring `#` comments and blank lines. - -```c -#include -#include -#include -#include - -#define MAX_LINE 256 -#define MAX_KEY 64 -#define MAX_VALUE 128 - -typedef struct { - char key[MAX_KEY]; - char value[MAX_VALUE]; -} ConfigEntry; - -/// @brief 去除字符串首尾的空白字符 -char* trim(char* str); - -/// @brief 解析配置文件 -size_t parse_config(const char* path, ConfigEntry* entries, size_t max_entries); +Parse a configuration file in `.ini` format, ignoring `#` comments and empty lines. -/// @brief 在配置项中查找指定 key -const char* find_config(const ConfigEntry* entries, size_t count, const char* key); - -int main(int argc, char* argv[]) { - if (argc < 2) { - fprintf(stderr, "Usage: %s \n", argv[0]); - return 1; - } - // TODO: 调用 parse_config 和 find_config - return 0; -} +```text +# config.ini +port=8080 +mode=debug ``` -Hint: Use `fgets` to read line by line, `strchr` to find the `=` position, and `trim` to strip whitespace. +Hint: Use `fgets` to read line by line, `strchr` to find the `=` position, and trim whitespace. ### Exercise 2: File Copy Tool -Specify the source and destination files via command-line arguments, support binary file copying, and display progress. - -```c -#include -#include - -#define kBufferSize 4096 - -/// @brief 复制文件 -int copy_file(const char* src_path, const char* dst_path) -{ - // TODO: 实现 - // 1. "rb" 打开源文件,"wb" 打开目标文件 - // 2. 循环 fread/fwrite - // 3. 用 fseek/ftell 获取总大小,打印进度 - // 4. 
错误处理:先打开的后关闭 - return -1; -} - -int main(int argc, char* argv[]) { - // TODO: 解析命令行参数,调用 copy_file - return 0; -} -``` +Specify source and target files via command-line arguments, support binary file copying, and display progress. Hint: Use `fseek` + `ftell` to get the source file size, and use `\r` to overwrite the same line to implement a progress bar. diff --git a/documents/en/vol1-fundamentals/c_tutorials/advanced_feature/01-arm-architecture-fundamentals.md b/documents/en/vol1-fundamentals/c_tutorials/advanced_feature/01-arm-architecture-fundamentals.md index 3fadc2e79..f86b56706 100644 --- a/documents/en/vol1-fundamentals/c_tutorials/advanced_feature/01-arm-architecture-fundamentals.md +++ b/documents/en/vol1-fundamentals/c_tutorials/advanced_feature/01-arm-architecture-fundamentals.md @@ -1,8 +1,8 @@ --- -title: ARM Architecture and Fundamentals -description: Starting from the von Neumann and Harvard architectures, we break down - the ARM Cortex-M instruction set, register file, exception vector table, and processor - modes to build a low-level hardware mental model. +title: ARM Architecture and System Fundamentals +description: Starting from von Neumann and Harvard architectures, we break down the + ARM Cortex-M instruction set, register file, exception vector table, and processor + modes to build a mental model of the underlying hardware. 
chapter: 1 order: 101 tags: @@ -25,383 +25,299 @@ prerequisites: - 基本的嵌入式开发概念 translation: source: documents/vol1-fundamentals/c_tutorials/advanced_feature/01-arm-architecture-fundamentals.md - source_hash: a4cd205e6a3a57b38b85964c73510a5200040c058e79d583799863a28fefc3b3 - translated_at: '2026-05-26T10:34:55.004711+00:00' + source_hash: dbb70b59c6f30ff39845496a6ed2a0a4da543d16bbdfc0327094abd340aba1a8 + translated_at: '2026-06-13T11:43:43.847103+00:00' engine: anthropic - token_count: 3519 + token_count: 3517 --- # ARM Architecture and Fundamentals -Honestly, if you have only ever written C/C++ on a PC, you have probably never cared about how a processor actually turns a line of code into electrical signals. The x86 ecosystem is so abstract that the compiler and operating system shield you from almost all low-level details. But once you step into the embedded world, especially when facing ARM Cortex-M series MCUs, this knowledge is no longer a nice-to-have—it is a prerequisite for writing correct code. We have seen too many people jump straight into STM32 development without even being able to explain what processor modes or the exception vector table are, leaving them staring blankly at registers when a HardFault hits. +Honestly, if you have been writing C/C++ exclusively on a PC, you have likely never cared about how a processor actually turns a line of code into electrical signals—the x86 architecture is too abstract, and the compiler and operating system shield you from almost all low-level details. But once you step into the embedded world, especially when facing ARM Cortex-M series MCUs, this knowledge is no longer a bonus; it is a prerequisite for writing correct code. I have seen too many people jump straight into STM32 without being able to explain processor modes or the exception vector table, only to stare blankly at registers when they encounter a HardFault. 
-Developers in other languages like Python or Java basically never need to care about this—the virtual machine or interpreter abstracts away the hardware completely. But C/C++ is different. Their design philosophy is "close to the metal," with only a thin layer of abstraction between the machine code the compiler generates and your source code. Since the ARM architecture is the absolute mainstream in embedded systems today, understanding its architecture means understanding what actually happens on the chip for every line of C code you write. The connection to C++ is even stronger—object layout, cache-friendly design, and exception handling overhead are all topics directly tied to ARM's hardware characteristics. +Developers in other languages like Python or Java basically don't need to worry about this—the virtual machine or interpreter has already abstracted the hardware cleanly away. But C/C++ is different; its design philosophy is "close to the metal," with only a thin layer of abstraction between the machine code generated by the compiler and your source code. As the dominant architecture in modern embedded systems, understanding ARM architecture is understanding what actually happens on the chip for every line of C you write. The connection is even stronger for C++—object layout, cache-friendly design, and exception handling overhead are all topics directly linked to ARM's hardware characteristics. -In this tutorial, we will tear down the ARM processor from an architectural perspective, making sense of its memory architecture, instruction set, register file, exception mechanism, and processor modes. The goal is not to turn you into an assembly programmer, but to give you a clear mental model of what happens at the hardware level when you write C/C++. When you use ``volatile`` to qualify a register, you will know why. When you debug a HardFault caused by a stack overflow, you will be able to pinpoint the issue quickly. 
+In this tutorial, we will dissect the ARM processor from an architectural perspective, clarifying its memory architecture, instruction set, register file, exception mechanism, and processor modes. This isn't to teach you to write assembly, but to give you a clear mental model of what happens at the hardware level when you write C/C++—when you decorate a register with `volatile`, you know why; when you debug a HardFault caused by stack overflow, you can locate the issue quickly. > **Learning Objectives** > > After completing this chapter, you will be able to: > -> - [ ] Distinguish between von Neumann architecture, Harvard architecture, and modified Harvard architecture -> - [ ] Explain the differences and use cases of the ARM, Thumb, and Thumb-2 instruction sets -> - [ ] Describe the roles of the R0-R15 registers and the AAPCS calling convention -> - [ ] Describe the structure of the Cortex-M exception vector table and its stacking/unstacking mechanism -> - [ ] Understand the division between Thread/Handler modes and privilege levels +> - [ ] Distinguish between von Neumann, Harvard, and Modified Harvard architectures. +> - [ ] Explain the differences and use cases for ARM/Thumb/Thumb-2 instruction sets. +> - [ ] Identify the roles of registers R0-R15 and the AAPCS calling convention. +> - [ ] Describe the structure of the Cortex-M exception vector table and the stacking/unstacking mechanism. +> - [ ] Understand the division between Thread/Handler modes and privilege levels. ## Environment Setup -This chapter is theoretical but closely tied to actual hardware. All code examples can be verified under an ARM toolchain. +This content is theoretical but closely tied to actual hardware. All code examples can be verified under an ARM toolchain. 
```text -平台:ARM Cortex-M3/M4(代表芯片:STM32F1/F4 系列) -工具链:GCC ARM Embedded(arm-none-eabi-gcc)>= 10.x - 或 STM32CubeIDE / PlatformIO(底层同一套) -标准:-std=c11(C 部分)/ -std=c++17(C++ 对比部分) -硬件:阅读过程不需要开发板,有 STM32F103 或 STM32F407 可对照更佳 -参考架构:ARMv7-M(Cortex-M3/M4),穿插 ARMv7-A(Cortex-A 系列)对比 +Target Architecture: ARM Cortex-M4 / Cortex-M3 +Toolchain: arm-none-eabi-gcc +Build System: Make / CMake ``` ## Step 1 — Understanding How the Processor Accesses Memory -The first thing we need to discuss is the processor's memory architecture—how the CPU interacts with memory. This seems basic, but it directly determines many everyday phenomena, such as why code runs faster on some chips than others, or why DMA always requires special address region configuration. +The first thing we need to discuss is the processor's memory architecture—how the CPU interacts with memory. This seems basic, but it dictates many daily phenomena—like why code runs faster on some chips than others, or why DMA always requires specific address region configurations. ### Von Neumann Architecture — One Bus to Rule Them All -The core characteristic of the von Neumann architecture is that instructions and data share a single bus and a single memory space. The CPU accesses memory through one set of address buses, regardless of whether you are reading code or data—it all goes down the same path. You can think of it as a single-lane road where instructions and data line up and take turns; they cannot travel side by side. The advantage is hardware simplicity—you only need one bus and one memory, which keeps costs down. The core philosophy of the early 8051 MCU and most general-purpose computers stems from this. +The core characteristic of the von Neumann architecture is that instructions and data share the same bus and the same memory space. The CPU accesses memory via a single address bus; whether you are reading code or data, it travels the same path. 
You can imagine it as a single-lane road—instructions and data queue up to pass, they can't travel side-by-side. The benefit is simple hardware—only one bus and one memory are needed, lowering costs. The core concepts of early 8051 microcontrollers and most general-purpose computers stem from this. -The problem is also obvious: because instructions and data are squeezed onto the same bus, the CPU cannot fetch instructions and read/write data simultaneously. In practice, this means performance is limited. Want to execute an addition and write the result back to memory at the same time? Sorry, the bus is busy fetching the next instruction, so you have to wait in line. This is the so-called "von Neumann bottleneck." +The problem is also obvious: because instructions and data squeeze onto the same bus, the CPU cannot fetch instructions and read/write data simultaneously. In practice, this means limited performance—you want to execute an addition and write the result back to memory at the same time? Sorry, the bus is busy fetching the next instruction, so you must wait. This is the so-called "von Neumann bottleneck." -### Harvard Architecture — Two Buses, Each Minding Its Own Business +### Harvard Architecture — Two Buses, Each in Charge -The Harvard architecture takes a different approach: instructions and data each have their own bus and their own memory space. It is like turning the single-lane road into a dual-lane highway—instruction fetching and data read/write can happen simultaneously, theoretically doubling throughput. Most DSP chips and many modern microcontrollers use a pure Harvard architecture or a variant of it. +The Harvard architecture takes a different path: instructions and data each have their own bus and memory space. It's like turning a single-lane road into a dual-lane highway—fetching instructions and reading/writing data can happen simultaneously, theoretically doubling throughput. 
Most DSP chips and many modern microcontrollers adopt a pure Harvard architecture or a variant thereof. -But the pure Harvard architecture is not a silver bullet either. If your program needs self-modifying code (rare in embedded systems), or if you want to use a block of memory as both code and data space, the hardware is not flexible enough—you have to design an extra mechanism to allow the two buses to access each other's memory spaces. +However, the pure Harvard architecture isn't omnipotent. If your program needs self-modifying code (rare in embedded systems), or you want to use a block of memory as both code and data, the hardware isn't flexible—you would need to design an extra mechanism to allow the two buses to access each other's storage spaces. ### Modified Harvard Architecture — ARM's Practical Choice -In reality, ARM Cortex-M3/M4 rarely goes to extremes, adopting what is called the **Modified Harvard Architecture**. You can understand it this way: from a software perspective, the address space is unified (like von Neumann), but from a hardware perspective, instruction fetching and data access can happen in parallel (like Harvard). +In reality, ARM Cortex-M3/M4 rarely go to extremes, adopting what is called a **Modified Harvard Architecture**. You can understand it this way: from a software perspective, the address space is unified (like von Neumann), but from a hardware perspective, instruction fetching and data access can happen in parallel (like Harvard). -Specifically, Cortex-M3/M4 has three AHB-Lite buses: the I-Code bus is dedicated to fetching instructions from the Code region (``0x00000000``–``0x1FFFFFFF``, where Flash is mapped), the D-Code bus handles data access in the Code region (such as loading constants from Flash), and the System bus handles access to the SRAM and peripheral regions. 
I-Code and D-Code can work in parallel, so code in Flash and constant data in Flash can be accessed simultaneously, significantly improving execution efficiency. +Specifically, Cortex-M3/M4 has three sets of AHB-Lite buses: the I-Code bus exclusively fetches instructions from the Code region (`0x00000000`–`0x1FFFFFFF`, where Flash is mapped), the D-Code bus handles data access in the Code region (like loading constants from Flash), and the System bus handles access to SRAM and peripheral regions. I-Code and D-Code can work in parallel, so code in Flash and constant data in Flash can be accessed simultaneously, significantly improving execution efficiency. -If you look at the memory map of the STM32F407, you will find that the 512MB space from address ``0x00000000`` to ``0x1FFFFFFF`` is marked as the Code region, while ``0x20000000`` onward is the SRAM region. ARM officially recommends that during bus arbitration, D-Code has higher priority than I-Code—because if a data access is blocked, the processor cannot proceed, whereas instruction prefetching can afford to wait a bit. +If you look at the memory map of an STM32F407, you will find that the 512MB space from address `0x00000000` to `0x1FFFFFFF` is marked as the Code region, while `0x20000000` onwards is the SRAM region. ARM officially recommends that during bus arbitration, D-Code has higher priority than I-Code—because if data access is blocked, the processor cannot proceed, whereas instruction prefetch can afford to wait a bit. -> ⚠️ **Pitfall Warning** -> Although Cortex-M has multiple buses, they are not truly "fully parallel"—if I-Code and D-Code access Flash simultaneously, they still have to go through Flash controller arbitration. On the STM32F1, Flash is only 16 bits wide and has no cache, so the advantage of bus parallelism is greatly diminished. On the STM32F4, however, there is a 128-bit wide Flash interface and an Adaptive Real-Time (ART) Accelerator, making the difference very obvious. 
Do not forget to check this metric when selecting a chip. +> ⚠️ **Gotcha Warning** +> Although Cortex-M has multiple buses, they are not truly "completely parallel"—if I-Code and D-Code access Flash simultaneously, they still go through arbitration by the Flash controller. On the STM32F1, Flash is only 16 bits wide and has no cache, so the advantage of bus parallelism is greatly diminished; whereas the STM32F4 has a 128-bit wide Flash interface and an Adaptive Real-Time Memory (ART) Accelerator, making the difference very obvious. Don't forget to check this metric when selecting a chip. ## Step 2 — Understanding How ARM Instructions Are Encoded -With the memory architecture out of the way, let us look at the ARM instruction set. This directly affects the size and execution efficiency of your generated code, which is especially critical on resource-constrained MCUs. +With the memory architecture cleared up, let's look at the ARM instruction set. This directly impacts the size and execution efficiency of your generated code, which is critical on resource-constrained MCUs. ### ARM Instruction Set (32-bit) — Expressive but Bulky -ARM's original instruction set (A32) uses fixed-length 32-bit encoding, with each instruction taking up four bytes. The encoding space is ample enough to express rich operations—advanced features like conditional execution, inline barrel shifter shifts, and multi-register transfers (``LDM/STM``). The benefit of 32-bit instructions is high expressiveness; a single instruction can do a lot, leading to a high performance ceiling. The trade-off is also obvious—code size is large, and on small MCUs with only a few dozen KB of Flash, this overhead cannot be ignored. +ARM's earliest instruction set (A32) used 32-bit fixed-length encoding, with each instruction occupying 4 bytes. 
The encoding space is sufficient to express rich operations—conditional execution, inline barrel shifter shifts, multi-register transfers (`LDM`/`STM`), and other advanced features. The benefit of 32-bit instructions is high expressiveness; a single instruction can do a lot, raising the performance ceiling. The cost is obvious—code volume is large, and on small MCUs with only a few dozen KB of Flash, this overhead cannot be ignored. -### Thumb Instruction Set (16-bit) — Compact but Limited +### Thumb Instruction Set (16-bit) — Compact but Functionally Limited -To solve the code density problem, ARM introduced the Thumb instruction set (T16) in the ARMv4T architecture, compressing most commonly used instructions into 16-bit encodings. The trade-off is the loss of some advanced features—most instructions in Thumb state no longer support conditional execution, and the use of the barrel shifter is restricted. In exchange, code size is typically reduced by about 30%, which is a lifesaver for applications with tight Flash space. +To solve the code density problem, ARM introduced the Thumb instruction set (T16) in the ARMv4T architecture, compressing most common instructions into 16-bit encoding. The cost is the loss of some advanced features—most instructions in Thumb state no longer support conditional execution, and the use of the barrel shifter is restricted. But in exchange, code volume usually shrinks by about 30%, which is a lifesaver for applications with tight Flash space. ### Thumb-2 — The Default Choice for Cortex-M -Cortex-M3/M4 uses the **Thumb-2 instruction set**, a mixed-encoding scheme where 16-bit and 32-bit instructions are interleaved. The compiler automatically selects the most appropriate encoding width for each instruction based on its needs—simple operations use 16 bits, while complex operations (like loading large immediates, division, etc.) use 32 bits. 
This way, you get functional completeness close to the pure ARM instruction set while maintaining code density close to pure Thumb. +Cortex-M3/M4 uses the **Thumb-2 instruction set**, a hybrid encoding scheme: 16-bit and 32-bit instructions are mixed together. The compiler automatically selects the most appropriate encoding width for each instruction—simple operations use 16 bits, complex operations (like loading large immediate values, division, etc.) use 32 bits. This way, you get the functional completeness close to the pure ARM instruction set while maintaining code density close to pure Thumb. -One point is particularly worth noting: **Cortex-M series processors only support the Thumb instruction set** and do not support the traditional 32-bit ARM instruction set. So all code you write on Cortex-M, whether compiled from C or hand-written in assembly, must be Thumb-encoded. The compiler defaults to Thumb mode, so you do not need to worry about it in most cases—but if you are writing inline assembly or a custom startup file, you must remember this, otherwise you will be rewarded with a very beautiful Undefined Instruction exception. +One point is particularly worth noting: **Cortex-M series processors only support the Thumb instruction set**, not the traditional 32-bit ARM instruction set. So, all code you write on Cortex-M, whether compiled from C or hand-written assembly, must be Thumb encoded. The compiler defaults to Thumb mode, so you don't need to worry about it in most cases—but if you are embedding assembly or writing startup files by hand, you must remember this, otherwise you will be rewarded with a beautiful Undefined Instruction exception. 
-```c -/// @brief 一个简单的 Thumb 函数示例 -/// Cortex-M 上所有函数默认使用 Thumb 编码 -int add_values(int a, int b) -{ - return a + b; -} - -/// @brief 内嵌汇编示例——在 Thumb 模式下读取主栈指针(MSP) -/// 注意:实际项目中推荐用 CMSIS 的 __get_MSP() 宏 -uint32_t read_msp(void) -{ - uint32_t msp_value; - __asm__ volatile("mov %0, sp" : "=r"(msp_value)); - return msp_value; -} +```text +// Check the output of: arm-none-eabi-objdump -d firmware.elf ``` -> ⚠️ **Pitfall Warning** -> If you accidentally remove ``-mthumb`` in your linker script or compiler flags (or erroneously add ``-marm``), linking on Cortex-M will fail outright—because the Cortex-M instruction decoder simply does not understand 32-bit ARM encoding. When you encounter a ``Undefined Instruction`` exception, first check whether your compiler flags include ``-mthumb``. +> ⚠️ **Gotcha Warning** +> If you accidentally remove `-mthumb` in your linker script or compiler flags (or erroneously add `-marm`), linking on Cortex-M will fail directly—because the Cortex-M instruction decoder simply doesn't understand 32-bit ARM encoding. When you encounter a `UsageFault` exception, first check if your compiler flags include `-mthumb`. -## Step 3 — Getting to Know the Processor's "Workbench": The Register File +## Step 3 — Meet the Processor's "Workbench": The Register File -If the instruction set is the processor's "language," then registers are its "workbench." When the CPU performs calculations, data is first moved into registers, operations happen between registers, and finally the results are written back to memory. Understanding the division of labor among registers is the foundation for understanding how ARM operates. +If the instruction set is the processor's "language," then registers are its "workbench"—when the CPU performs calculations, data is moved into registers, operations occur between registers, and finally the result is written back to memory. Understanding the division of labor among registers is the foundation for understanding how ARM runs. 
### General-Purpose Registers R0-R15 -The ARMv7-M architecture defines sixteen 32-bit general-purpose registers, numbered R0 to R15. They each have specific roles, and not all registers can be used freely. +The ARMv7-M architecture defines 16 32-bit general-purpose registers, numbered R0 to R15. They each have their roles, and not all registers can be used freely. -**R0-R3** are argument and return value registers. According to the AAPCS (ARM Architecture Procedure Call Standard) convention, the first four arguments of a function call are passed through R0-R3, and the return value is also placed in R0 (for 64-bit return values, R0 and R1 are used together). You can think of them as the "express lane" for function calls—if a C function has no more than four arguments, the call process does not need to touch the stack at all, making it very fast. But if you write a function with five arguments, the fifth one has to be pushed to the stack, adding an extra memory access. +**R0-R3** are argument and return value registers. According to the AAPCS (ARM Architecture Procedure Call Standard) convention, the first four arguments of a function call are passed through R0-R3, and the return value is also placed in R0 (for 64-bit return values, R0 and R1 are used together). You can think of them as the "express lane" for function calls—if a C function has no more than four arguments, the call process doesn't need to access the stack at all, making it very fast. But if you write a function with five arguments, the fifth one must be pushed onto the stack, adding an extra memory access. -**R4-R11** are callee-saved registers. A function can freely use R4-R11, but it must restore their original values before returning—meaning the caller can safely assume these registers will not be clobbered after a function call. 
The compiler typically allocates these registers to local variables, especially high-frequency data like loop counters and frequently accessed pointers whose lifetimes span across function calls. If you see a bunch of ``PUSH {R4-R7, LR}`` instructions at the beginning of a function while debugging, that is the compiler saving the callee-saved registers it plans to use. +**R4-R11** are callee-saved registers. A function can freely use R4-R11, but must restore their original values before returning—meaning the caller can safely assume these registers will not be corrupted after the function call. Compilers typically allocate these registers to local variables, especially loop counters and frequently accessed pointers whose lifetimes span function calls. If you see a bunch of `push` instructions at the beginning of a function while debugging, that is the compiler saving the callee-saved registers it intends to use. -**R12 (IP)** is the intra-procedure-call scratch register. The name is long but the purpose is simple—the linker uses it as a temporary holding register when handling long jumps (where the target address exceeds the encoding range of the branch instruction). You rarely touch it directly when writing C code. +**R12 (IP)** is the intra-procedure-call scratch register. The name is long, but the use is simple—the linker uses it as a transit when handling long jumps (where the target address exceeds the encoding range of the jump instruction). You basically never touch it directly when writing C code. -**R13 (SP)** is the stack pointer, pointing to the top of the current stack. ARM has two stack pointers—the Main Stack Pointer (MSP) and the Process Stack Pointer (PSP)—and the CONTROL register selects which one is currently in use. Bare-metal applications typically only use the MSP. If you are running an RTOS, interrupt handling uses the MSP while threads use the PSP, achieving isolation between the interrupt stack and thread stacks. 
This design is quite elegant—even if a thread overflows its stack, it will not corrupt the stack space used by interrupt handling. +**R13 (SP)** is the stack pointer, pointing to the top of the current stack. ARM has two stack pointers—the Main Stack Pointer (MSP) and the Process Stack Pointer (PSP), selected via the CONTROL register. Bare-metal applications typically use only MSP; if running an RTOS, interrupt handling uses MSP and threads use PSP, achieving isolation between the interrupt stack and thread stacks. This design is ingenious—even if a thread's stack overflows, it won't corrupt the stack space used for interrupt handling. -**R14 (LR)** is the link register, which saves the return address of a function call. When a ``BL`` (Branch with Link) instruction is executed, the return address is automatically stored in LR. The beauty of this is that for leaf functions (functions that do not call other functions), there is no need to push the return address to the stack at all—it is already saved in LR, saving one memory write. But if your function calls another function, the value in LR will be overwritten, so the compiler pushes LR to the stack at the beginning of the function. +**R14 (LR)** is the link register, holding the return address of the function call. When executing a `BL` (Branch with Link) instruction, the return address is automatically stored in LR. The beauty is: for leaf functions (functions that don't call other functions), there's no need to push the return address onto the stack at all; it's already in LR, saving a memory write. But if your function calls another function, the value in LR will be overwritten, so the compiler will push LR onto the stack at the beginning of the function. -**R15 (PC)** is the program counter, pointing to the currently executing instruction. On ARM, reading the PC typically yields the current instruction's address plus four (due to pipeline prefetching), and writing to the PC is equivalent to performing a jump. 
+**R15 (PC)** is the program counter, pointing to the instruction currently being executed. On ARM, reading the PC usually yields the current instruction address plus 4 (due to pipeline prefetching); writing to PC is equivalent to performing a jump. -```c -/// @brief 演示 AAPCS 调用约定对寄存器使用的影响 -/// 前 4 个参数通过 R0-R3 传递,第 5 个参数需要压栈 - -int fast_path(int a, int b, int c, int d) -{ - // a -> R0, b -> R1, c -> R2, d -> R3 - // 全部通过寄存器传递,无栈操作 - return a + b + c + d; -} - -int slow_path(int a, int b, int c, int d, int e) -{ - // a -> R0, b -> R1, c -> R2, d -> R3 - // e -> 栈传递,多一次内存读操作 - return a + b + c + d + e; -} +```text +Register Map: +R0-R3: Args / Return / Scratch +R4-R11: Callee-saved (Local vars) +R12: IP (Scratch for long jumps) +R13: SP (Stack Pointer) +R14: LR (Link Register) +R15: PC (Program Counter) ``` -Let us use ``arm-none-eabi-objdump -d`` to disassemble and see the difference: +Let's use `arm-none-eabi-objdump -d` to disassemble and see the difference: ```text -; fast_path: 全部在寄存器中完成 -fast_path: - add r0, r0, r1 ; a + b -> R0 - add r0, r0, r2 ; + c - add r0, r0, r3 ; + d - bx lr ; 返回 - -; slow_path: 第 5 个参数从栈上读取 -slow_path: - add r0, r0, r1 - add r0, r0, r2 - add r0, r0, r3 - ldr r3, [sp] ; 从栈上读第 5 个参数 - add r0, r0, r3 - bx lr +// void func4(int a, int b, int c, int d); +// 00000250 : +// 250: b480 push {r7} +// 252: b083 sub sp, #12 +// 256: 9002 str r0, [sp, #8] +// ... + +// void func5(int a, int b, int c, int d, int e); +// 00000260 : +// 260: b480 push {r7} +// 262: b085 sub sp, #20 +// 266: 9003 str r0, [sp, #12] +// 26a: 9304 str r3, [sp, #16] +// 26e: 9b06 ldr r3, [sp, #24] <-- fifth argument is loaded from the caller's stack +// The first four arguments arrive in r0-r3; only the fifth has to be read from memory. ``` -You can see that ``slow_path`` has an extra ``ldr`` instruction—this is the cost of pushing the fifth argument to the stack. +You can see that `func5` involves extra instructions to handle the stack—that is the cost of pushing the fifth argument.
-> ⚠️ **Pitfall Warning** -> Do not try to "save parameters" by stuffing a bunch of unrelated variables into a struct and passing a pointer—the struct pointer itself takes up a register slot, and indirect access through a pointer adds an extra layer of dereferencing overhead. A reasonable design is to keep hot-path functions to no more than four basic-type parameters of ``int``/``float`` size, and only consider passing a struct pointer for anything beyond that. +> ⚠️ **Gotcha Warning** +> Don't stuff a bunch of unrelated variables into a struct and pass a pointer just to "save arguments"—the struct pointer itself takes up a register slot, and indirect access through a pointer adds a layer of dereference overhead. A reasonable design is: hot path functions should have no more than four arguments of basic types (`int`/`pointer`), and only consider passing a struct pointer if there are more. -### Program Status Registers — The xPSR Triplets +### Program Status Register — The xPSR Trio -The ARM processor's status information is saved in program status registers. On Cortex-M, this is split into three sub-registers, collectively known as xPSR. +The ARM processor's status information is saved in the Program Status Register. On Cortex-M, it is split into three sub-registers, collectively called xPSR. -**APSR (Application PSR)** holds the result flags of arithmetic and logic operations: N (Negative), Z (Zero), C (Carry), V (oVerflow), and Q (saturation flag). The first four are the familiar condition code flags; ``if (a > b)`` in C code compiles down to checks against these flags. +**APSR (Application PSR)** holds the result flags of arithmetic logic operations: N (Negative), Z (Zero), C (Carry), V (oVerflow), and Q (Saturation flag). The first four are the familiar condition code flags; `if` statements in C code compile into checks against these flags. -**EPSR (Execution PSR)** contains the Thumb state bit (T-bit) and the interruptible-continuable instruction bit. 
The T-bit on Cortex-M is always 1 (because it only supports Thumb), so you basically never need to manipulate it manually. +**EPSR (Execution PSR)** contains the Thumb state bit (T-bit) and the If-Then execution bits. The T-bit on Cortex-M is always 1 (because only Thumb is supported), so you basically never need to manipulate it manually. -**IPSR (Interrupt PSR)** holds the exception number of the currently executing exception. In Thread mode, IPSR is 0; if an interrupt is being handled, IPSR is the number of that interrupt. This is particularly useful when debugging HardFaults—reading IPSR lets you confirm which exception context you are in. +**IPSR (Interrupt PSR)** holds the exception number of the currently executing exception. In Thread mode, IPSR is 0; if handling an interrupt, IPSR is the number of that interrupt. This is particularly useful when debugging HardFault—reading IPSR confirms which exception context you are in. -```c -/// @brief 通过 xPSR 的条件标志理解 C 代码的比较操作 -/// 编译器会将条件判断转换为对 N/Z/C/V 标志的检测 -int max_value(int a, int b) -{ - // 编译后:CMP R0, R1,然后检测 APSR 的标志位 - if (a > b) { - return a; // GT 条件:Z=0 且 N=C - } - return b; +```text +// Example: Reading IPSR via inline assembly +uint32_t get_ipsr(void) { + uint32_t ipsr; + asm volatile ("mrs %0, ipsr" : "=r"(ipsr)); + return ipsr; } ``` -## Step 4 — Understanding the Processor's "Modes" +## Step 4 — Understanding the "Mode" the Processor Runs In -ARM processors run in different "modes," each with different privilege levels and accessible resources. This section is the foundation for understanding the security model and exception handling. +ARM processors have different "modes" when running, each with different privilege levels and accessible resources. This section is the foundation for understanding the security model and exception handling. 
### Cortex-M's Simplified Model: Thread and Handler -Cortex-M drastically simplifies the traditional ARM's seven processor modes, keeping only two modes: **Thread mode** (for executing normal application code) and **Handler mode** (for executing interrupt service routines and exception handling code). Each mode is further divided into privileged and unprivileged levels. +Cortex-M drastically simplifies the traditional ARM's seven processor modes, keeping only two: **Thread mode** (for executing normal application code) and **Handler mode** (for executing interrupt service routines and exception handling code). Each mode is further divided into privileged and unprivileged levels. -After power-on reset, the processor defaults to Thread mode + privileged level. If you do not actively drop privileges (by writing to the CONTROL register), your entire program runs in privileged mode—this is very common in bare-metal development, but it also means your code can "legally" do anything, including writing to the wrong register and causing peripheral misbehavior. In scenarios running an RTOS, the OS typically drops privileges to unprivileged level when creating user threads, so even if a thread goes astray, it will not directly manipulate critical hardware registers. +After power-on reset, the processor defaults to Thread mode + privileged level. If you don't actively drop privileges (by writing to the CONTROL register), the entire program runs in a privileged state—this is common in bare-metal development, but it also means your code can "legally" do anything, including writing to the wrong register and causing peripheral anomalies. In scenarios running an RTOS, the RTOS usually drops privileges to unprivileged level when creating user threads, so that even if a thread runs wild, it won't directly manipulate critical hardware registers. -Handler mode is always privileged—interrupt handling code needs full hardware access, which is a hard requirement. 
When an exception or interrupt occurs, the processor automatically switches from Thread mode to Handler mode, and switches back automatically when handling is complete. +Handler mode is always privileged—interrupt handling code needs full hardware access, which is a hard requirement. When an exception or interrupt occurs, the processor automatically switches from Thread to Handler mode, and switches back when processing is complete. -> ⚠️ **Pitfall Warning** -> If you accidentally drop to unprivileged level in Thread mode, you cannot climb back up—only the Handler mode triggered by an exception/interrupt can manipulate the CONTROL register to elevate privileges. So if you plan to use unprivileged mode, make sure to use an SVC (Supervisor Call) instruction to trigger a system call for operations that require privileges, rather than directly manipulating hardware registers in unprivileged mode. +> ⚠️ **Gotcha Warning** +> If you accidentally drop to unprivileged level in Thread mode, you cannot climb back up—only Handler mode triggered by an exception/interrupt can manipulate the CONTROL register to raise privileges. So if you intend to use unprivileged mode, be sure to trigger a system call via the SVC (Supervisor Call) instruction to perform privileged operations, rather than manipulating hardware registers directly in unprivileged mode. -## Step 5 — Walking Through the Full Interrupt Handling Process via the Exception Vector Table +## Step 5 — Walk Through the Interrupt Handling Flow with the Vector Table -By now we have the foundational knowledge of processor modes and registers. Let us tie them together and see exactly what the ARM processor does when an exception or interrupt occurs. +Now that we have the basics of processor modes and registers, let's string them together—see exactly what the ARM processor does when an exception or interrupt occurs. ### Exceptions Are Not Just Interrupts -In ARM terminology, "Exception" is a broader concept than "Interrupt." 
Interrupts are just one type of exception; others include Reset, NMI (Non-Maskable Interrupt), HardFault, Memory Management Fault, Bus Fault, Usage Fault, SVCall, PendSV, and SysTick. They all share the same handling mechanism, just with different priorities. +In ARM terminology, "Exception" is a broader concept than "Interrupt." Interrupts are just one type of exception; others include: Reset, NMI (Non-Maskable Interrupt), HardFault, Memory Management Fault, Bus Fault, Usage Fault, SVCall, PendSV, SysTick, etc. They share the same handling mechanism, just with different priorities. -### Vector Table — The "Phone Book" of Exception Handling +### Vector Table — The "Phone Book" for Exception Handling -When an exception occurs, the processor needs to know where the corresponding handler function is located. ARM's solution is the **Vector Table**—an array of function pointers stored in memory, where each exception type corresponds to one entry. +When an exception occurs, the processor needs to know where the corresponding handler function is located. ARM's solution is the **Vector Table**—an array of function pointers stored in memory, where each exception type corresponds to an entry. -On Cortex-M, the vector table starts at address ``0x00000000`` by default (this can be relocated via the VTOR register). The first entry is not a function pointer, but the value of the initial stack pointer (MSP)—this is a clever design where the processor automatically loads this value into SP on reset, requiring no extra initialization code. Starting from the second entry, the Reset Handler, NMI Handler, HardFault Handler, and so on are placed in sequence. +On Cortex-M, the vector table defaults to starting at address `0x00000000` (can be relocated via the VTOR register). 
The first entry is not a function pointer, but the value of the initial Stack Pointer (MSP)—this design is clever; the processor automatically loads this value into SP upon reset, requiring no extra initialization code. Starting from the second entry, Reset Handler, NMI Handler, HardFault Handler, etc., are stored in sequence. -```c -/// @brief Cortex-M 向量表结构示意 -typedef void (*ExceptionHandler)(void); - -/// @brief 向量表布局(简化版,实际还包括更多 Fault 向量) -typedef struct { - uint32_t kInitialStackPointer; // 初始 MSP 值 - ExceptionHandler reset_handler; // 复位 - ExceptionHandler nmi_handler; // 不可屏蔽中断 - ExceptionHandler hardfault_handler; // 硬件错误 - ExceptionHandler memmanage_handler; // 内存管理错误 - ExceptionHandler busfault_handler; // 总线错误 - ExceptionHandler usagefault_handler; // 用法错误 - // ... 省略若干保留项 ... - ExceptionHandler svcall_handler; // 系统服务调用 - ExceptionHandler pendsv_handler; // 可挂起的系统调用 - ExceptionHandler systick_handler; // 系统滴答定时器 - // 外部中断向量从此开始 ... -} VectorTable; +```text +/* Example from startup.s */ +__attribute__((section(".isr_vector"))) void (*const g_pfnVectors[])(void) = { + (void (*)(void))((uint32_t)&_estack), // Initial Stack Pointer + Reset_Handler, // Reset Handler + NMI_Handler, // NMI Handler + HardFault_Handler, // HardFault Handler + MemManage_Handler, // MPU Fault Handler + BusFault_Handler, // Bus Fault Handler + UsageFault_Handler, // Usage Fault Handler + 0, // Reserved + 0, // Reserved + 0, // Reserved + 0, // Reserved + SVC_Handler, // SVCall Handler + DebugMon_Handler, // Debug Monitor Handler + 0, // Reserved + PendSV_Handler, // PendSV Handler + SysTick_Handler, // SysTick Handler + // External Interrupts follow... +}; ``` -### Exception Stacking — The "Scene" Automatically Saved by the Processor +### Exception Stacking — The "Context" Automatically Saved by the Processor -When an exception occurs, the Cortex-M processor automatically saves the values of eight registers on the current stack: R0, R1, R2, R3, R12, LR, PC, and xPSR. 
This operation is called "Stacking," and it is done entirely in hardware without you needing to write any context-saving code. When the exception handling is complete and the return instruction is executed, the processor automatically restores these eight registers from the stack ("Unstacking"). +When an exception occurs, the Cortex-M processor automatically saves the values of eight registers on the current stack: R0, R1, R2, R3, R12, LR, PC, and xPSR. This operation is called "Stacking" and is done entirely by hardware, requiring you to write no code to save the context. When the exception handling is complete and the return instruction is executed, the processor automatically restores these eight registers from the stack ("Unstacking"). -This design means your interrupt service routine is just a normal C function. You do not need to add special qualifiers like ``__irq`` (that was the approach in the ARM7TDMI era), and the compiler does not need to generate special prologue and epilogue code. Compared to the ARM7TDMI era where you had to write your own register save/restore code, the Cortex-M approach is incredibly clean. +This design means your Interrupt Service Routine (ISR) is just a normal C function, without needing special decorators like `__irq` (that was the ARM7TDMI era), and the compiler doesn't need to generate special prologue/epilogue code. Compared to the ARM7TDMI era where you had to write save/restore code yourself, the Cortex-M approach is incredibly refreshing. -But there is an easy pitfall here: if your stack space is insufficient (for example, if the stack allocated for a particular interrupt is too small), the stacking operation will trigger another exception—and that exception handling also needs to stack—resulting in a chain reaction of stack overflows that ultimately triggers a HardFault. Therefore, reasonable stack size planning is crucial in Cortex-M development. 
We generally recommend reserving at least 512 bytes for the main stack, and if running an RTOS, each thread stack also needs at least 256 bytes. +But there is a pitfall: if your stack space is insufficient (e.g., the stack allocated for a specific interrupt is too small), the stacking operation will trigger another exception—and handling this exception also requires stacking—resulting in a chain reaction of stack overflows, ultimately triggering a HardFault. Therefore, reasonable stack size planning is crucial in Cortex-M development; it is generally recommended to reserve at least 512 bytes for the main stack, and if running an RTOS, each thread stack also needs 256 bytes or more. ### Interrupt Priority — Who Goes First -ARM Cortex-M supports configurable interrupt priorities. Each interrupt source has a priority register where a smaller value means a higher priority. Cortex-M3 supports up to 256 priority levels (8-bit width), but in most actual implementations, only the upper 4 bits are used—meaning the number of priority levels you actually have available might only be 16 (this is the case for STM32F1/F4). +ARM Cortex-M supports configurable interrupt priorities. Each interrupt source has a priority register; the smaller the value, the higher the priority. Cortex-M3 supports up to 256 priority levels (8-bit width), but in actual implementations, most chips only use the upper 4 bits—meaning you may actually only have 16 available priority levels (STM32F1/F4 is like this). -Priority grouping splits the 8-bit priority register into two parts: the upper bits are the "Preemption Priority," and the lower bits are the "Sub-priority." A higher preemption priority interrupt can preempt a lower preemption priority interrupt that is currently being handled (nested interrupts), while the sub-priority only determines which of two interrupts with the same preemption priority gets handled first. 
CMSIS provides ``NVIC_SetPriorityGrouping()`` and ``NVIC_SetPriority()`` to configure these. If you are just starting out, using the default 4-bit preemption + 0-bit sub-priority grouping is fine; you can fiddle with it later when you need fine-grained control. +Priority grouping splits the 8-bit priority register into two parts: the high bits are "Preemption Priority," and the low bits are "Sub-priority." A higher preemption priority interrupt can interrupt a lower priority one that is currently being handled (nested interrupts), while sub-priority only determines which of two interrupts with the same preemption priority is handled first. CMSIS provides `NVIC_SetPriorityGrouping` and `NVIC_SetPriority` to configure these. If you are just starting, using the default 4-bit preemption + 0-bit sub-priority grouping is fine; wait until you need fine-grained control to tinker with it. ## Step 6 — Connecting This Knowledge to Writing C Code -At this point, we have gone through the core concepts of ARM architecture. You might ask: I write C/C++ code without using assembly, so how does this knowledge actually manifest in practical programming? Let us walk through a few direct connections. +We have now covered the core concepts of ARM architecture. You might ask: I write C/C++ code, not assembly, so how does this knowledge manifest in actual programming? Let's outline a few direct connections. -### Calling Conventions and Function Design +### Calling Convention and Function Design -As mentioned earlier, AAPCS dictates that the first four arguments are passed through R0-R3. The direct impact on C function design is that if you can control the function signature, try to keep the argument count to four or fewer, and avoid passing large structs. A common practice is to trim the parameters of frequently called hot-path functions to four or fewer, giving the compiler maximum room for optimization. 
+As mentioned earlier, AAPCS dictates that the first four arguments are passed through R0-R3. The direct impact on C function design is: if you can control the function signature, try to keep arguments to no more than four and avoid passing large structs. A common practice is to streamline the arguments of frequently called hot-path functions to four or fewer, giving the compiler maximum room for optimization. ### volatile and Register Access -The ``volatile`` keyword is almost everywhere in embedded programming—every hardware register mapping pointer needs ``volatile``. The reason is that compiler optimizations assume memory values will not "change on their own," but hardware register values can be modified at any time by external events (DMA transfer completion, peripheral state changes). ``volatile`` tells the compiler, "Always actually read this address; do not cache the value." +The `volatile` keyword is ubiquitous in embedded programming—every pointer mapped to a hardware register needs `volatile`. The reason is that compiler optimization assumes memory values won't "change on their own," but hardware register values can be modified by external events (DMA transfer completion, peripheral state changes) at any time. `volatile` tells the compiler, "actually read this address every time, don't cache the value." ```c -/// @brief 典型的寄存器映射访问模式 -/// volatile 保证每次访问都真正读写硬件 -#define GPIOA_ODR_ADDRESS ((volatile uint32_t*)0x40020014U) - -void set_gpio_pin(int pin) -{ - // 没有 volatile,编译器可能认为连续写同一个地址是冗余操作并优化掉 - *GPIOA_ODR_ADDRESS |= (1U << pin); +// Correct: volatile prevents the compiler from optimizing away the read +#define GPIO_BASE 0x40020000 +volatile uint32_t *const GPIO_IDR = (volatile uint32_t *)(GPIO_BASE + 0x10); + +// Wait for button press (poll bit 0 of the input data register) +while ((*GPIO_IDR & 0x01) == 0) { + // Do nothing } ``` ### Stack Usage and Memory Layout Awareness -Once you understand ARM's stacking mechanism and dual-stack design, you have a solid basis for planning memory usage.
In bare-metal applications, you need to ensure the linker script allocates enough space for the stack. In RTOS applications, you need to allocate a reasonable stack size for each thread. Rule of thumb: start at 256 bytes for simple threads without floating-point operations, and 512-1024 bytes for threads with floating-point operations or deep function call chains. If you enable the Cortex-M4's FPU, exception stacking will also save 16 additional floating-point registers (S0-S15) plus FPSCR—an extra 68-byte overhead that cannot be ignored. +Understanding ARM's stacking mechanism and dual-stack design gives you a basis for planning memory usage. In bare-metal applications, you need to ensure the linker script allocates enough space for the stack; in RTOS applications, you need to allocate a reasonable stack size for each thread. A rule of thumb is: simple threads without floating-point operations start at 256 bytes; threads with floating-point or deep function call chains need 512-1024 bytes. If you enable the Cortex-M4 FPU, exception stacking will also save an additional 16 floating-point registers (S0-S15), FPSCR, and one reserved alignment word—an extra 72 bytes of overhead that cannot be ignored. -## C++ Connections +## C++ Connection -If you are coming from the C++ part of this tutorial, the relationship between these low-level details and C++ is actually much larger than you might think. ARM's hardware characteristics directly influence many C++ design decisions. +If you came from the C++ part of this tutorial, the relationship between these low-level details and C++ is actually much closer than you might imagine. ARM's hardware characteristics directly influence many C++ design decisions. ### Cache-Friendly Design and Data Locality -ARM processors (especially the Cortex-A series) have multi-level caches. Understanding the size and working mechanism of cache lines (typically 32 or 64 bytes) directly impacts C++ data structure design.
Tightly packing frequently accessed fields at the beginning of a struct, putting cold data at the end, or using ``alignas`` to control alignment can all significantly improve performance. At the C tutorial stage, you only need to build awareness of this; subsequent C++ chapters will dive deeper. +ARM processors (especially the Cortex-A series) have multi-level caches. Understanding the size (usually 32 or 64 bytes) and working method of cache lines directly impacts C++ data structure design. Tightly packing frequently accessed fields at the beginning of a struct on the hot path, putting cold data at the end, or using `alignas` to control alignment can significantly improve performance—this only requires awareness in the C tutorial phase, and will be expanded in later C++ chapters. ```cpp -// 不太友好的布局:热数据和冷数据交替排列 -struct BadSensorData { - uint32_t timestamp; // 热 - char name[32]; // 冷——挤占了缓存行 - float value; // 热 - int calibration_id; // 冷 - float raw_value; // 热 -}; - -// 友好的布局:热数据集中在前 16 字节,一个缓存行搞定 -struct GoodSensorData { - uint32_t timestamp; // 热 - float value; // 热 - float raw_value; // 热 - // --- 缓存行边界大概在这里 --- - char name[32]; // 冷 - int calibration_id; // 冷 +// Cache-friendly struct layout +struct SensorData { + int32_t value; // Hot field (frequently read) + bool ready; // Hot flag + // --- Cache line boundary --- + char id[32]; // Cold data (read once) + uint64_t timestamp; // Cold data }; ``` -### C++ Object Memory Layout and the ABI +### C++ Object Memory Layout and ABI -The memory layout of C++ objects on the ARM platform follows the ABI specification of AAPCS: ordinary member variables are arranged in declaration order, the virtual function table pointer (vptr) is placed at the beginning of the object, and there may be multiple vptrs in the case of multiple inheritance. These layout details are crucial during serialization, network transmission, and interaction with C code. 
If you write an object-oriented driver framework in C++ on Cortex-M, understanding the position and size of the vptr helps you precisely calculate how many bytes a driver object actually occupies. +The memory layout of C++ objects on the ARM platform follows the AAPCS ABI specification: ordinary member variables are arranged in declaration order, the virtual function table pointer (vptr) is placed at the beginning of the object, and there may be multiple vptrs in multiple inheritance. These layout details are critical for serialization, network transmission, and interacting with C code. If you write an object-oriented driver framework in C++ on Cortex-M, understanding the position and size of the vptr helps you accurately calculate how many bytes a driver object actually occupies. ### Exception Handling Overhead -On embedded ARM platforms, the runtime overhead of the C++ exception handling mechanism (try/catch/throw) needs serious consideration. Exception handling tables and unwinding information significantly increase binary size, and the stack unwinding process during exception throwing involves numerous memory operations. On Cortex-M where both Flash and RAM are tight, many teams choose to add ``-fno-exceptions`` at compile time to completely disable C++ exceptions, using error codes instead to handle errors. This is not "not C++ enough"; it is a reasonable trade-off given the resources. +On embedded ARM platforms, the runtime overhead of the C++ exception mechanism (try/catch/throw) needs serious consideration. Exception tables and unwinding information significantly increase binary volume, and the stack unwinding process during exception throwing involves extensive memory operations. On Cortex-M where Flash and RAM are tight, many teams choose to add `-fno-exceptions` at compile time to completely disable C++ exceptions, using error codes instead to handle errors. This isn't "not C++ enough," but a reasonable trade-off for resources. 
-### constexpr and Compile-Time Computation +### constexpr and Compile-Time Calculation -Many operations that would require lookup tables at runtime (CRC calculation, bit manipulation mask generation) can be done at compile time through ``constexpr`` functions, saving both Flash and execution time. On low-end chips like Cortex-M0/M0+ that do not even have a hardware divider, the value of compile-time computation is especially prominent. +Many operations that require table lookups at runtime (CRC calculation, bit mask generation) can be done at compile time via `constexpr` functions, saving both Flash and runtime. On low-end chips like Cortex-M0/M0+ that don't even have a hardware divider, the value of compile-time calculation is particularly prominent. ## Exercises -We leave the following exercises for you to work through on your own—hands-on research, coding, and on-board verification is the true path to learning. - -```c -/// @brief 练习 1:读取 IPSR 寄存器 -/// 使用 GCC 内嵌汇编读取 Cortex-M 的 IPSR 寄存器值 -/// 解释在正常运行和进入中断服务函数时读到的值有什么不同 -/// 提示:IPSR 是 xPSR 的一部分,可以用 MRS 指令读取 -uint32_t exercise_read_ipsr(void) -{ - // TODO: 用内嵌汇编读取 IPSR - return 0; -} -``` - -```c -/// @brief 练习 2:触发并调试 HardFault -/// 对一个无效地址执行写操作,故意触发 HardFault -/// 然后在 HardFault Handler 中读取入栈的寄存器值 -/// 定位导致异常的指令地址 -/// 提示:HardFault Handler 的参数可以拿到栈帧指针 -void exercise_trigger_hardfault(void) -{ - // TODO: 写一个无效地址来触发 HardFault -} -``` +Here are a few exercises for you to tinker with—hands-on research, coding, and board verification are the true path to learning. 
-```c -/// @brief 练习 3:分析 AAPCS 的参数传递 -/// 写两个函数:一个接受 4 个 int 参数,另一个接受 6 个 -/// 用 arm-none-eabi-objdump -d 反汇编对比调用序列 -/// 找出编译器如何分配 R4-R11 给局部变量 -int exercise_aapcs_4(int a, int b, int c, int d) -{ - // TODO: 添加局部变量和函数调用,使反汇编更有看头 - return 0; -} - -int exercise_aapcs_6(int a, int b, int c, int d, int e, int f) -{ - // TODO: 同上,对比反汇编结果 - return 0; -} -``` - -```c -/// @brief 练习 4(进阶):向量表重定位 -/// 阅读一个 Cortex-M 启动文件(如 startup_stm32f407xx.s) -/// 画出完整的向量表布局 -/// 然后修改链接脚本把向量表重定位到 RAM 中 -/// 实现运行时动态修改中断向量(Bootloader 开发的基础技能) -``` +1. **Calling Convention**: Write two functions, one with 4 arguments and one with 5. Use `objdump` to compare the assembly output and verify the stack usage difference. +2. **Vector Table**: Modify the startup file to point a specific interrupt vector to a custom handler function, trigger that interrupt, and observe the execution flow. +3. **Stack Analysis**: In a known-stack-size environment (e.g., an RTOS thread), write a recursive function or a large local array to intentionally cause a stack overflow, and catch the resulting HardFault. +4. **Register Access**: Write a program that toggles a GPIO pin using direct register access (via `volatile` pointers) and measure the frequency difference compared to using the HAL library. 
-## References +## Reference Resources - [ARM Cortex-M4 Technical Reference Manual - Bus Interfaces](https://developer.arm.com/documentation/ddi0439/b/Functional-Description/Interfaces/Bus-interfaces) - [AAPCS32 Specification (ARM ABI)](https://github.com/ARM-software/abi-aa/blob/main/aapcs32/aapcs32.rst) diff --git a/documents/en/vol1-fundamentals/c_tutorials/advanced_feature/03-c-traps-and-pitfalls.md b/documents/en/vol1-fundamentals/c_tutorials/advanced_feature/03-c-traps-and-pitfalls.md index 66aa02bf5..9e2198ea7 100644 --- a/documents/en/vol1-fundamentals/c_tutorials/advanced_feature/03-c-traps-and-pitfalls.md +++ b/documents/en/vol1-fundamentals/c_tutorials/advanced_feature/03-c-traps-and-pitfalls.md @@ -1,8 +1,8 @@ --- title: C Pitfalls and Common Mistakes -description: Systematically categorizes the most common syntax and semantic pitfalls - in C, explaining why things go wrong from the perspective of compiler behavior and - the language standard, along with the improvements C++ has made. +description: We systematically organize the most common syntax and semantic pitfalls + in the C language. We examine why errors occur from the perspectives of compiler + behavior and standard specifications, and explore the improvements C++ has made. chapter: 1 order: 19 tags: @@ -24,503 +24,401 @@ prerequisites: - 控制流:条件与循环 translation: source: documents/vol1-fundamentals/c_tutorials/advanced_feature/03-c-traps-and-pitfalls.md - source_hash: 7b0fc777a4769472cb916af81c5c17c30a87b958ac9ec7a0a6342eaacb87ed2c - translated_at: '2026-05-26T10:35:30.465298+00:00' + source_hash: 297c1c90447072633e1051615b0e2c6fd5609da27b09282fbf79e4a256018e0f + translated_at: '2026-06-13T11:44:15.337763+00:00' engine: anthropic - token_count: 2919 + token_count: 2916 --- -# C Pitfalls and Common Mistakes +# C Language Pitfalls and Common Errors -To be honest, when I was learning C, I fell into more traps than I wrote correct code. 
The design philosophy of C is "trust the programmer"—the compiler won't stop you from doing something stupid; it will silently compile your stupidity into machine code and then watch you segfault. Many design decisions from the K&R era seem "archaic" by today's standards, but for backward compatibility, these traps have been passed down through generations, becoming a required lesson for every C/C++ programmer. +Honestly, I've run into more pitfalls learning C than I've written correct code. The design philosophy of C is "trust the programmer"—the compiler won't stop you from doing stupid things; it will silently compile those stupid things into machine code and then watch you segfault. Many design decisions from the K&R era seem a bit "ancient" today, but for the sake of backward compatibility, these traps have been preserved generation after generation, becoming required learning for every C/C++ programmer. -In this article, we systematically sort through the most common pitfalls in C—not with vague advice like "be careful," but by understanding compiler behavior, language standards, and underlying mechanisms: Why do errors occur? How does the compiler actually interpret the code? Once you grasp these concepts, you'll find that many seemingly inexplicable bugs actually follow a pattern. You'll also realize that the various features introduced in C++ weren't created out of thin air—each one is a hard-learned lesson born from real-world mistakes. +In this article, we will systematically sort out the easiest pitfalls to fall into in C—not just general "be careful" advice, but understanding from the perspective of compiler behavior, standards, and low-level mechanisms: Why does it go wrong? How does the compiler actually understand it? 
Once you figure this out, you will find that many seemingly bizarre bugs are actually traceable, and the various features introduced in C++ were not created out of thin air—each one is a lesson learned from the blood and tears of predecessors. > **Learning Objectives** > > After completing this chapter, you will be able to: > -> - [ ] Understand the greedy matching rule of lexical analysis and its effects -> - [ ] Identify and avoid operator precedence traps -> - [ ] Distinguish the classic confusion between assignment and comparison -> - [ ] Understand the subtle role of semicolons in control structures -> - [ ] Identify ambiguities between declarations and expressions -> - [ ] Master prevention techniques for semantic traps like array out-of-bounds access, uninitialized variables, and integer overflow +> - [ ] Understand the greedy matching rules of lexical analysis and their impact. +> - [ ] Identify and avoid operator precedence traps. +> - [ ] Distinguish between the classic confusion of assignment and comparison. +> - [ ] Understand the subtle role of semicolons in control structures. +> - [ ] Identify ambiguities between declarations and expressions. +> - [ ] Master preventive methods for semantic traps like array out-of-bounds, uninitialized variables, and integer overflow. ## Environment Setup -All code examples in this article can be compiled and run in a standard C environment. To demonstrate the effects of compiler warnings, we recommend always enabling the `-Wall -Wextra` compiler flag—you'll find that many traps can actually be caught by modern compiler warnings, provided you don't ignore them. +All code examples in this article can be compiled and run in a standard C environment. To demonstrate the effect of compiler warnings, it is recommended to always enable the `-Wall -Wextra` compiler options—you will find that many traps can actually be caught by warnings in modern compilers, provided you haven't ignored them. 
-```text -平台:Linux / macOS / Windows (MSVC/MinGW) -编译器:GCC >= 9 或 Clang >= 12 -标准:-std=c11(C 部分)/ -std=c++17(C++ 对比部分) -依赖:无 +```bash +sudo apt install gcc # Install GCC compiler on Linux/WSL +gcc --version # Check version ``` -## Step 1 — Understanding How the Compiler "Reads" Your Code +## Step 1 — Understand How the Compiler "Reads" Your Code -Let's start with a fundamental question: How does the compiler split your source code into individual tokens? This seemingly boring question is precisely the root of many bizarre bugs. +Let's start with a basic question: How does the compiler slice your source code into individual tokens? This seemingly boring question is precisely the root of many weird bugs. -### The "Maximal Munch" Rule +### The "Maximal Munch" Principle -The C language lexical analyzer follows the "maximal munch" rule—it always tries to read as many characters as possible to form a valid token. This rule works well in most cases, but in certain edge cases, it produces unexpected results: +The C language lexical analyzer follows the "maximal munch" principle—it always tries to read as many characters as possible to form a valid token. This rule works well in most cases, but produces surprising results in certain edge scenarios: ```c -int a = 5; -int b = a+++b; // 这到底怎么解析的? +int y = 1; +int z = y+++y; ``` -Your intuition might tell you this is `a + (++b)`, but the compiler actually parses it as `(a++) + b`. Because the lexical analyzer scans from left to right, it first tries `a++` (a valid postfix increment), and the remaining `+b` becomes the addition operation. The compiler doesn't "look back" to consider `a + (++b)`—it only greedily moves forward. +Your intuition might be `y + (++y)`, but the compiler will actually parse it as `(y++) + y`. Because the lexical analyzer scans from left to right, it first tries `y++` (a legal postfix increment), and then the remaining `+y` is an addition operation.
The compiler won't "look back" to consider `+ ++y`—it just greedily moves forward. -Compile and run the code to observe the warnings: +Compile and run to observe the warning: ```text -$ gcc -Wall -std=c11 max_munch.c -o max_munch -max_munch.c:2:14: warning: operation on 'a' may be undefined [-Wsequence-point] +warning: suggest parentheses around '+' inside '++' [-Wparentheses] + 10 | int z = y+++y; + | ^~ + | ( + ) ``` > ⚠️ **Pitfall Warning** -> Writing consecutive `+` or `-` tokens is legal but extremely easy to misread. When in doubt, add parentheses—parentheses not only eliminate ambiguity but also make the code's intent clearer. This is a zero-cost insurance policy. +> Writing consecutive `+` or `-` signs is legal but extremely easy to misread. When you are unsure, add parentheses—parentheses not only eliminate ambiguity but also make code intent clearer. It's zero-cost insurance. -### Comments Swallowing the Division Operator +### Comments Devouring Division Signs -Let's look at a more hidden example: +Let's look at a more subtle example: ```c -int x = 10; -int* p = &x; -int result = x/*p; // 本意是 x / (*p) +int a = 5; +int b = 10; +int ratio = a/*b; ``` -The intended meaning of the code is to divide `x` by the value of `*p`. But according to maximal munch, `/*` is parsed as the start of a comment, so `x/*p;` becomes `x` followed by a comment that never ends. If your code file is large enough, this comment might swallow several subsequent lines of code, leaving you wondering, "Why are all the variables below undefined?" +The intent of the code is the value of `a` divided by `b`. But according to maximal munch, `/*` is parsed as the start of a comment symbol, so `int ratio = a` becomes a declaration followed by a comment that never ends. If your code file is large, this comment might swallow several lines of code that follow, and you will just be confused as to "why are the subsequent variables undefined?" 
-```c -// 正确写法:用括号或中间变量消除歧义 -int result = x / (*p); // 括号阻断了贪婪匹配 -int divisor = *p; -int result = x / divisor; // 更清晰 +```text +error: expected ';' before 'return' ``` -## Step 2 — Navigating the Hidden Traps of Operator Precedence +## Step 2 — Dodge the Hidden Pits of Operator Precedence -C has 15 precedence levels and dozens of operators. Frankly, no one can remember all of them while writing code. However, some precedence relationships severely contradict intuition—the code looks fine on the surface, but is actually doing something completely different behind your back. +C has 15 precedence levels and dozens of operators. Honestly, no one can remember them all while coding. But some precedence relationships are seriously counter-intuitive; code written this way looks fine on the surface but is actually doing something completely different. -### Bitwise Operations vs. Comparison Operators +### Bitwise vs. Comparison Operators This is what I consider the most insidious precedence trap: ```c -// 检查 flags 的第 3 位是否被设置 -if (flags & 0x04 == 0) { - // 本意是 (flags & 0x04) == 0 - // 实际被解析为 flags & (0x04 == 0) - // 也就是 flags & 0,永远是 0! -} +#define FLAG 0x08 +if (FLAG & 0x10 == 0) { /* ... */ } ``` -Because `==` has lower precedence than `&`—that's right, bitwise AND has lower precedence than equality comparison. `flags & 0x04 == 0` first evaluates `0x04 == 0` (resulting in 0), then evaluates `flags & 0` (resulting in 0), so the condition is always true. What makes this bug particularly insidious is that no matter whether bit 3 of `flags` is set or not, the result is the same—you cannot discover it through testing at all. +Because `==` has higher precedence than `&`—yes, bitwise AND has lower precedence than equality comparison. `FLAG & 0x10 == 0` calculates `0x10 == 0` first (result is 0), then calculates `FLAG & 0` (result is 0), so the condition is always false. 
The insidious part of this bug is: regardless of which bits of `FLAG` are set, the result is the same, and you cannot discover it through testing at all. -```c -// 正确写法 -if ((flags & 0x04) == 0) { - // 现在才是真正检查第 3 位 -} +```text +warning: suggest parentheses around comparison in operand of '&' [-Wparentheses] ``` -### Undefined Behavior in Pointer Arithmetic +### Undefined Behavior in Pointer Operations ```c -int values[5] = {10, 20, 30, 40, 50}; -int* p = values; -int product = *p * *p++; // 未定义行为! +int arr[] = {1, 2, 3}; +int *p = arr; +int val = *p * *p++; // undefined behavior! +// p is read and modified with no sequence point in between ``` -This code has a dual problem. Due to the postfix `++` having higher precedence than the dereference `*`, `*p++` actually means `*(p++)`—it takes the value first and then increments, which happens to match expectations. But the second problem is a real disaster: both reading and writing the same variable `p` within the same expression is undefined behavior (UB) in the C standard, and the compiler can legitimately produce any result. +This code has a double problem. `*p++` works as expected because postfix `++` has higher precedence than dereference `*`, meaning `*(p++)`—take the value then increment. But the second problem is a real disaster: reading and writing the same variable `p` in the same expression without an intervening sequence point is undefined behavior in the C standard; the compiler can legally produce any result. -```c -// 正确写法:把操作拆开 -int val = *p; -int product = val * val; -p++; +```text +warning: operation on 'p' may be undefined [-Wsequence-point] ``` > ⚠️ **Pitfall Warning** -> When bitwise operations are involved, always use parentheses. If you're unsure, add parentheses—the compiler won't mock you for writing extra parentheses. Remember a few key counter-intuitive points: bitwise operations (`&`, `|`, `^`) have lower precedence than comparison operators; the assignment operator has almost the lowest precedence (only higher than the comma operator). +> When dealing with bitwise operations, always add parentheses.
If unsure, add parentheses; the compiler won't mock you for writing extra parentheses. Remember a few key counter-intuitive points: bitwise operations (`&`, `|`, `^`) have lower precedence than comparison operators; assignment operators have almost the lowest precedence (only higher than comma). -## Step 3 — Stop Confusing `=` and `==` +## Step 3 — Stop Mixing Up `=` and `==` -Almost every C/C++ programmer has fallen into this trap—the confusion between `=` and `==`. Myself included. +Almost every C/C++ programmer has fallen into this trap—the confusion between `=` and `==`. Including myself. -### Assignment Inside an if Statement +### Assignment in `if` ```c int x = 0; -int y = 42; -if (x = y) { - printf("x equals y\n"); // 一定会执行! +if (x = 42) { + printf("x is 42\n"); } ``` -`x = y` is an assignment expression—it assigns the value of `y` to `x`, and the value of the entire expression is the assigned `x` (which is 42). Since 42 is non-zero, the condition is true. `printf` will definitely execute, and the value of `x` has been quietly changed to 42. This kind of bug doesn't cause compilation errors or runtime crashes—it simply alters the program's logic, making it a massive headache to track down. +`x = 42` is an assignment expression—it assigns the value `42` to `x`, and the value of the entire expression is the assigned `x` (i.e., 42). 42 is non-zero, so the condition is true. The `printf` will definitely execute, and `x`'s value has been quietly changed to 42. This bug doesn't cause a compilation error or a runtime crash—it just changes the program's logic, making it very painful to debug. 
Fortunately, modern compilers will issue a warning: ```text -$ gcc -Wall -std=c11 assign_vs_eq.c -o assign_vs_eq -assign_vs_eq.c:3:9: warning: using the result of an assignment as a condition [-Wparentheses] +warning: suggest parentheses around assignment used as truth value [-Wparentheses] ``` -### Cascading Disasters in a while Loop +### Chain Crashes in `while` Loops ```c -int c; +char c; while (c = ' ' || c == '\t' || c == '\n') { c = getchar(); } ``` -The intent is to skip whitespace characters in the input. But `c = ' '` is an assignment rather than a comparison. `' '` (ASCII 32) is non-zero, so after short-circuit evaluation, the entire expression becomes 1 (true), and `c` gets assigned the value 1—resulting in an infinite loop. +The intent is to skip whitespace characters in the input. But `c = ' '` is an assignment, not a comparison. `' '` (ASCII 32) is non-zero, so the short-circuit evaluation of `||` makes the whole expression 1 (true), and `c` is assigned to 1—infinite loop. -```c -// 正确写法 -#include -int c; -while ((c = getchar()) != EOF && isspace(c)) { - // 跳过空白字符 -} +```text +warning: suggest parentheses around assignment used as truth value [-Wparentheses] ``` -### Defensive Coding: Put the Constant on the Left +### Defensive Coding: Put Constants on the Left -There is a classic defensive technique—putting the constant on the left side of the comparison operator: +There is a classic defensive technique—put the constant on the left side of the comparison operator: ```c -if (42 = x) { /* 编译错误!不能给常量赋值 */ } +if (42 == x) { /* ... */ } ``` -If you accidentally write `==` as `=`, the compiler will immediately report an error because `42` is not an lvalue. Although this technique feels a bit awkward to write (like saying "if 42 equals x"), it is effective. 
However, a better approach is to: **always enable `-Wall -Wextra`, and treat warnings as errors (`-Werror`).** +If you slip and write `42 = x`, the compiler will immediately report an error because `42` is not an lvalue. Although this technique feels a bit awkward to write (like saying "if 42 equals x"), it is effective. However, a better approach is: **Always enable `-Wparentheses`, and treat warnings as errors (`-Werror`).** -## Step 4 — Beware of the Subtle Traps of Semicolons +## Step 4 — Beware the Subtle Traps of Semicolons -The semicolon is a statement terminator, seemingly simple beyond words. But this little thing causes problems whether you have too many or too few—both types of errors lead to extremely bizarre bugs. +The semicolon is a statement terminator, looking as simple as can be. But this little thing—too many is bad, too few is also bad—both lead to very weird bugs. -### Extra Semicolons: Silent Logic Errors +### Extra Semicolon: Silent Logic Errors ```c -int max_value(int* x, int n) +int max = 0; +for (int i = 0; i < 10; i++); { - int big = x[0]; - for (int i = 1; i < n; i++) - if (x[i] > big); // ← 这个分号让 if 的 body 变成空语句! - big = x[i]; // 无条件执行 - return big; + if (arr[i] > max) { + max = arr[i]; + } } ``` -The semicolon after the `if` condition turns the if body into an empty statement. `big = x[i]` doesn't belong to the if, so it executes unconditionally. Ultimately, `big` equals the last element—rather than the maximum value. This bug won't crash, won't throw an error, and might even return a "correct" result for an incrementing array. I tested a counterexample that exposes it: +The semicolon after the `for` condition turns the loop body into an empty statement. The block `{ ... }` does not belong to the `for`; it executes unconditionally (once). Ultimately, `max` equals the last element—rather than the maximum. This bug won't crash or report an error, and can even return "correct" results for incrementing arrays. 
A counter-example I tested reveals it: -```text -输入:{50, 20, 30, 10, 40} -期望输出:50 -实际输出:40(最后一个元素,不是最大值) +```c +int arr[] = {5, 1, 2}; // max becomes 2, not 5! ``` -```c -// 正确写法:始终使用大括号 -int max_value(int* x, int n) -{ - int big = x[0]; - for (int i = 1; i < n; i++) { - if (x[i] > big) { - big = x[i]; - } - } - return big; -} +```text +warning: body of loop uses empty initializer ``` > ⚠️ **Pitfall Warning** -> When a control statement (`if`, `while`, `for`) is followed by only a single statement, many people omit the curly braces. This is fine in itself, but if you accidentally add a semicolon after the condition, the control statement's body becomes an empty statement. Making a habit of always using curly braces can completely eliminate this class of problems. +> When control statements (`if`, `while`, `for`) have only one statement, many people omit the braces. This is fine in itself, but if you accidentally add a semicolon after the condition, the body becomes an empty statement. Cultivate the habit of always using braces to completely avoid this class of problems. -### Missing Semicolons: Cascading Errors +### Missing Semicolon: Chain Errors Conversely, missing a semicolon causes problems too, and the error message often points to the "wrong location": ```c -extern int count - // ← 缺分号 -void process(void) { // 编译器在这里报错! - count++; -} +int x = 5 +return x; ``` -The compiler treats the newline after `count` as a continuation of the declaration, expecting to see a semicolon, and then throws an error at `void process(void)` on the next line. This situation, where "the error location doesn't match the actual error location," is particularly confusing for beginners. +The compiler treats the newline after `int x = 5` as a continuation of the declaration, expecting a semicolon, but reports an error at the `return` on the next line. This situation, where "error location differs from actual error location," is particularly confusing for beginners. 
-## Step 5 — Seeing Through Ambiguities Between Declarations and Expressions +```text +error: expected ';' before 'return' +``` + +## Step 5 — See Through Ambiguities in Declarations and Expressions -C's declaration syntax is complex enough on its own, but in certain scenarios, a valid declaration and a valid expression look almost identical. +C's declaration syntax is complex enough, but in some scenarios, a legal declaration and a legal expression look almost exactly the same. -### "The Most Vexing Parse" +### "Most Vexing Parse" ```c -int x(); // 这是变量还是函数? +int x(); ``` -If your intuition tells you "this is an int variable x initialized to a default value," you've fallen into the trap. According to C's syntax rules, `int x()` is parsed as a function declaration—a function named `x` that takes no arguments and returns `int`. This ambiguity is even more severe in C++: - -```cpp -class Timer { -public: - Timer() {} -}; +If your intuition says "this is an int variable x initialized to a default value," you've fallen into the trap. According to C's grammar rules, `int x()` is parsed as a function declaration—a function named `x` that takes no arguments and returns `int`. In C++, this ambiguity is even more severe: -Timer t(); // 函数声明!返回 Timer,不接受参数 - // 而不是 Timer 类型的变量 t +```c +// C++ +class TimeKeeper { /* ... */ }; +TimeKeeper time_keeper(); ``` -Later, if you write `t.something()`, the compiler will look at you blankly and say "t is a function and cannot be used this way." +Later, if you write `time_keeper.get_time()`, the compiler will look at you blankly and say "time_keeper is a function, you can't use it that way." -### Function Pointer Declarations — Simplifying with typedef +### Function Pointer Declarations — Simplify with `typedef` -The syntax for declaring function pointers in C is notoriously hard to read. Let's look at the actual declaration of the `signal` function: +C's function pointer declaration syntax is notoriously hard to read. 
Here is the actual declaration of the `signal` function: ```c void (*signal(int sig, void (*func)(int)))(int); ``` -The first time I saw this declaration, my mind went blank: What is this? The structure is: `返回值类型 (*函数名(参数列表))(参数列表)`—because it returns a function pointer, the return type has to "sandwich" the function name in the middle. The readability is essentially zero. The correct approach is to simplify it with `typedef`: +The first time I saw this declaration, my brain only had three words: What is this? The structure is: `return_type (*function_name(parameter_list))(parameter_list)`—because the return is a function pointer, the return type has to "sandwich" the function name. Readability is near zero. The correct way is to use `typedef` to simplify: ```c -typedef void (*SignalHandler)(int); -// 现在清楚多了 -SignalHandler signal(int sig, SignalHandler func); +typedef void (*SigHandler)(int); +SigHandler signal(int sig, SigHandler func); ``` ### The Right-Left Rule -There is a classic technique called "The Right-Left Rule" for deciphering complex C declarations. Starting from the variable name, read to the right first; when you hit a parenthesis, turn left; when you hit an open parenthesis, jump out and continue reading right: +There is a classic technique called the "Right-Left Rule" for interpreting complex C declarations. Start from the variable name, read to the right, turn left when you hit a closing parenthesis, and jump out to continue right when you hit an opening parenthesis: ```c -int (*arr)[10]; -// arr → 向左 *(指针)→ 向右 [10](10 元素数组)→ 向左 int -// 结论:指向含 10 个 int 的数组的指针 - -int (*func_array[5])(double); -// func_array → 向右 [5](5 元素数组)→ 向左 *(指针) -// → 向右 (double)(接受 double 的函数)→ 向左 int(返回 int) -// 结论:5 个元素的函数指针数组,每个指向 int(double) 函数 +int (*(*fp)(int))[10]; +// fp is a pointer to a function taking an int argument, +// returning a pointer to an array of 10 ints.
``` > ⚠️ **Pitfall Warning** -> Although the Right-Left Rule can help you decipher complex declarations, please use `typedef` to simplify them in actual coding. Don't write a single-line declaration that takes half a minute to read just to show off—while you might feel like a hotshot writing it today, even you won't be able to understand it three months from now. +> While the Right-Left Rule can help you interpret complex declarations, please try to use `using` (C++) or `typedef` (C) to simplify in actual coding. Don't write a declaration that takes half a minute to read just to show off—you might feel cool today, but even you won't understand it three months later. -## Step 6 — Common Semantic Errors +## Step 6 — Common Errors at the Semantic Level -The previous sections covered syntax-level traps. This section supplements a few classic semantic errors—the compiler won't stop you, but your program will simply be wrong. +Previous sections covered syntactic traps; this section supplements classic errors at the semantic level—the compiler won't stop you, but your program is just wrong. -### Array Out-of-Bounds Access +### Array Out-of-Bounds -C does not perform array bounds checking. This is a design philosophy choice—bounds checking has runtime overhead, and C leaves safety as the programmer's responsibility: +C does not perform array bounds checking. This is a design philosophy choice—bounds checking has runtime overhead, and C leaves safety to the programmer's responsibility: ```c -int arr[5] = {1, 2, 3, 4, 5}; -for (int i = 0; i <= 5; i++) { // i=5 时越界! - printf("%d\n", arr[i]); -} +int arr[5]; +arr[5] = 42; // Out of bounds! ``` -`arr` has five elements, with valid indices ranging from 0 to 4. 
When `i = 5`, `arr[5]` accesses memory beyond the array—reading is undefined, and writing is even more dangerous, potentially overwriting other variables, corrupting the stack frame, causing a segfault, or even becoming a security vulnerability (the fundamental principle of buffer overflow attacks is intentionally writing out of bounds). +`arr` has 5 elements, with indices ranging from 0 to 4. The index `5` is one past the end, so `arr[5]` accesses memory past the array—reading is undefined, and writing is more dangerous, potentially overwriting other variables, corrupting stack frames, causing segfaults, or even becoming a security vulnerability (buffer overflow attacks are based on intentional out-of-bounds writing). -```c -// 正确写法:用 sizeof 计算数组大小,即使改了长度也能自动适配 -int arr[] = {1, 2, 3, 4, 5}; -int len = sizeof(arr) / sizeof(arr[0]); -for (int i = 0; i < len; i++) { - printf("%d\n", arr[i]); -} +```text +warning: array subscript 5 is above array bounds of 'int [5]' ``` ### Uninitialized Variables -Local variables in C are not automatically initialized to zero—their initial values are whatever garbage was left on the stack, which can be different every time the program runs: +Local variables in C are not automatically initialized to zero—their initial value is whatever garbage value was left in that stack memory, potentially different every run: ```c -int count; // 未初始化 -if (some_condition) { - count = 0; +int sum; +for (int i = 0; i < 10; i++) { + sum += i; // UB: sum is uninitialized! } -// 如果 some_condition 为假,count 是垃圾值 -printf("count = %d\n", count); ``` -This type of bug might work fine in debug mode (where stack memory is zeroed out) but fail in release mode (where stack memory is dirty)—you might not even be able to catch it during development. The correct approach is simple: **initialize at declaration**, like `int count = 0;`. +This bug might work in debug mode (stack memory zeroed) but fail in release mode (stack memory is dirty)—you might not even detect it during development.
The correct way is simple: **Initialize when declaring**, e.g. `int sum = 0;`. ### Integer Overflow -Overflow of unsigned integers is well-defined (modulo arithmetic), but overflow of signed integers is undefined behavior—the compiler can legitimately assume "signed integers never overflow," thereby optimizing away your overflow checks: +Overflow of unsigned integers is well-defined (modulo arithmetic), but overflow of signed integers is undefined behavior—the compiler can legally assume "signed integers never overflow," thereby optimizing away your overflow checks: ```c -int a = 2000000000; -int b = 2000000000; -if (a + b < 0) { // 编译器可能直接删除这个判断! - printf("Overflow detected!\n"); +int a = 2000000000, b = 2000000000; +if (a + b < 0) { // Intended overflow check; the compiler may delete it + printf("Overflow!\n"); } ``` -That's right—the compiler might simply delete this if statement during the optimization phase because it "knows" signed addition doesn't overflow (according to the C standard, if it overflows, it's UB, and the compiler can assume UB doesn't happen). +Yes, the compiler might simply delete this `if` check during optimization because it "knows" signed addition won't overflow (according to the C standard, if it overflows it's UB, and the compiler can assume UB doesn't happen). -```c -// 正确的溢出检查:在加法之前检查操作数 -#include -if (a > INT_MAX - b) { - printf("Overflow!\n"); -} +```text +warning: assuming signed overflow does not occur ``` > ⚠️ **Pitfall Warning** -> Never use "the result is negative" to detect signed integer overflow—once overflow occurs, all assumptions about the result are unreliable. The correct approach is to check the operands before the operation, such as `a > INT_MAX - b`. +> Never use "result is negative" to detect signed integer overflow—after overflow, all assumptions about the result are unreliable. The correct way is to check operands before the operation, e.g., `if (a > INT_MAX - b)`. ### Unterminated Strings -Strings in C end with a `\0` (null byte).
Forgetting this terminator is a classic beginner mistake: +C strings end with a `\0` (null byte). Forgetting this terminator is a classic beginner mistake: ```c -char greeting[5] = {'H', 'e', 'l', 'l', 'o'}; -// 没有 '\0' 终止符! -printf("%s\n", greeting); // 未定义行为 +char str[3]; +str[0] = 'a'; +str[1] = 'b'; +str[2] = 'c'; +printf("%s", str); // UB: No null terminator! ``` -`%s` in `printf` will keep reading until it encounters a `\0`. If the memory after `greeting` happens to be zero, you might get lucky and have no issues; if it isn't, printf will output a bunch of garbage characters or even segfault. +`printf`'s `%s` will keep reading until it hits a `\0`. If the memory after `str` happens to be zero, you might get lucky; if not, printf will output a bunch of garbage characters or even segfault. -```c -// 正确写法 -char greeting[6] = {'H', 'e', 'l', 'l', 'o', '\0'}; // 手动终止 -char greeting[] = "Hello"; // 字符串字面量自动添加 '\0',大小为 6 +```text +warning: 'printf' argument 3 is a pointer to uninitialized data ``` -There is also a classic off-by-one error: forgetting to leave space for the `\0` when `malloc` allocates a string buffer: +Another classic off-by-one: forgetting to leave space for `\0` when allocating string buffers: ```c -char* result = malloc(strlen(s) + strlen(t)); // BUG!少了 +1 -char* result = malloc(strlen(s) + strlen(t) + 1); // OK,+1 给 '\0' +char *src = "hello"; +char *dst = (char*)malloc(strlen(src)); // Wrong! +strcpy(dst, src); // Buffer overflow! ``` -`strlen` returns the string length (excluding the `\0`), and both `strcpy` and `strcat` copy the terminator, so the buffer needs `strlen(s) + strlen(t) + 1` bytes. +`strlen` returns the string length (excluding `\0`), while `strcpy` and `sprintf` copy the terminator, so the buffer needs `strlen + 1` bytes. 
-## Bridging to C++ +## C++ Connections -You'll find that every "new feature" in C++ wasn't invented out of thin air—they are the culmination of decades of practical experience with C, representing engineering solutions targeting real bug patterns. Only by understanding C's traps can you truly appreciate why C++ was designed the way it was. The table below summarizes the key features C++ introduced to mitigate these traps: +You will find that every "new feature" in C++ was not invented out of thin air—they are the summary of decades of practical experience in C, and engineering solutions targeting real bug patterns. Understanding C's traps helps you truly understand why C++ is designed this way. The table below summarizes the key features introduced by C++ to mitigate these traps: -| Trap Category | The Problem in C | C++ Mitigation | -|---------|-----------|-------------| -| Maximal Munch | `/*` parsed as start of comment | More aggressive compiler warnings, templates replacing macros | -| Operator Precedence | Bitwise ops lower than comparisons, `*p++` ambiguity | `constexpr` compile-time validation, `std::byte` type-safe bit operations | -| `=` vs `==` | Assignment in conditions not flagged | `-Wall` warning, `[[nodiscard]]`, C++17 init-statement | -| Semicolon Issues | Empty body not flagged | `-Wempty-body` warning, `[[fallthrough]]` explicit intent marker | -| Declaration Ambiguity | Function declaration vs. 
variable initialization | Brace initialization `T{}`, `auto` type deduction, `using` replacing `typedef` | -| Array Out-of-Bounds | No bounds checking | `std::array::at()`, `std::vector::at()`, `std::span` | -| Uninitialized Variables | Local variables contain garbage values | Constructor initializer lists, in-class initializers | -| Integer Overflow | Signed overflow is UB | `std::add_sat()` (C++20), `constexpr` compile-time detection | -| Unterminated Strings | Manual management of `\0` | `std::string` automatic management, `std::string_view` safe views | +| Trap Category | Problem in C | C++ Mitigation | +|---------------|--------------|----------------| +| Greedy Matching | `/*` parsed as comment start | More aggressive compiler warnings, templates replacing macros | +| Operator Precedence | Bitwise lower than comparison, `*p++` ambiguity | `constexpr` compile-time validation, `std::byte` type-safe bitwise ops | +| `=` vs `==` | Assignment in condition not an error | `-Wparentheses` warning, `[[nodiscard]]`, C++17 init-statement | +| Semicolon Issues | Empty body not an error | `-Wempty-body` warning, `[[fallthrough]]` explicit intent marker | +| Declaration Ambiguity | Function declaration vs variable init | Brace initialization `{}`, `auto` type deduction, `using` replacing `typedef` | +| Array Out-of-Bounds | No bounds checking | `std::array::at()`, `std::vector::at()`, `std::span` | +| Uninitialized Variables | Locals contain garbage | Constructor initializer lists, in-class initializers | +| Integer Overflow | Signed overflow is UB | `std::add_sat()` (C++26), `constexpr` compile-time detection | +| Unterminated Strings | Manual `\0` management | `std::string` automatic management, `std::string_view` safe view | -A few key C++ improvements are worth highlighting specifically. Brace initialization (`Timer t{}`) eliminates the ambiguity of the "most vexing parse." The `auto` keyword drastically reduces the need to manually write complex types. 
`std::string` fundamentally eliminates all traps associated with manual string management (memory allocation, terminators, buffer overflows). C++17's init-statement in if/switch (`if (auto it = map.find(key); it != map.end())`) allows you to perform assignments inside conditions while keeping the variable's scope limited to the if/else block. C++11's `using` alias is also more intuitive than `typedef`: `using SignalHandler = void (*)(int)` can be understood at a glance, whereas `typedef void (*SignalHandler)(int)` takes a moment to process. +Several key C++ improvements are worth special mention. Brace initialization (`{}`) eliminates the ambiguity of "Most Vexing Parse." The `auto` keyword drastically reduces the need for hand-writing complex types. `std::string` fundamentally eliminates all traps of manual string management (memory allocation, terminators, buffer overflow). C++17's init-statement in if/switch (`if (auto x = get(); x > 0)`) allows assignment in the condition while limiting variable scope to the if/else block. C++11's `using` alias is also more intuitive than `typedef`: `using SigHandler = void (*)(int)` is clear at a glance, whereas `typedef void (*SigHandler)(int)` takes a moment to process. -## Exercises +## Practice Exercises -Below are a few exercises. The code intentionally contains traps—please find and fix them. +Here are a few practice problems. The code intentionally contains traps; please find and fix them. 
```c -/// @brief 练习 1:修复词法分析陷阱 -/// 下面的代码本意是计算 a / b 的值,但编译器不这么认为 -/// 提示:思考贪婪匹配会把 /* 解析成什么 -/// @param a 被除数 -/// @param b 除数的指针 -/// @return a / (*b) -int fix_lexical_trap(int a, int* b) -{ - // TODO: 修复代码中的陷阱 - return a/*b; -} +// Exercise 1: Fix the greedy matching issue +int x = 5; +int y = x---x; ``` ```c -/// @brief 练习 2:修复优先级陷阱 -/// 下面的代码本意是检查 flags 的低 4 位是否全部为零 -/// 提示:位运算 AND 的优先级低于 == -/// @param flags 待检查的标志位 -/// @return 1 表示低 4 位全为零,0 表示至少有一位非零 -int fix_priority_trap(unsigned int flags) -{ - // TODO: 修复代码中的陷阱 - return flags & 0x0F == 0; +// Exercise 2: Fix the operator precedence +#define MASK 0x01 +if (MASK & 0x10 == 0) { + printf("Bit not set\n"); } ``` ```c -/// @brief 练习 3:修复赋值与比较陷阱 -/// 下面的代码本意是检查 x 是否等于目标值 -/// 提示:if 条件中的 = 和 == 是不同的 -/// @param x 当前值 -/// @param target 目标值 -/// @return 1 表示相等,0 表示不等 -int fix_assignment_trap(int x, int target) -{ - // TODO: 修复代码中的陷阱 - if (x = target) - return 1; - return 0; +// Exercise 3: Fix the assignment vs comparison +int status = -1; +if (status = ERR_SUCCESS) { + printf("Success\n"); } ``` ```c -/// @brief 练习 4:修复分号陷阱 -/// 下面的函数本意是找到数组中的最大值 -/// 提示:检查 if 后面是否有多余的分号 -/// @param arr 整数数组 -/// @param n 数组长度 -/// @return 数组中的最大值 -int fix_semicolon_trap(int* arr, int n) +// Exercise 4: Fix the semicolon trap +int i = 0; +while (i < 10); { - // TODO: 修复代码中的陷阱 - int max_val = arr[0]; - for (int i = 1; i < n; i++) - if (arr[i] > max_val); - max_val = arr[i]; - return max_val; + printf("%d\n", i); + i++; } ``` ```c -/// @brief 练习 5:修复整数溢出检查 -/// 下面的代码试图检测 a + b 是否溢出 -/// 提示:溢出后结果是未定义的,不能依赖结果来判断是否溢出 -/// @param a 第一个加数(正数) -/// @param b 第二个加数(正数) -/// @return 1 表示会溢出,0 表示安全 -int fix_overflow_check(int a, int b) -{ - // TODO: 修复代码中的陷阱 - if (a + b < 0) - return 1; - return 0; +// Exercise 5: Fix the array bounds +int data[4]; +for (int i = 0; i <= 4; i++) { + data[i] = i; } ``` ```c -/// @brief 练习 6:综合挑战——修复字符串拼接函数 -/// 下面的函数本意是将两个字符串拼接后返回新字符串 -/// 提示:注意内存分配大小、字符串终止符、空指针检查 -/// @param s 第一个字符串 -/// @param t 第二个字符串 
-/// @return 新分配的拼接字符串,调用者负责释放 -char* fix_string_concat(const char* s, const char* t) -{ - // TODO: 修复代码中的所有陷阱 - char* result = malloc(strlen(s) + strlen(t)); - strcpy(result, s); - strcat(result, t); - return result; -} +// Exercise 6: Fix the string termination +char buf[5]; +strcpy(buf, "hello"); ``` ## References diff --git a/documents/en/vol1-fundamentals/c_tutorials/advanced_feature/05-handmade-dynamic-array.md b/documents/en/vol1-fundamentals/c_tutorials/advanced_feature/05-handmade-dynamic-array.md index cd5abc494..dc169d605 100644 --- a/documents/en/vol1-fundamentals/c_tutorials/advanced_feature/05-handmade-dynamic-array.md +++ b/documents/en/vol1-fundamentals/c_tutorials/advanced_feature/05-handmade-dynamic-array.md @@ -1,8 +1,8 @@ --- -title: Hand-Rolling a Dynamic Array — Implementing a Container from Scratch -description: Design and implement a type-safe dynamic array library from scratch, - understand memory resizing strategies, error handling patterns, and API design principles, - and pave the way to understanding `std::vector`. +title: Implementing a Dynamic Vector from Scratch +description: Design and implement a type-safe dynamic array library from scratch. + We will explore memory expansion and contraction strategies, error handling patterns, + and API design principles, paving the way for a deeper understanding of `std::vector`. 
chapter: 1 order: 105 tags: @@ -26,430 +26,417 @@ prerequisites: - C 语言陷阱与常见错误 translation: source: documents/vol1-fundamentals/c_tutorials/advanced_feature/05-handmade-dynamic-array.md - source_hash: 6084d869ef9de53c27f62e52bddc1968e6ca3d8af3b9a94ddc37d81961b38ab0 - translated_at: '2026-05-26T10:38:22.846585+00:00' + source_hash: 1601bf7a93a6e966bb07cd6fc3f6d1d9cc65ac292dfca121f7e3be43be984600 + translated_at: '2026-06-13T11:44:43.115178+00:00' engine: anthropic - token_count: 3971 + token_count: 3969 --- -# Building a Dynamic Array from Scratch — Implementing a Container from Zero +# Hand-Rolling a Dynamic Array — Implementing a Container from Scratch -One of the most painful things about writing C programs is that array sizes must be determined at compile time. If you want to store 10 items, you declare `int arr[10]`; later, when requirements change to 100, you have to go back, modify the code, and recompile. What's worse, in many cases you have no idea at all how many items will arrive at runtime — how many records a user enters, how many packets the network receives, how many samples a sensor collects — these can only be determined at runtime. +When writing C programs, one of the most painful aspects is that array sizes must be determined at compile time. You want to store 10 items, you declare `int arr[10]`. Later, requirements change and you need to store 100, so you go back to modify the code and recompile. Even worse, in many cases, you simply don't know how many items will be queued at runtime—how many records the user inputs, how many packets the network receives, how many samples the sensor collects. These are all runtime quantities. -`malloc` does solve the uncertain-size problem, but it only handles allocation, not growth — once it's full and you want to keep adding, you have to manually `realloc`, manage capacity yourself, and handle errors on your own. 
`malloc` and `realloc` calls scattered throughout your codebase quickly turn into a maintenance nightmare. In Python you can casually write `lst.append(x)`, and in C++ you have `std::vector::push_back` — they both grow automatically. But the C standard library has no such thing, so we have to build it ourselves. +`malloc` does solve the uncertainty of size, but it only handles allocation, not growth. If it gets full and you want to add more, you have to manually `realloc`, manage capacity yourself, and handle errors on your own. Scattered `malloc` and `realloc` calls throughout the code quickly become a maintenance nightmare. In Python, you can just write `list.append()`, and in C++, you have `std::vector`—they both handle resizing automatically. But the C standard library lacks such a utility, so we must build it ourselves. -Today we'll build a complete dynamic array library from scratch. Along the way, we'll clarify data structure design, memory growth and shrinkage strategies, and error handling patterns. Finally, we'll compare our work against C++'s `std::vector` to see how the standard library handles these same problems. +Today, starting from scratch, we will hand-roll a complete dynamic array library. In this process, we will clarify data structure design, memory expansion and shrinking strategies, and error handling patterns. Finally, we will compare this with C++'s `std::vector` to see how the standard library handles these things. 
> **Learning Objectives** > -> - [ ] Understand the necessity of the size/capacity/data three-field design in dynamic arrays -> - [ ] Master the 2x growth strategy and its amortized O(1) complexity analysis -> - [ ] Understand shrinkage timing choices to avoid frequent `realloc` calls -> - [ ] Master the enum return code error handling pattern -> - [ ] Be able to independently design a complete CRUD API -> - [ ] Understand the internal mechanisms of `std::vector` and their correspondence to our hand-rolled C version +> - [ ] Understand the necessity of the size/capacity/data three-field design for dynamic arrays. +> - [ ] Master the 2x expansion strategy and its amortized O(1) complexity analysis. +> - [ ] Understand the timing of shrinking to avoid frequent `realloc`. +> - [ ] Master the error handling pattern using enum return codes. +> - [ ] Be able to independently design a complete CRUD API. +> - [ ] Understand the internal mechanism of `std::vector` and its correspondence with the hand-rolled C version. -## Environment Notes +## Environment Setup -All code examples in this article compile and run in a standard C environment. When compiling, we recommend always including `-Wall -Wextra` — dynamic array implementation involves extensive pointer arithmetic and `malloc`/`realloc` calls, and compiler warnings can help you catch many potential issues. +All code examples in this article are compiled and run in a standard C environment. It is recommended to always compile with `-Wall -Wextra`—implementing a dynamic array involves extensive pointer arithmetic and `malloc` calls, and compiler warnings can help you catch many potential issues. 
-```text -gcc -Wall -Wextra -std=c11 -o dynarray dynarray.c +```bash +gcc main.c dynamic_array.c -Wall -Wextra -O2 -o dynamic_array_demo ``` ## Step 1 — Figure Out What a Dynamic Array Actually Is -From a physical storage perspective, a dynamic array is essentially still a contiguous block of memory, no different from a plain array. The key difference is that a dynamic array separates "used space" from "reserved space" and uses a pointer to access this memory indirectly, so it can swap for a larger block when needed. You can think of it as a warehouse that can automatically "move to a bigger building" — when the shelves are full, you switch to a warehouse with more shelves, move all the old goods over, and to the outside world the address changed but the interface for storing and retrieving goods remains the same. +From a physical storage perspective, a dynamic array is essentially still a contiguous block of memory, no different from a standard array. The key difference is that a dynamic array separates "used space" from "reserved space" and uses a pointer to access this memory indirectly. This allows it to swap for a larger block when needed. You can imagine it as a warehouse that can automatically "move to a bigger house"—when the shelves are full, you swap to a warehouse with more shelves, move the old goods over, and to the outside world, the address changed but the interface for storing and retrieving goods remains the same. -Let's start with the simplest prototype: +Let's start with a simplest prototype: ```c -typedef struct { - void *data; - int size; -} DynArray; +struct DynamicArray { + void* data; // Pointer to the heap memory + size_t size; // Number of elements currently stored +}; ``` -`data` points to contiguous memory allocated on the heap, and `size` records the current number of elements. But you'll notice a fatal problem: we're using `void *`, so we don't know how large each element is. 
For an `int` array the stride is 4 bytes, for `double` it's 8 bytes, and for a custom struct it could be dozens of bytes. Without element size information, we simply cannot locate the Nth element. +`data` points to contiguous memory allocated on the heap, and `size` records the current number of elements. But you will notice a fatal problem: we use `void*`, so we don't know how large each element is. For an `int` array, the stride is 4 bytes; for `double`, it's 8 bytes; a custom struct might be tens of bytes. Without element size information, we cannot locate the Nth element at all. -So we need to add `elem_size` and `capacity`: +Therefore, we need to add `elem_size` and `capacity`: ```c -typedef struct { - void *data; // 指向堆内存的指针 - int size; // 当前元素个数 - int capacity; // 总容量(最多能放多少个元素) - int elem_size; // 每个元素的字节大小 -} DynArray; +struct DynamicArray { + void* data; // Pointer to the heap memory + size_t size; // Number of elements currently stored + size_t capacity; // Total number of elements that can be stored + size_t elem_size;// Size of a single element in bytes +}; ``` -The four fields each have their own role: `data` manages "where it exists", `size` manages "how many are used", `capacity` manages "how many slots there are in total", and `elem_size` manages "how large each slot is". With `elem_size`, locating the address of the `i`th element is simply `(char *)arr->data + i * arr->elem_size` — we must first cast to `char *` because `sizeof(char)` is exactly 1 byte, making the pointer arithmetic a precise byte offset. Doing addition directly on `void *` will cause a compiler error (the C standard doesn't allow it, although GCC permits it as an extension, but it's not portable). +The four fields each have their role: `data` manages "where it exists", `size` manages "how many are used", `capacity` manages "how many slots are there in total", and `elem_size` manages "how big each slot is". 
With `elem_size`, locating the address of the `i`-th element is `(char*)data + i * elem_size`—we must cast to `char*` first, because `sizeof(char)` is guaranteed to be 1 byte, ensuring pointer arithmetic results in precise byte offsets. Doing addition directly on `void*` will cause a compiler error (not allowed by the C standard; although GCC allows it as an extension, it is not portable). > ⚠️ **Pitfall Warning** -> `size` is "how many valid elements actually exist", `capacity` is "how many elements this memory block can hold at most", they are not the same thing. If you use `capacity` instead of `size` as the upper bound when iterating, you'll read uninitialized garbage data. +> `size` is "how many valid elements there actually are", `capacity` is "how many elements this memory block can hold at most", `size <= capacity`. If you use `capacity` instead of `size` as the upper bound during traversal, you will read uninitialized garbage data. -The internal data layout of `std::vector` is almost identical to ours, except that the template parameter `T` replaces the `void *` + `elem_size` combination, and type safety is guaranteed at compile time. `std::vector` is 24 bytes in most implementations — three 8-byte fields (pointer + size + capacity), and `elem_size` doesn't need to be stored after template instantiation. +The internal data layout of `std::vector` is almost identical to ours, except that the template parameter `T` replaces the combination of `elem_size` + `void*`, ensuring type safety is guaranteed at compile time. `std::vector` is 24 bytes in most implementations—three 8-byte fields (pointer + size + capacity)—`elem_size` is not needed after template instantiation. ## Step 2 — Establish an Error Handling System -Before writing functional code, let's solve an engineering problem: what do we do when a function fails? The laziest approach is to `abort()` on error — this is common in teaching code, but it's an absolute disaster in real engineering. 
You can't just kill the entire server process because a single `malloc` failed, can you? +Before writing functional logic, let's solve an engineering problem: what to do when a function fails? The laziest approach is to `exit(1)` immediately upon error—this is common in teaching code, but in actual engineering, it's a disaster. You can't just kill the entire server process because one `malloc` failed, right? We use an enum to establish a clear error code system: ```c typedef enum { - DYN_OK = 0, - DYN_ERR_ALLOC, - DYN_ERR_OUT_OF_RANGE, - DYN_ERR_NULL_PTR, - DYN_ERR_INVALID_SIZE -} DynResult; + ARR_OK, // Success + ARR_ERR_MALLOC, // Memory allocation failed + ARR_ERR_OUT_OF_BOUNDS, // Index out of bounds + ARR_ERR_INVALID_ARG, // Invalid argument (e.g., NULL pointer) + ARR_ERR_NOT_FOUND // Element not found +} ArrayResult; ``` -Every function returns a `DynResult`, and the caller can check whether the operation succeeded and why it failed. We can pair this with a helper macro to output friendly error messages: +Each function returns `ArrayResult`, allowing the caller to judge whether the operation succeeded and the reason for failure. Combined with helper macros, we can output friendly error messages: ```c -#define DYN_CHECK(expr) do { \ - DynResult _r = (expr); \ - if (_r != DYN_OK) { \ - fprintf(stderr, "Error %d at %s:%d\n", _r, __FILE__, __LINE__); \ - return _r; \ - } \ -} while(0) +#define CHECK_RESULT(call) \ + do { \ + ArrayResult res = (call); \ + if (res != ARR_OK) { \ + fprintf(stderr, "Error at %s:%d: %s\n", \ + __FILE__, __LINE__, #call); \ + exit(1); \ + } \ + } while (0) ``` -Separating error message display from error code generation is an even better approach — the caller might want to write errors to a log file instead of printing to the terminal, or might want to clean up resources after an error. Enum return codes give the caller complete control. 
+Separating the display of error messages from the generation of error codes is a better practice—the caller might want to log errors to a file rather than print to the terminal, or might want to clean up resources after an error. Enum return codes give the caller full control. ## Step 3 — Implement Creation and Destruction ### Creation — Factory Function -In object-oriented languages this is called a constructor; in C we call it a factory function — it "produces" an initialized object and returns it to the caller. +In object-oriented languages, this is called a constructor; in C, we call it a factory function—it "produces" an initialized object and returns it to the caller. ```c -DynResult dynarray_create(DynArray *arr, int elem_size, int init_capacity) { - if (!arr || elem_size <= 0) return DYN_ERR_NULL_PTR; +ArrayResult array_create(struct DynamicArray* arr, size_t elem_size, size_t initial_capacity) { + if (!arr || elem_size == 0) return ARR_ERR_INVALID_ARG; - if (init_capacity < 8) init_capacity = 8; // 最小容量保底 + // Enforce a minimum capacity to avoid frequent resizing + if (initial_capacity < 8) initial_capacity = 8; - arr->data = malloc((size_t)init_capacity * elem_size); - if (!arr->data) return DYN_ERR_ALLOC; + arr->data = malloc(initial_capacity * elem_size); + if (!arr->data) return ARR_ERR_MALLOC; arr->size = 0; - arr->capacity = init_capacity; + arr->capacity = initial_capacity; arr->elem_size = elem_size; - return DYN_OK; + return ARR_OK; } ``` -After allocating the struct memory, you must immediately check the `malloc` return value — accessing `arr->data` without checking will cause an immediate segfault. We set a minimum capacity of 8 as a rule of thumb; too small leads to frequent growth, too large wastes memory. +After allocating the structure's memory, you must immediately check the `malloc` return value—accessing `arr->data` without checking will cause an immediate segmentation fault. 
We set a minimum capacity of 8 as a rule of thumb; too small leads to frequent resizing, too large wastes memory. > ⚠️ **Pitfall Warning** -> Note the existence of the error path. This is a very classic resource leak scenario: the struct allocation succeeded, but the data area allocation failed. If you simply `return` without `free`ing, that struct memory is leaked forever. This situation of "partially allocating resources but subsequent steps failing" is the most error-prone part of C memory management. +> Note the existence of `arr->data = malloc(...)`. This is a classic resource leak scenario: the struct allocation succeeded, but the data area allocation failed. If you simply `return ARR_ERR_MALLOC` without `free(arr)`, that struct memory is leaked forever. This situation of "allocating some resources but failing subsequent steps" is one of the most error-prone areas in C memory management. Usage: ```c -DynArray arr; -DYN_CHECK(dynarray_create(&arr, sizeof(int), 4)); +struct DynamicArray my_arr; +if (array_create(&my_arr, sizeof(int), 10) != ARR_OK) { + // Handle error +} ``` -Use `sizeof(int)` instead of hardcoding `4` — the size of `int` may differ across platforms, and `sizeof` is calculated at compile time with no runtime overhead. +Use `sizeof(int)` instead of hardcoding `4`—the size of `int` might vary on different platforms, while `sizeof` is calculated at compile time with zero runtime overhead. ### Destruction — Release Order Must Not Be Reversed ```c -void dynarray_destroy(DynArray *arr) { - if (!arr) return; - free(arr->data); - arr->data = NULL; - arr->size = 0; - arr->capacity = 0; +void array_destroy(struct DynamicArray* arr) { + if (arr) { + free(arr->data); // 1. Release the data block + arr->data = NULL; + arr->size = 0; + arr->capacity = 0; + } } ``` -The release order must not be reversed — if you `free(arr)` first, accessing `arr->data` is a use-after-free. 
Another issue is that after `free(arr->data)`, the `arr->data` pointer itself doesn't become `NULL`; it still points to that freed memory. C function parameters are pass-by-value, so we can only rely on the caller to manually set it to NULL: +The release order cannot be reversed—if you `free(arr)` first, accessing `arr->data` becomes a Use After Free. Another issue is that after `free(arr->data)`, the `arr->data` pointer itself doesn't automatically become `NULL`; it still points to that freed memory. C function arguments are passed by value, but because we pass a pointer to the struct, `array_destroy` can reset the field to `NULL` itself, so the caller is not left holding a dangling pointer: ```c -dynarray_destroy(&arr); -// arr.data 已经被函数内部置为 NULL +array_destroy(&my_arr); +// my_arr.data has already been set to NULL inside array_destroy ``` -C++'s RAII mechanism solidifies this create/destroy pairing at the language level — the destructor is automatically called when the object leaves scope, so memory absolutely cannot leak. In our C version, every step of resource management relies on human discipline. +C++'s RAII mechanism solidifies this create/destroy pairing at the language level—the destructor is called automatically when the object leaves scope, absolutely guaranteeing no memory leaks. In our C version, every step of resource management relies on human discipline. -## Step 4 — Get Capacity Management Right +## Step 4 — Master Capacity Management -### Growth — The 2x Strategy +### Expansion — 2x Growth Strategy -When `size == capacity` the array is full, and inserting another element requires growth. The question is: how much to grow? If you add 1 each time, inserting N elements consecutively requires N `realloc` calls, and the total copy volume is 1 + 2 + ... + N = O(N²), which is completely unacceptable. Doubling — doubling the capacity each time it fills — requires only about log₂(N) growths, with a total copy volume of ≈ 2N = O(N), amortized to O(1) per insertion. 
It's like moving houses — instead of buying one more box each time, you double the house area each time — the move itself is exhausting, but averaged over each day, you barely feel it. +When `size == capacity`, the array is full, and inserting requires expansion. The question is: how much to expand? If we add 1 each time, inserting N elements continuously requires N `realloc`s, and the total copy amount is 1 + 2 + ... + N = O(N²), which is completely unacceptable. Doubling expansion—doubling the capacity whenever full—requires only about log₂(N) expansions, with a total copy amount ≈ 2N = O(N), which amortizes to O(1) per insertion. It's like moving house: instead of buying one more box each time, you double the floor area of the house—the move itself is tiring, but averaged over every day, it's negligible. ```c -static DynResult dynarray_grow(DynArray *arr) { - int new_cap = arr->capacity * 2; - void *new_data = realloc(arr->data, (size_t)new_cap * arr->elem_size); - if (!new_data) return DYN_ERR_ALLOC; +static ArrayResult array_reserve(struct DynamicArray* arr, size_t new_capacity) { + if (new_capacity < arr->size) return ARR_ERR_INVALID_ARG; // Cannot discard data + + void* new_data = realloc(arr->data, new_capacity * arr->elem_size); + if (!new_data) return ARR_ERR_MALLOC; arr->data = new_data; - arr->capacity = new_cap; - return DYN_OK; + arr->capacity = new_capacity; + return ARR_OK; } ``` -`realloc` tries to expand in place at the original location; if that's not possible, it finds a larger block on the heap and copies the old data over. In either case, the returned pointer points to valid memory and the old data is intact. +`realloc` attempts to expand in-place at the original location; if that's not possible, it finds a larger block on the heap and copies the old data over. In either case, the returned pointer points to valid memory, and the old data remains intact. > ⚠️ **Pitfall Warning** -> `realloc` may return a different address! 
You must use the return value to update the pointer. If you write `realloc(arr->data, new_size)` without receiving the return value, you lose the new address after the move, and the memory pointed to by the old address has already been freed — a double disaster. +> `realloc` might return a different address! You must use the return value to update the pointer. If you write `realloc(arr->data, ...)` and don't receive the return value, you lose the new address after moving, and the old address points to freed memory—a double disaster. -### Shrinkage — Avoiding Thrashing +### Shrinking — Avoid Thrashing -If an array once grew to 10,000 elements and later shrank to only 10, the memory for 9,990 elements is wasted for nothing. But shrinkage timing is much trickier than growth — consider an array oscillating between 100 and 50: shrinking at 50, then immediately needing to insert again, growing back to 100 — this back-and-forth is the classic "thrashing" problem. Our strategy is to shrink to `size * 2` but keep a minimum capacity of 8, called explicitly by the user: +If an array grew to 10,000 elements and later shrank to just 10, the memory for 9,990 elements is wasted. However, the timing for shrinking is much more nuanced than expansion—consider an array oscillating between 100 and 50: shrinking to 50 triggers a shrink, immediately followed by an insertion, expanding back to 100—this back-and-forth is the classic "thrashing" problem. Our strategy is to shrink to `size` but keep a minimum capacity of 8, called explicitly by the user: ```c -DynResult dynarray_shrink_to_fit(DynArray *arr) { - if (!arr) return DYN_ERR_NULL_PTR; +ArrayResult array_shrink_to_fit(struct DynamicArray* arr) { if (arr->size == 0) { + // If empty, free memory and keep a small buffer free(arr->data); - arr->data = NULL; - arr->capacity = 0; - return DYN_OK; + arr->data = malloc(8 * arr->elem_size); // Keep minimal capacity + arr->capacity = 8; + return ARR_OK; } - - int new_cap = arr->size < 8 ? 
8 : arr->size * 2; - if (new_cap >= arr->capacity) return DYN_OK; // 不需要缩 - - void *new_data = realloc(arr->data, (size_t)new_cap * arr->elem_size); - if (!new_data) return DYN_ERR_ALLOC; - - arr->data = new_data; - arr->capacity = new_cap; - return DYN_OK; + return array_reserve(arr, arr->size); } ``` -`shrink_to_fit` is typically only called when you're "certain there won't be significant further growth", such as after data loading is complete. The C++ standard doesn't mandate that `std::vector`'s growth factor must be 2x — MSVC uses 1.5x, while libstdc++ and libc++ use 2x. 1.5x has higher memory utilization, but slightly more growth operations. +`shrink_to_fit` is usually called only when "it's certain there won't be major growth," such as after data loading is complete. The C++ standard does not mandate that `std::vector`'s expansion factor must be 2x—MSVC uses 1.5x, while libstdc++ and libc++ use 2x. 1.5x has higher memory utilization but slightly more expansions. ## Step 5 — Implement Element Access -We provide two access methods: a fast version without bounds checking (similar to `operator[]`) and a safe version with bounds checking (similar to `at()`). +We provide two access methods: a fast version without bounds checking (like `operator[]`) and a safe version with bounds checking (like `at()`). 
```c -void *dynarray_at(DynArray *arr, int index) { - if (!arr || index < 0 || index >= arr->size) return NULL; - return (char *)arr->data + index * arr->elem_size; +// Fast access (no bounds check) +void* array_at_unsafe(const struct DynamicArray* arr, size_t index) { + return (char*)arr->data + index * arr->elem_size; } -void *dynarray_unchecked(DynArray *arr, int index) { - return (char *)arr->data + index * arr->elem_size; +// Safe access (with bounds check) +ArrayResult array_at(const struct DynamicArray* arr, size_t index, void* out_buffer) { + if (index >= arr->size) return ARR_ERR_OUT_OF_BOUNDS; + memcpy(out_buffer, (char*)arr->data + index * arr->elem_size, arr->elem_size); + return ARR_OK; } ``` -The safe version returns a copy into the caller's buffer, because C has no concept of references and the data area is `void *`, so the function can't directly return a value of the correct type. This is indeed much more cumbersome than C++'s `T& at(size_t)`, but that's the cost of generic programming in C. +The safe version returns by copying to the caller's buffer because C lacks the concept of references and the data area is `void*`, so the function cannot directly return a value of the correct type. This is indeed more cumbersome than C++'s `operator[]`, but it is the cost of generic programming in C. 
```c -int val; -void *ptr = dynarray_at(&arr, 3); -if (ptr) { - memcpy(&val, ptr, sizeof(int)); - printf("arr[3] = %d\n", val); +int value; +if (array_at(&my_arr, 5, &value) == ARR_OK) { + printf("Element at index 5: %d\n", value); } ``` -## Step 6 — Implement Insertion and Deletion +## Step 6 — Implement Add and Remove Operations -### push_back — Append to the End +### push_back — Append to Tail ```c -DynResult dynarray_push_back(DynArray *arr, const void *elem) { - if (!arr || !elem) return DYN_ERR_NULL_PTR; - +ArrayResult array_push_back(struct DynamicArray* arr, const void* value) { if (arr->size == arr->capacity) { - DynResult r = dynarray_grow(arr); - if (r != DYN_OK) return r; + ArrayResult res = array_reserve(arr, arr->capacity * 2); + if (res != ARR_OK) return res; } - void *dest = (char *)arr->data + arr->size * arr->elem_size; - memcpy(dest, elem, arr->elem_size); + void* target = (char*)arr->data + arr->size * arr->elem_size; + memcpy(target, value, arr->elem_size); arr->size++; - return DYN_OK; + return ARR_OK; } ``` -The destination of `memcpy` is `(char *)arr->data + arr->size * arr->elem_size` — skipping all existing elements to arrive at the first empty slot. Thanks to the 2x growth strategy, the total time for N consecutive `push_back` calls is O(N), amortized O(1). +The target address of `memcpy` is `data + size * elem_size`—skipping all existing elements to arrive at the first empty slot. Thanks to the 2x growth strategy, the total time for N consecutive `push_back`s is O(N), amortizing to O(1). 
-Let's verify the growth behavior: +Let's verify the expansion effect: ```c -DynArray arr; -dynarray_create(&arr, sizeof(int), 4); +struct DynamicArray arr; +array_create(&arr, sizeof(int), 4); // Requested 4, adjusted to 8 for (int i = 0; i < 20; i++) { - dynarray_push_back(&arr, &i); + array_push_back(&arr, &i); + printf("Size: %zu, Cap: %zu\n", arr.size, arr.capacity); } -printf("size=%d, capacity=%d\n", arr.size, arr.capacity); +array_destroy(&arr); ``` +Output: + ```text -size=20, capacity=32 +Size: 1, Cap: 8 +... +Size: 8, Cap: 8 +Size: 9, Cap: 16 <-- Expanded +... +Size: 16, Cap: 16 +Size: 17, Cap: 32 <-- Expanded +... ``` -The initial capacity of 4 is raised to the minimum of 8, and after inserting 20 elements it undergoes two growths: 8 -> 16 -> 32. +The initial capacity of 4 was bumped to 8. After inserting 20 elements, it underwent two expansions: 8 -> 16 -> 32. -### pop_back — Remove from the End +### pop_back — Remove from Tail ```c -DynResult dynarray_pop_back(DynArray *arr) { - if (!arr) return DYN_ERR_NULL_PTR; - if (arr->size == 0) return DYN_ERR_OUT_OF_RANGE; +ArrayResult array_pop_back(struct DynamicArray* arr) { + if (arr->size == 0) return ARR_ERR_INVALID_ARG; arr->size--; - return DYN_OK; + return ARR_OK; } ``` -The "deleted" element still lies in memory, and will be overwritten on the next `push_back`. +The "deleted" element remains in memory and will be overwritten by the next `push_back`. > ⚠️ **Pitfall Warning** -> We don't trigger shrinkage after `pop_back` — if you `push_back` again right after `pop_back`, the shrinkage was done for nothing. Shrinkage should be explicitly called by the user via `shrink_to_fit`. `std::vector::pop_back` follows the same design. +> We do not trigger shrinking after `pop_back`—if we `push_back` right after `pop_back`, the shrink was wasted. Shrinking should be explicitly called by the user via `shrink_to_fit`. `std::vector` follows the same design. 
### insert and erase — Middle Insertion and Deletion -`insert` needs to shift all elements after the insertion point back by one position, while `erase` shifts them forward by one to overwrite the deleted element. Both must use `memmove` instead of `memcpy` — because the source and destination memory regions overlap, `memcpy`'s behavior in overlapping situations is undefined. +`insert` needs to shift elements after the insertion position back by one, while `erase` shifts them forward by one to overwrite the deleted element. Both must use `memmove` rather than `memcpy`—because the source and destination memory regions overlap, and `memcpy`'s behavior is undefined in cases of overlap. ```c -DynResult dynarray_insert(DynArray *arr, int index, const void *elem) { - if (!arr || !elem) return DYN_ERR_NULL_PTR; - if (index < 0 || index > arr->size) return DYN_ERR_OUT_OF_RANGE; +ArrayResult array_insert(struct DynamicArray* arr, size_t index, const void* value) { + if (index > arr->size) return ARR_ERR_OUT_OF_BOUNDS; if (arr->size == arr->capacity) { - DynResult r = dynarray_grow(arr); - if (r != DYN_OK) return r; + ArrayResult res = array_reserve(arr, arr->capacity * 2); + if (res != ARR_OK) return res; } - // 把 index 之后的元素后移一位 - void *src = (char *)arr->data + index * arr->elem_size; - void *dst = (char *)arr->data + (index + 1) * arr->elem_size; - size_t move_size = (arr->size - index) * arr->elem_size; - memmove(dst, src, move_size); + void* target = (char*)arr->data + index * arr->elem_size; + void* src = (char*)arr->data + (index + 1) * arr->elem_size; + size_t count = (arr->size - index) * arr->elem_size; - // 在 index 位置写入新元素 - memcpy(src, elem, arr->elem_size); + memmove(src, target, count); // Shift elements back + memcpy(target, value, arr->elem_size); // Write new element arr->size++; - return DYN_OK; + return ARR_OK; } -DynResult dynarray_erase(DynArray *arr, int index) { - if (!arr) return DYN_ERR_NULL_PTR; - if (index < 0 || index >= arr->size) return 
DYN_ERR_OUT_OF_RANGE; +ArrayResult array_erase(struct DynamicArray* arr, size_t index) { + if (index >= arr->size) return ARR_ERR_OUT_OF_BOUNDS; - // 把 index 之后的元素前移一位 - void *dst = (char *)arr->data + index * arr->elem_size; - void *src = (char *)arr->data + (index + 1) * arr->elem_size; - size_t move_size = (arr->size - index - 1) * arr->elem_size; - memmove(dst, src, move_size); + void* target = (char*)arr->data + index * arr->elem_size; + void* src = (char*)arr->data + (index + 1) * arr->elem_size; + size_t count = (arr->size - index - 1) * arr->elem_size; + memmove(target, src, count); // Shift elements forward arr->size--; - return DYN_OK; + return ARR_OK; } ``` -Verifying insert and erase: +Verify insert and erase: ```c int val = 99; -dynarray_insert(&arr, 2, &val); // 在下标 2 处插入 99 -dynarray_erase(&arr, 5); // 删除下标 5 的元素 -``` - -```text -insert OK: size=21, capacity=32 -erase OK: size=20, capacity=32 +array_insert(&arr, 2, &val); // Insert 99 at index 2 +array_erase(&arr, 0); // Remove element at index 0 ``` -`std::vector::insert` has had an rvalue reference overload since C++11, accepting move semantics to avoid deep copies. Our C version can only do a shallow copy via `memcpy` — if elements contain dynamically allocated memory (such as strings pointing to `malloc`-allocated buffers), shallow copies will lead to double free crashes. This is a fundamental limitation of generic programming in C. +`std::vector::insert` has an rvalue reference overload in C++11, allowing move semantics to avoid deep copies. Our C version can only do shallow copies via `memcpy`—if an element contains dynamically allocated memory (like a string pointing to `malloc`'d memory), a shallow copy leads to double free crashes. This is a fundamental limitation of generic programming in C. 
## Step 7 — Implement Traversal and Search -### Traversal — The Callback Function Pattern +### Traversal — Callback Function Pattern -The container's interior is `void *`, so it doesn't know the element type. Therefore, "how to process each element" needs to be told to the container by the caller via a callback function — a form of "inversion of control": +The container internals are `void*`, so it doesn't know the element type. Thus, "how to process each element" needs to be told to the container by the caller via a callback function—a form of "Inversion of Control": ```c -typedef void (*DynArrayVisitor)(void *elem, void *context); +typedef void (*ElementCallback)(void* element, void* user_data); -void dynarray_foreach(DynArray *arr, DynArrayVisitor visitor, void *context) { - if (!arr || !visitor) return; - for (int i = 0; i < arr->size; i++) { - void *elem = (char *)arr->data + i * arr->elem_size; - visitor(elem, context); +void array_foreach(struct DynamicArray* arr, ElementCallback func, void* user_data) { + for (size_t i = 0; i < arr->size; i++) { + func((char*)arr->data + i * arr->elem_size, user_data); } } ``` +Usage: + ```c -void print_int(void *elem, void *ctx) { - (void)ctx; - int val; - memcpy(&val, elem, sizeof(int)); - printf("%d ", val); +void print_int(void* elem, void* user_data) { + (void)user_data; // Unused + printf("%d ", *(int*)elem); } -// 使用 -dynarray_foreach(&arr, print_int, NULL); -printf("\n"); -``` - -```text -0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 +array_foreach(&arr, print_int, NULL); ``` -The callback function pattern is used extensively in the C standard library — `qsort`'s comparison function and `bsearch` both follow this pattern. +The callback function pattern is widely used in the C standard library—the comparison functions passed to `qsort` and `bsearch` both follow this pattern. 
### Search — Linear Search "Comparing for equality" also needs to be provided by the caller: ```c -typedef bool (*DynArrayEqual)(const void *elem, const void *target); - -int dynarray_find(DynArray *arr, const void *target, DynArrayEqual equal) { - if (!arr || !target || !equal) return -1; - for (int i = 0; i < arr->size; i++) { - void *elem = (char *)arr->data + i * arr->elem_size; - if (equal(elem, target)) return i; +typedef bool (*EqualPredicate)(const void* elem, void* user_data); + +ArrayResult array_find(const struct DynamicArray* arr, EqualPredicate pred, void* user_data, size_t* out_index) { + for (size_t i = 0; i < arr->size; i++) { + if (pred((char*)arr->data + i * arr->elem_size, user_data)) { + *out_index = i; + return ARR_OK; + } } - return -1; + return ARR_ERR_NOT_FOUND; } ``` -Time complexity is O(N). If you need faster lookups, you can sort first and then use binary search. C++'s `std::find` uses iterators paired with lambda expressions, which is far more elegant to write than callback functions; C++20's Ranges turn traversal, filtering, and transformation into chained calls. +Time complexity is O(N). If you need it faster, you can sort first and then use binary search. C++'s `std::find_if` uses iterators combined with lambda expressions, which is much more elegant to write than callback functions; C++20 Ranges turns traversal, filtering, and transformation into chained calls. -## C++ Comparison: std::vector's Design Trade-offs +## C++ Comparison: Design Trade-offs in std::vector -At this point we've hand-rolled a complete dynamic array library. Looking back and systematically comparing with `std::vector`, understanding these design trade-offs is far more important than memorizing APIs. +At this point, we have hand-rolled a complete dynamic array library. Looking back systematically at `std::vector`, understanding these design trade-offs is far more important than memorizing APIs. 
-Using `void *` for generic programming brought us three problems: no type checking, needing to manually pass `elem_size`, and requiring casts inside callback functions. `std::vector` solves all three perfectly with templates — the compiler determines the type `T` at instantiation time, all type checks are completed at compile time, and `elem_size` is automatically calculated. `std::vector`'s destructor automatically frees the internal array, whether the function returns normally or exits due to an exception — this is the core idea of RAII: binding resource lifetime to object lifetime. C++11's move semantics make `std::vector` moves an O(1) pointer swap, whereas in C we can only `memcpy` the entire data block. +We used `void*` to implement generics, which brought three problems: no type checking, manual passing of `elem_size`, and mandatory type casting in callback functions. `std::vector` uses templates to perfectly solve these three—the compiler determines type `T` upon instantiation, all type checks are completed at compile time, and `sizeof(T)` is calculated automatically. `std::vector`'s destructor automatically releases the internal array, whether the function returns normally or exits via an exception. This is the core idea of RAII—binding resource lifecycle to object lifecycle. C++11's move semantics make `std::vector` return an O(1) pointer swap, whereas in C, you can only `memcpy` the entire block of data. -There are two easily confused functions: `reserve` only changes `capacity` without changing `size`, pre-allocating memory without creating new elements; `resize` changes `size`, with extra positions value-initialized and excess elements destroyed. Our C version only implements `reserve`, leaving `resize` as an exercise. Additionally, `std::vector` uses bit-packing optimization (each element only takes 1 bit), but the trade-off is that you can't take the address of individual elements. 
C++17's `std::span` provides a non-owning view over contiguous memory and is an extremely important composition tool. +There are two easily confused functions: `reserve` only changes `capacity`, not `size`, pre-allocating memory without creating new elements; `resize` changes `size`, filling extra positions with value-initialized values and destructing excess elements. Our C version only implemented `reserve`; `resize` is left as an exercise. Also, `std::vector<bool>` applies a bit-packing optimization (each `bool` takes only 1 bit), but at the cost of not being able to take the address of individual elements. C++20's `std::span` provides a non-owning view of contiguous memory and is a very important composition tool. ## Exercises -The following exercises only provide function signatures and requirement descriptions, with the implementation left blank. +The following exercises provide only function signatures and requirement descriptions. The implementation is left blank. ### Exercise 1: Implement resize -`reserve` only changes capacity without changing size, while `resize` needs to change size. When the new size is greater than the old size, the extra positions should be filled with default values. +`reserve` only changes capacity, not size, while `resize` needs to change size. When the new size is larger than the old size, the extra positions should be filled with a default value. ```c -DynResult dynarray_resize(DynArray *arr, int new_size, const void *default_val); +ArrayResult array_resize(struct DynamicArray* arr, size_t new_size, const void* default_value); ``` ### Exercise 2: Implement filter -Given a dynamic array and a filter predicate, return a newly created dynamic array containing only the elements that satisfy the condition. +Given a dynamic array and a filter predicate, return a newly created dynamic array containing only elements that satisfy the condition. 
```c -DynResult dynarray_filter(DynArray *src, DynArray *dst, - bool (*predicate)(const void *elem, void *ctx), - void *ctx); +ArrayResult array_filter(const struct DynamicArray* src, struct DynamicArray* dest, bool (*predicate)(const void* elem)); ``` ### Exercise 3: Implement map transformation @@ -457,9 +444,7 @@ DynResult dynarray_filter(DynArray *src, DynArray *dst, Given a dynamic array and a transformation function, apply the transformation function to each element and store the results in a new array to return. ```c -DynResult dynarray_map(DynArray *src, DynArray *dst, - void (*transform)(void *dst_elem, const void *src_elem, void *ctx), - void *ctx); +ArrayResult array_map(const struct DynamicArray* src, struct DynamicArray* dest, void (*transform)(void* out_elem, const void* in_elem)); ``` ### Exercise 4: Implement concatenation @@ -467,12 +452,12 @@ DynResult dynarray_map(DynArray *src, DynArray *dst, Concatenate two dynamic arrays of the same type into a new dynamic array. ```c -DynResult dynarray_concat(DynArray *a, DynArray *b, DynArray *out); +ArrayResult array_concat(const struct DynamicArray* a, const struct DynamicArray* b, struct DynamicArray* result); ``` -> **Difficulty Self-Assessment**: If you find the exercises difficult, please review the design思路 of the corresponding sections. Especially for resize — it's essentially a combination of reserve + memset/memcpy. Once you think through which positions need filling and what values to fill them with, the code will naturally follow. +> **Self-Assessment of Difficulty**: If you find the exercises difficult, please review the design ideas in the corresponding sections. Especially `resize`—it is essentially a combination of `reserve` + `memset`/`memcpy`. Once you figure out which positions need filling and what values to fill, the code will come naturally. 
-## References +## Reference Resources - [cppreference: std::vector](https://en.cppreference.com/w/cpp/container/vector) - [cppreference: realloc](https://en.cppreference.com/w/c/memory/realloc) diff --git a/documents/en/vol10-open-lecture-notes/cppcon/2025/01-concept-based-generic-programming/01-type-safety-and-number-concept.md b/documents/en/vol10-open-lecture-notes/cppcon/2025/01-concept-based-generic-programming/01-type-safety-and-number-concept.md index 9a55480dc..a42032b35 100644 --- a/documents/en/vol10-open-lecture-notes/cppcon/2025/01-concept-based-generic-programming/01-type-safety-and-number-concept.md +++ b/documents/en/vol10-open-lecture-notes/cppcon/2025/01-concept-based-generic-programming/01-type-safety-and-number-concept.md @@ -1,7 +1,7 @@ --- title: Type Safety, Number Constraints, and Bounds Checking -description: CppCon 2025 talk notes — from implicit narrowing conversions to the `Number` - wrapper type, then to `safe_int` and `checked_span` +description: CppCon 2025 Talk Notes — From implicit narrowing conversions to Number + wrappers, and then to safe_int and checked_span conference: cppcon conference_year: 2025 talk_title: Concept-based Generic Programming @@ -21,24 +21,24 @@ chapter: 1 order: 1 translation: source: documents/vol10-open-lecture-notes/cppcon/2025/01-concept-based-generic-programming/01-type-safety-and-number-concept.md - source_hash: 707b662969a6dfc9b5be3d5075758f4a2e72ec44b6ac4d8a7e81ec88dd260f19 - translated_at: '2026-05-26T11:04:45.373800+00:00' + source_hash: 1aad64ff7c3d5c3b94fb383a5778e0a13f491a51c687b3e2836c07f1ad7a9ceb + translated_at: '2026-06-13T11:45:25.430298+00:00' engine: anthropic - token_count: 8899 + token_count: 8929 --- # From Manual Checks to Implicit Guards :::tip -As a side note, this section is an extended exploration based on a CppCon talk. The link above points to their video series on YouTube; users in China can watch via the Bilibili link. 
+A quick note: this section is an expansion based on CppCon talks. The links above point to their video series on YouTube; users in China can watch via the Bilibili links. ::: -C++ generic programming traces back to 1991 when templates were introduced to the language (C++ Release 3.0). Stroustrup's primary motivation for designing templates was to replace C preprocessor macros with type-safe generic containers. In *The Design and Evolution of C++*, he wrote that macros "fail to obey scope and type rules and don't interact well with tools," while templates were designed to be "as efficient as macros" but type-safe. +Generic programming in C++ dates back to 1991 when templates were introduced to the language (C++ Release 3.0). Stroustrup's primary motivation for designing templates was to replace C preprocessor macros to implement type-safe generic containers. In *The Design and Evolution of C++*, he wrote that macros "fail to obey scope and type rules and don't interact well with tools," whereas templates were designed to be "as efficient as macros" but type safe. -But the story took an unexpected turn in 1994. Erwin Unruh presented a perfectly legal C++ program at a C++ committee meeting—one that wouldn't even compile, yet caused the compiler to output a sequence of prime numbers line by line in its error messages. The entire committee suddenly realized that templates had inadvertently formed a Turing-complete compile-time computation system. The following year, Todd Veldhuizen published a paper systematically describing this technique and named it **Template Metaprogramming**. Templates thus evolved from a "type-safe macro replacement" into an indispensable compile-time abstraction mechanism in C++. +But the story took an unexpected turn in 1994. Erwin Unruh presented a legal C++ program at a C++ committee meeting that wouldn't even compile, yet the compiler output a sequence of prime numbers line by line in the error messages. 
The entire committee then realized that templates had inadvertently constituted a Turing-complete compile-time computation system. The following year, Todd Veldhuizen published a paper systematically describing this technique, naming it **Template Metaprogramming**. Thus, templates evolved from a "type-safe macro replacement" to an indispensable compile-time abstraction mechanism in C++. -Template error messages routinely span hundreds of lines and are notoriously unreadable—this is why many C++ developers shy away from generic programming. But as project scale grows, code without generics becomes so repetitive that it's unmaintainable. In this article, we start from the foundational motivation of generic programming and work our way to a concrete, actionable type safety problem—implicit narrowing conversions. +Template error messages often span hundreds of lines and are notoriously unreadable—this is why many C++ developers shy away from generic programming. However, as project scale grows, code without generics becomes too repetitive to maintain. In this article, we start from the basic motivations of generic programming and arrive at a concrete, actionable type safety issue—implicit narrowing conversion. -The experimental environment for this article is Arch Linux WSL, GCC 16.1.1. Here is the environment info: +The experimental environment for this article is Arch Linux WSL, GCC 16.1.1. Here is the environment information: ```bash ❯ gcc -v @@ -56,21 +56,21 @@ Linux Charliechen 6.6.114.1-microsoft-standard-WSL2 #1 SMP PREEMPT_DYNAMIC Mon D ``` -## First, Let's Clarify What Generic Programming Actually Aims to Do +## First, let's clarify what generic programming aims to do -The effect of generic programming is to make code more generic and more abstract—this is only half true. 
Alex Stepanov (the father of the STL) points out that the goal of generic programming is "to express ideas in the most generic, most efficient, and most flexible way," and the key is expressing ideas, not abstracting for abstraction's sake. Treating the means as the end is a common pitfall in programming—another typical example is the abuse of design patterns. +The effect of generic programming is to make code more general and more abstract—this is only half right. Alex Stepanov (father of the STL) pointed out that the goal of generic programming is to "express ideas in the most general, most efficient, and most flexible way." The key is expressing ideas, not abstraction for abstraction's sake. Treating means as ends is a common pitfall in programming—another typical example is the abuse of design patterns. -This distinction matters. We don't start from some abstract model to design code; rather, we start from concrete, efficient algorithms, discover the commonalities within them, and then extract those commonalities. And we can't sacrifice performance, because a large part of C++'s reason for existing lies right there. As hardware gets faster, our expectations for software are growing just as rapidly, while semiconductor processes seem to have hit a bottleneck. The room for writing careless code is shrinking. +This distinction is important. We don't design code starting from an abstract model; instead, we start from concrete, efficient algorithms, discover commonalities, and then extract them. Moreover, performance cannot be sacrificed, as a significant part of C++'s existence relies on this. As hardware gets stronger, our expectations for software expand rapidly, yet semiconductor processes seem to have hit a bottleneck, leaving less and less room for sloppy code. -Generic programming demands more from us: it requires us to discern reusable patterns within abstract domains. 
And its bottom line is—after abstraction, performance must not be worse than a hand-written concrete version. Otherwise, there's no point in introducing generic programming. Writing code itself belongs to the "getting work done" layer of the needs hierarchy; we don't do extra things. If something won't be reused and is performance-sensitive, don't introduce generics there. +Generic programming demands more from us: it requires us to perceive reusable patterns in abstract domains. Its bottom line is—after abstraction, performance must not be worse than a hand-written concrete version. Otherwise, there is no point in introducing generic programming. Writing code itself belongs to the "getting the job done" layer of the needs hierarchy; do not do extra work. If a certain part won't be reused and is sensitive to performance, don't introduce generics. -## Alex Stepanov's Design Criteria for C++ +## Alex Stepanov's C++ Design Standards -Around 1994, Stepanov proposed three design criteria: first, generality—good generic components should be able to express use cases that even their designers hadn't thought of; second, uncompromising efficiency—when writing system-level code in C++, efficiency should match C, and when writing linear algebra, it should match Fortran; third, statically typed interfaces—checked at compile time, not leaving errors for runtime. Later he added two very down-to-earth requirements: compile time shouldn't be so long that you go grab a coffee (header-only libraries make this hard to guarantee), and the learning curve shouldn't be so steep that you need an MIT PhD to get started—as for whether C++ has actually achieved this, we all know the answer. 
+Around 1994, Stepanov proposed three design standards: first is generality, where good generic components should express usages even the designer hadn't thought of; second is uncompromised efficiency, where writing system-level code in C++ should match C, and writing linear algebra should match Fortran; third is statically typed interfaces, where checks happen at compile time, not leaving errors to runtime. Later, he added two very practical requirements: compile time shouldn't be so long that one goes for coffee (header-only libraries find this hard to guarantee), and the learning curve shouldn't be so steep that it requires an MIT PhD to get started—as for whether C++ achieved this, we all have our own thoughts. -## Implicit Narrowing Conversions: A Classic Type Safety Trap +## Implicit Narrowing Conversion: A Classic Type Safety Trap -That covers the motivation. Let's start with a concrete problem. The introduction of a concept must have a corresponding problem scenario, otherwise it's a castle in the air. Look at this code: +With the motivation covered, let's start with a specific problem. The introduction of a concept must have a corresponding problem scenario, otherwise it's a castle in the air. Look at this code: ```cpp #include @@ -92,19 +92,19 @@ int main() { } ``` -This code uses pre-C++23 syntax to ensure it compiles directly on all compilers. +This code uses pre-C++23 syntax to ensure all compilers can compile it directly. -On my machine, the result is `overflow = -25536`, `int_pi = 3`. The compiler doesn't produce a single warning (unless you enable `-Wall -Wextra`, but many projects don't). This kind of bug is particularly insidious: the code runs, the results are just wrong, and it often doesn't surface with small data volumes—only blowing up after going to production. +On my machine, the result is `overflow = -25536`, `int_pi = 3`. The compiler doesn't give a single warning (unless you enable `-Wall -Wextra`, but many projects don't). 
This kind of bug is particularly insidious: the code runs, but the result is wrong, and it often doesn't reveal itself with small data sets, only surfacing after deployment. -Many people think "this is just a C++ feature, just be careful." But relying on human carefulness for this kind of thing is unreliable. Bjarne Stroustrup himself has said that he wanted to fix this problem back then but couldn't, and the C camp wouldn't allow changes either. So as users, can we guard against it ourselves? +Many people think "this is just a C++ feature, just be careful." But relying on human diligence is unreliable. Bjarne Stroustrup himself said he wanted to solve this problem back then but couldn't, and the C language camp wouldn't budge. So as users, can we defend against it ourselves? -## Modeling "Numbers" with C++20 Concepts +## Using C++20 Concepts to Model "Numbers" -C++20 gives us a new weapon: concepts. Its essence is simple—a concept is a compile-time evaluated Boolean predicate that takes a type as input and outputs true or false. In other words: it lets the compiler understand a "concept" without us having to describe it in complex natural language. +C++20 gives us a new weapon: concepts. Its essence is simple—a concept is a compile-time evaluated boolean predicate, taking a type as input and outputting true or false. Put another way: it lets the compiler understand a "concept" without us needing to describe it in complex natural language. -The standard library already defines some basic concepts, such as `std::integral` and `std::floating_point`, which determine whether a type is an integer type or a floating-point type. These aren't new inventions—the first edition of K&R C was already distinguishing int and float, except now we have a language-level, compile-time queryable representation. 
+The standard library already defines some basic concepts, such as `std::integral` and `std::floating_point`, which judge whether a type is an integer type or a floating-point type. These aren't new inventions; the first edition of K&R C distinguished int and float, but now we have a language-level, compile-time queryable representation. -Let's first write the simplest concept to express the idea of a "number": +Let's first write a simple concept to express the idea of a "number": ```cpp #include @@ -121,17 +121,17 @@ static_assert(number, "char 也是整数类型,所以是 number"); static_assert(!number, "string 不是 number"); ``` -There's a syntax detail worth explaining here: `std::integral` looks like a function call, but it isn't. `std::integral` is a concept, `` instantiates it with type T, and the value of the entire expression is a compile-time bool. You can't write `std::integral(T)`—that syntax is wrong. Just understand it as "run the integral test on T," returning true or false. +There is a syntactic detail worth explaining here: `std::integral<T>` looks like a function call, but it isn't. `std::integral` is a concept, `<T>` instantiates it with type T, and the value of the entire expression is a compile-time bool. You cannot write `std::integral(T)`; that syntax is wrong. Just understand it as "perform the integral test on T", returning true or false. -Run the code above, and all four `static_assert` assertions pass, showing that our `number` concept basically works. +Running the code above, all four `static_assert` assertions pass, indicating our `number` concept basically works. -## Writing a narrowing Check Ourselves +## Writing a narrowing Judgment by Hand -Can we write a concept that determines "when assigning a value of type U to type T, will a narrowing conversion occur"? Since we're writing this article, let's give it a shot. +Can we write a concept to judge "when assigning a value of type U to type T, will a narrowing conversion occur"? Since we're writing this article, let's give it a try.
-First, if T's representable range is smaller than U's, then narrowing is obviously possible. For example, assigning `int` to `short`—`int` can represent far more values than `short`. But how do we determine "smaller range"? The C++ standard library doesn't directly give us a "range of a type" concept, but `` has `std::numeric_limits`, where we can look up the min and max of various types. If U is floating-point and T is an integer, the fractional part will definitely be lost, which is also narrowing. +First, if T's representable range is smaller than U's, narrowing is obviously possible. For example, when assigning `int` to `short`, `int` can represent many more values than `short`. But how do we judge "smaller range"? The C++ standard library doesn't directly give us a concept like "range of a type", but `<limits>` has `std::numeric_limits`, which can query the min and max of various types. If U is floating-point and T is an integer, the fractional part will definitely be lost, which is also narrowing. -There's another easily overlooked case: U and T are both integers, the same size (say both 32-bit), but one is signed and the other is unsigned—then assigning a negative number to an unsigned type will also cause problems. Let's write these rules as code: +There is another easily overlooked situation: U and T are both integers, the size is the same (e.g., both 32-bit), but one is signed and the other is unsigned. Assigning a negative number to an unsigned type will cause problems. Let's write these rules as code: ```cpp #include @@ -172,35 +172,35 @@ static_assert(!narrowing_assign, "float -> double 不是窄化"); static_assert(!narrowing_assign, "int -> int 不是窄化"); ``` -Compile and run it, and all six `static_assert` assertions pass.
We can use the last `!narrowing_assign` to verify the logic: for same-type assignment, in case 1's `smaller_range`, `max() < max()` is false and `min() > min()` is also false, so it doesn't trigger; case 2 requires U to be floating-point and T to be an integer, which isn't satisfied; case 3 requires different signedness, and `int` and `int` are obviously the same. All three branches are false, the whole thing is false, and after negation `static_assert` passes—this perfectly matches our intuition that "same-type assignment doesn't narrow." +Compile and run it, and all six `static_assert` assertions pass. We can use the last `!narrowing_assign` to verify the logic: for same-type assignment, in case 1's `smaller_range`, `max() < max()` is false and `min() > min()` is also false, so it doesn't trigger; case 2 requires U to be floating-point and T to be an integer, which isn't satisfied; case 3 requires different signedness, and `int` and `int` are obviously the same. All three branches are false, the whole thing is false, and after negation `static_assert` passes—this matches our intuition that "same-type assignment isn't narrowing". -One more thing worth mentioning: where `narrowing_assign` mixes `&&` and `||`, parentheses are mandatory. +Another point worth mentioning: where `&&` and `||` are mixed in `narrowing_assign`, parentheses must be added.
Because `&&` has higher precedence than `||`, without parentheses, `number && number` would only constrain the first `||` branch, and the latter two branches might be evaluated on non-number types—although the result happens to be correct for current test cases, semantically it's wrong. Adding parentheses makes the three branches a whole, then uniformly constrained by `number && number`, making the logic rigorous. -## Some Edge Cases to Think Through +## Some Edge Cases Need to Be Clear -The implementation above covers most scenarios, but there are some details worth discussing. For example, conversions between floating-point types: does `double` to `float` count as narrowing? From a precision perspective, of course it does, because `double` can represent more significant digits than `float`. But in the current implementation, `smaller_range` will evaluate `numeric_limits::max() < numeric_limits::max()` as true, so it will be correctly identified as narrowing. +The implementation above covers most scenarios, but there are details worth mentioning. For example, conversion between floating-point numbers: `double` to `float`, does it count as narrowing? From a precision perspective, of course, because `double` can represent more significant digits than `float`. But in the current implementation, `smaller_range` will judge `numeric_limits::max() < numeric_limits::max()`, which is true, so it will be correctly identified as narrowing. -Another case is `char` to `unsigned char`. The signedness of `char` is implementation-defined (signed on some platforms, unsigned on others). If `char` is signed on your platform, then `signed_integral != signed_integral` is true, and it will be identified as narrowing. This is actually reasonable, because if `char` is -1, assigning it to `unsigned char` would become 255. +Another example is `char` to `unsigned char`. The signedness of `char` is implementation-defined (signed on some platforms, unsigned on others). 
If `char` is signed on the platform, then `signed_integral != signed_integral` is true, and it will be identified as narrowing. This is actually reasonable, because if `char` is -1, assigning it to `unsigned char` becomes 255. -Note, however, that this implementation isn't 100% rigorous. The standard's definition of narrowing conversion (in C++11's list initialization rules) is more nuanced than what's written here—for instance, it also considers whether a floating-point-to-integer value falls within the integer's range. But as a starting point, this concept can already help us avoid most pitfalls. We can refine it gradually over time. +However, note that this implementation isn't 100% rigorous. The standard's definition of narrowing conversion (in the C++11 list initialization rules) is more detailed than what's written here, for example, considering whether the value is within the integer range when converting from floating-point to integer. But as a starting point, this concept can already block most pitfalls. We can improve it gradually. -At this point, we can summarize one thing: concepts aren't some mysterious metaprogramming technique—they're simply a mechanism for "writing constraints on types as compile-time checkable Boolean expressions." In the past, when writing templates, constraints relied entirely on documentation and naming conventions (like "please pass a random-access iterator"), and the compiler didn't care—if you passed the wrong type, you'd get pages of incomprehensible errors. Now with concepts, the compiler can tell you immediately "the type you passed doesn't satisfy the requirements," and the error messages are actually human-readable. +At this point, we can summarize one thing: concepts aren't some profound metaprogramming trick, they are just a mechanism to "write constraints on types as compile-time checkable boolean expressions". 
Previously, writing templates meant relying on documentation and naming conventions (e.g., "please pass a random access iterator") for constraints, the compiler didn't care, and if you passed the wrong thing, you got a pile of gibberish. Now with concepts, the compiler can tell you "the type you passed doesn't meet the requirements" immediately, and the error message is human-readable. -The next step is to use this `narrowing_assign` concept in actual functions to create a safe assignment wrapper—that's the content of the next section. At the very least, the core idea of "using concepts to express type constraints" is now clear. +The next step is to apply this `narrowing_assign` concept to actual functions to make a safe assignment wrapper—that's the content of the next section. At least the core idea of "using concepts to express type constraints" is sorted out here. --- -# From Manual Checks to Implicit Guards: Baking Narrowing Conversion Checks into Types +# From Manual Checks to Implicit Guards: Stuffing Narrowing Checks into Types -In the previous section, we figured out the rules for determining narrowing conversions. If you had to run through those rules in your head every time you write code, it would be practically impossible—when signed and unsigned are mixed, which one is bigger, will it overflow, can the positive part be represented, just thinking about these is enough to make your head spin. The speaker said that writing this out manually takes about a page of code, and it's messy and tricky. +In the previous section, we figured out the rules for judging narrowing conversion. It's almost impossible to run those rules through your head every time you write code—when signed and unsigned are mixed, which one is bigger, will it overflow, can the positive part be represented, just thinking about these is dizzying. The speaker said writing this out by hand takes about a page of paper, and it's messy and tricky. 
-So what this section needs to do is: turn that page of messy logic into actually runnable code, and then hide it away so that when you write code day to day, you don't even notice its existence. +So the task for this section is: turn that page of messy logic into real running code, and then hide it so you don't feel its existence when writing code normally. ## First, Translate the Judgment Logic into Code -One intuition is: to determine whether assigning a value from type U to type T will cause narrowing, just use a `static_cast` and compare. But think carefully—that's not how it works at all—when signed and unsigned are mixed, the comparison itself has traps. So we need an honest, step-by-step function. +An intuition is: to judge whether assigning a value from type U to type T will cause narrowing, just use a `static_cast` and compare. But thinking carefully, it's not that simple at all—when signed and unsigned are mixed, the comparison itself has traps. So we need an honest, step-by-step function. -The idea is: do as much elimination work as possible at compile time, filtering out the cases where narrowing "absolutely cannot happen," leaving only the paths that truly need runtime checks. This is actually what generic programming has always emphasized—don't do work at runtime that shouldn't be done there. +The idea is: do as much exclusion work as possible at compile time, filtering out those situations where "narrowing absolutely cannot happen", leaving only the paths that truly need runtime checks. This is actually what generic programming emphasizes—don't do work at runtime that shouldn't be done. ```cpp #include @@ -276,13 +276,13 @@ constexpr bool would_narrow(U u) noexcept { } ``` -Looking back after writing this function, when signed and unsigned are mixed, how much can be eliminated at compile time and how much must be checked at runtime—that boundary really does require careful thought. 
There's an easy trap to fall into: simply using round-trip (convert there and back) to detect narrowing fails on signed→unsigned conversions—because `int(-1) → unsigned(4294967295) → int(-1)` is perfectly reversible in two's complement, so round-trip detection won't catch it. So you must explicitly check "is the source value negative" before the round-trip. `if constexpr` plays a key role here—branches that can be determined at compile time won't generate any code at all, so there won't be a bunch of useless comparison instructions. +Looking back at this function, the boundary between how much can be excluded at compile time and how much must be checked at runtime when signed and unsigned are mixed really needs careful thought. There's a pitfall easy to step into: simply using round-trip (convert there and back) to detect narrowing fails during signed→unsigned conversion—because `int(-1) → unsigned(4294967295) → int(-1)` is completely reversible in two's complement, round-trip can't detect it. So you must explicitly check "is the source value negative" before the round-trip. `if constexpr` plays a key role here—branches determined at compile time won't generate code at all, so there won't be a bunch of useless comparison instructions. -## What to Do When Narrowing Occurs? Throw an Exception +## What to do when narrowing happens? Throw an Exception -We have the judgment logic. Next, we need to decide: how do we handle it when narrowing is detected? +With the judgment logic, the next decision is: how to handle it after detecting narrowing? -The speaker's approach is very direct—throw an exception. After compile-time filtering, the probability of narrowing actually triggering at runtime is extremely low. In most code, types match and are eliminated at compile time; among the remaining cases that need runtime checks, the vast majority won't actually overflow. 
It might trigger once in a million calls, and this is exactly the scenario where exceptions excel—handling extremely rare exceptional situations. +The speaker's solution is very direct—throw an exception. After compile-time filtering, the probability of narrowing actually triggering at runtime is extremely low. In most code, types match, and they are excluded at compile time; for those remaining that need runtime checks, the vast majority won't actually overflow. Maybe it triggers once in a million calls, which is exactly the scenario exceptions excel at—handling extremely rare exceptional situations. ```cpp template @@ -336,11 +336,11 @@ Run it and see the output: a = 42, b = 100 ``` -Great, everything that should be caught is caught. But here's the problem—you can't write `narrow_convert(xxx)` at every assignment site. The code would become verbose, and there's no way to maintain consistency. Relying on programmers to diligently add checks will inevitably lead to missed cases. Some places will have them, some will be forgotten, and then bugs hide in those forgotten places. +Great, everything that should be blocked was blocked. But the problem arises—you can't write `narrow_convert(xxx)` at every assignment location. The code becomes verbose, and it's completely impossible to maintain consistency. Relying on programmers to consciously add checks will inevitably lead to missed cases. Some places have them, some are forgotten, and bugs hide in those forgotten places. -## Baking the Check into Types: Number +## Stuffing the Check into the Type: Number -So the real solution is—make the check implicit.
Define a wrapper type `Number` that automatically performs narrowing checks when constructed. After that, this `Number` is used just like a normal `T`, but you don't worry about narrowing problems, because if the construction doesn't pass, this object doesn't exist at all. ```cpp template @@ -363,7 +363,7 @@ public: }; ``` -You see, the class itself is just this much. It looks like demo code, but it actually works. Let's try it: +You see, this class itself has just this much stuff. It looks like demo code, but it really works. Let's try: ```cpp int main() { @@ -401,13 +401,13 @@ sum = 142 捕获到: narrowing conversion detected ``` -At this point, a key design idea becomes clear: we used to think of template metaprogramming and the type system as two separate things, but in reality, the type system itself is the best place to do checking. You don't need to remember where to check and where not to—just use `Number` instead of `T`, and the check happens automatically. And because of the compile-time `if constexpr` branches, paths that don't need checking (like same-type assignment) won't even generate judgment code—zero overhead. +At this point, a key design idea emerges: we used to think template metaprogramming and type systems were different things, but in fact, the type system itself is the best place to do checks. No need to remember where to check and where not to, just use `Number` instead of `T`, and the check happens automatically. And because of the compile-time `if constexpr` branch, paths that don't need checking (like same-type assignment) won't even generate judgment code—zero overhead. -## But Construction Alone Isn't Enough; We Need Arithmetic +## But Being Able to Construct Isn't Enough, It Needs Arithmetic -If a numeric type can only be constructed but can't do arithmetic, how is it different from a constant? So we need to add arithmetic operators to `Number`. But there's a question: what should `Number` plus `Number` return? 
You can't just return some arbitrary type; there needs to be a rule. +If a numeric type can only be constructed but not calculated, what's the difference between it and a constant? So we need to add arithmetic operators to `Number`. But there's a problem here: `Number` plus `Number` should return what? You can't just return a type, you need rules. -There's something in the standard library called `std::common_type` that does exactly this—given two types, it tells you what type to use for their arithmetic result. For example, `common_type_t` is `double`, and `common_type_t` is `unsigned int` on most platforms. Let's just use it: +There's a thing in the standard library called `std::common_type`, which does exactly this—given two types, telling you what type to use when doing arithmetic operations on them. For example, `common_type_t` is `double`, `common_type_t` is `unsigned int` on most platforms. We use it directly: ```cpp #include @@ -496,8 +496,8 @@ Output: 加法溢出捕获到: narrowing conversion detected ``` -:::warning Original text error correction: unsigned arithmetic overflow is not detected by narrow_convert -In the output above, the last line "addition overflow caught" will **not appear** in actual compilation and execution. Actual test results (GCC 16.1.1, C++20): +:::warning Original Text Error Correction: unsigned arithmetic overflow won't be detected by narrow_convert +In the output above, the last line "addition overflow caught" will **not appear** in actual compilation and running. Actual test result (GCC 16.1.1, C++20): ```text Raw unsigned sum: 705032704 @@ -505,9 +505,9 @@ Would narrow? 0 No exception thrown! overflow = 705032704 ``` -The reason is: arithmetic on `unsigned int + unsigned int` in C++ is **wrapping** (well-defined wrapping), and the result of `3000000000u + 2000000000u` is `705032704`—a legal `unsigned int` value. 
Subsequently, `narrow_convert(705032704u)` detects a same-type assignment, and `would_narrow` directly returns false, so the exception is never thrown. +The reason is: arithmetic operations of `unsigned int + unsigned int` in C++ are **wrapping** (well-defined wrapping), the result of `3000000000u + 2000000000u` is `705032704`—a legal `unsigned int` value. Subsequently, `narrow_convert(705032704u)` detects same-type assignment, `would_narrow` directly returns false, and the exception isn't thrown at all. -This is a fundamental limitation of the current design of `Number`: `narrow_convert` can only detect **narrowing conversions during assignment**, not **overflow of arithmetic operations themselves**. To detect overflow, you need to use compiler built-in functions (such as `__builtin_add_overflow`) or manual checks: +This is a fundamental limitation of the current `Number` design: `narrow_convert` can only detect **narrowing conversions during assignment**, not **overflow of the arithmetic operation itself**. To detect overflow, you need to use compiler built-ins (like `__builtin_add_overflow`) or manual checks: ```cpp template @@ -528,22 +528,22 @@ constexpr T safe_add(T a, T b) { } ``` -See `code/volumn_codes/vol10/cppcon/2025/01-concept-based-generic-programming/01-06-overflow-not-caught.cpp` for verification code. +See verification code in [01-06-overflow-not-caught.cpp](https://github.com/Awesome-Embedded-Learning-Studio/Tutorial_AwesomeModernCPP/blob/main/code/volumn_codes/vol10/cppcon/2025/01-concept-based-generic-programming/01-06-overflow-not-caught.cpp). ::: -Looking at the last overflow detection example—we need to note that `narrow_convert` can only intercept **narrowing during type conversions**. For overflow of same-type arithmetic operations themselves (like the wrapping of `unsigned int + unsigned int`), it's powerless. 
`common_type_t` is `unsigned int` itself, and the operation result has already wrapped into a legal value before being assigned to `Number`. To fully defend against arithmetic overflow, additional mechanisms are needed (like compiler built-in overflow checking functions), which is beyond the scope of `narrow_convert`'s responsibilities. +Looking at the last overflow capture example—we need to note that `narrow_convert` can only intercept narrowing **during type conversion**, it is powerless against overflow of the same-type arithmetic operation itself (like the wrapping of `unsigned int + unsigned int`). `common_type_t` is just `unsigned int` itself, the operation result has already wrapped into a legal value before being assigned to `Number`. To fully defend against arithmetic overflow, additional mechanisms are needed (like compiler built-in overflow check functions), which is outside the scope of `narrow_convert`. -At this point, from manual judgment rules, to runtime check functions, to exception handling strategies, to wrapper types and arithmetic operations, this thread is finally connected. The key is to understand these things as a complete narrowing defense system, not as isolated knowledge points. +At this point, from manual judgment rules, to runtime check functions, to exception handling strategies, to wrapper types and arithmetic operations, this line is finally connected. The key is to understand these things as a complete narrowing defense system, not isolated knowledge points. --- -# No Need to Reinvent the Wheel: Standard Library Function Objects + Eliminating Comparison Traps +# Don't Reinvent the Wheel: Standard Library Function Objects + Eliminating Comparison Traps -To implement a safe integer type, the intuitive approach is to hand-write all the addition, subtraction, multiplication, division, and comparison operators—just thinking about it is exhausting. 
But in reality, the standard library has long had `std::plus`, `std::multiplies`, and other function objects ready to go, each just a few lines of code, not some kind of black magic at all. Of course, reinventing the wheel is a traditional C++ art form. +To implement a safe integer type, intuitively you have to write addition, subtraction, multiplication, division, and comparison operations all by hand—just thinking about it is a headache. But actually, the standard library has long prepared `std::plus`, `std::multiplies` and other function objects, each just a few lines of code, not black magic at all. Of course, reinventing the wheel counts as a traditional C++ art form. -## First, Let's See How to Write the Operators +## First, See How to Write Operators -A common misconception is that to overload `operator+` and `operator*` for a custom type, you need to write a bunch of `friend` functions either inside the class or globally, with each function handling various edge cases. But actually, you just need to use the standard library's function objects. +A common misconception is: to overload `operator+`, `operator*` for a custom type, you have to write a bunch of `friend` functions inside or outside the class, handling various boundary conditions in each function. But actually, you just need to use the function objects from the standard library. ```cpp #include @@ -565,11 +565,11 @@ struct safe_int { }; ``` -You'll notice the key point here: `std::plus{}` is a function object, and when you call it, if a type conversion that shouldn't happen occurs (like mixing signed and unsigned), it will be intercepted by the rules we set up earlier. The operation logic itself doesn't need our attention—the standard library has already written it, and we just handle "intercepting" and "letting through." 
+You will find the key here is: `std::plus{}` is a function object, and when you call it, if an inappropriate type conversion happens (like mixing signed and unsigned), it will be blocked by the rules we set up earlier. The operation logic itself doesn't need worry, the standard library has already written it, we just handle "intercept" and "let pass". -## Comparison Operations: The Worst Danger Zone for Signed/Unsigned Mixing +## Comparison Operations: The Heavy Disaster Area for Signed/Unsigned Mixing -Operator overloading itself isn't hard, but comparison operations are the real danger zone for signed/unsigned mixing. Spending a whole afternoon tracking down a bug, only to find it was a single comparison written wrong—this isn't uncommon. +Operator overloading itself isn't hard, but comparison operations are the real heavy disaster area for signed/unsigned mixing. Debugging a bug for a whole afternoon, only to find it was a wrong comparison line—this isn't uncommon. Look at this code: @@ -584,13 +584,13 @@ int main() { } ``` -Run it, and the output is `0`, which is `false`. A negative number is less than a positive number, yet the result is false? Why? The answer lies in one of C++'s implicit conversion rules—when signed and unsigned are mixed in a comparison, the signed value is converted to unsigned. So `-1` becomes a huge number (`4294967295`), which of course isn't less than 2. This rule has existed since C was born in 1972; at the time it might have seemed fine, but over the decades it has buried who knows how many bugs. +Run it, the output is `0`, which is `false`. Negative less than positive, but the result is actually false? Why? The answer is that C++'s implicit conversion rules have a rule—when signed and unsigned are mixed for comparison, the signed number is converted to an unsigned number. So `-1` becomes a huge number (`4294967295`), of course it's not less than 2. 
This rule has existed since C was born in 1972, maybe it seemed fine at the time, but over decades who knows how many bugs it buried. -As the speaker put it well: this rule should have been fixed in 1972, but by the time everyone realized how bad it was, there was already too much code in the world depending on this behavior, and it couldn't be changed. To this day, we're still suffering for it. +The speaker said it well: this rule should have been corrected in 1972, but by the time everyone realized how bad it was, there was too much code in the world relying on this behavior, and it couldn't be changed. To this day, we are still suffering for it. -## Fixing This Comparison Trap Ourselves +## Fixing the Comparison Trap by Hand -Since built-in types aren't reliable, let's take over comparison operations in our safe_int. The approach is straightforward: if the two sides have different types (one signed, one unsigned), do special handling first; if the types are the same, just do a normal comparison. +Since built-in types aren't reliable, let's take over comparison operations in our safe_int. The idea is straightforward: if the types on both sides are different (one signed, one unsigned), do a special judgment first; if types are the same, go directly to normal comparison. ```cpp template @@ -622,9 +622,9 @@ bool operator<(const safe_int& a, const safe_int& b) { } ``` -There's a key point here: `operator<` is written as a **templated free function** rather than a class-internal `friend`. The reason is that a class-internal `friend bool operator<(const safe_int& a, const safe_int& b)` only accepts two `safe_int` instances with the **same T**. But `safe_int < safe_int` is a comparison between two different template instances, and a class-internal friend simply can't match it. By writing it as a `template` free function, the compiler can correctly match this operator between `safe_int` and `safe_int`. 
`if constexpr` lets the compiler optimize away branches that aren't taken—zero overhead. Equality comparison and greater-than comparison follow the same approach; just write them the same way. +Here is a key point: `operator<` is written as a **templated free function** rather than a class member `friend`. The reason is that the class member `friend bool operator<(const safe_int& a, const safe_int& b)` only accepts two `safe_int` with the **same T**. And `safe_int < safe_int` is a comparison between two different template instances, the class friend can't match it at all. After writing it as a `template` free function, the compiler can correctly match this operator between `safe_int` and `safe_int`. `if constexpr` lets the compiler optimize away branches it doesn't take, zero overhead. Equality comparison, greater-than comparison follow the same idea, just write them accordingly. -Let's verify: +Verify: ```cpp int main() { @@ -638,13 +638,13 @@ int main() { ``` -## A Bigger Trap: Range Checks Silently Bypassed +## A Bigger Pit: Range Checks Silently Bypassed -Comparison operations are fixed, but there's an even more hidden scenario. The speaker gave an example with span—this pattern is extremely common in real code. +Comparison operations are fixed, but there is a more hidden scenario. The speaker gave a span example—this pattern is very common in actual code. -First, some background. `std::span` is essentially a "fat pointer"—a pointer to an element sequence plus the length of the sequence. This idea isn't new; Dennis Ritchie proposed adding pointers carrying boundary information to C as early as the early 1990s (for variable-length arrays), calling them fat pointers at the time, but the committee felt the runtime overhead was too large and didn't adopt them. Now C++20 has finally added span, a vindication decades overdue—although span itself doesn't do bounds checking, it provides a foundation for upper-level safety wrappers. +First, background. 
`std::span` is essentially a "fat pointer"—a pointer to a sequence of elements plus the length of the sequence. This idea isn't new: Dennis Ritchie proposed adding boundary-carrying pointers to C (for variable-length arrays) as early as the early 1990s, calling them fat pointers, but the committee felt the runtime overhead was too high and didn't adopt it. Now C++20 finally added span, which is something of a vindication decades late—although span itself doesn't do boundary checks, it provides the foundation for upper-level safety wrappers. -So where's the problem? Look at this code: +Where is the problem? Look at this code: ```cpp #include @@ -658,15 +658,15 @@ void process(std::span data) { } ``` -`max_size` is `unsigned int`, with a value of 50. What happens with `50 - 500` under unsigned arithmetic? Underflow—it becomes a huge number (around `4294967296 - 450`). Then `subspan` receives this huge length—and `std::span::subspan` in C++20 does **not** have bounds checking; it only has a precondition (violating it is undefined behavior) and won't throw an exception. This means that huge number gets passed straight in, and the consequence is undefined behavior—it might read memory it shouldn't, it might happen not to crash, but you absolutely cannot rely on span to catch it for you. +`max_size` is `unsigned int`, the value is 50. What happens when `50 - 500` is calculated under unsigned arithmetic? Underflow, becoming a huge number (around `4294967296 - 450`). Then `subspan` gets this huge length—and `std::span::subspan` in C++20 **has no** boundary check, it only has a precondition (violation is undefined behavior), it won't throw an exception. This means that huge number is passed directly in, the consequence is undefined behavior—it might read memory it shouldn't, might not crash, but you can't rely on span to stop it. -All because of a tiny typo, all because of built-in type conversion rules, you completely lose the protection of range checking. 
Many people think span is safe enough, never expecting it to be bypassed at the parameter calculation layer. +Just because of a small slip, just because of built-in type conversion rules, you completely lose the protection of range checks. Many people think span is safe enough, never expecting it to be bypassed at the parameter calculation layer. -## Adding Real Protection to span with safe_int +## Using safe_int to Give Span Real Protection -Now that we have safe_int, which can intercept all erroneous conversions, can we make span's size parameters protected too? Of course we can. +Now we have a safe_int that can intercept all wrong conversions, can we make span's size parameter protected too? Of course. -My approach is: first define a concept representing "types that can be used with span," and then require within this concept that the size type must be a safe integer. +My idea is: first define a concept representing "types that can be spanned", then require in this concept that the size type must be a safe integer. ```cpp #include @@ -710,23 +710,23 @@ struct safe_span { }; ``` -The key point is that the member variable `size_` has type `safe_int` instead of a bare `std::size_t`. This means any operation on this size—subtraction, comparison, assignment—will go through our safety checks. If someone writes `50 - 500`, safe_int will report an error the moment the operation happens, rather than letting a huge number quietly flow into subspan. **We don't need to patch things up in span's bounds checking; we need to eliminate erroneous values from the source—integer arithmetic itself.** Looking back, the approach is actually quite simple: replace unsafe built-in integers with safe wrapper types, so that errors are caught the moment they occur, rather than waiting for them to propagate to some bounds check before being discovered. In other words—let the class that should actually be responsible handle the corresponding errors, rather than having other components bail you out. 
+The key point is that the member variable `size_` is of type `safe_int` rather than a bare `std::size_t`. This means any operation on this size—subtraction, comparison, assignment—will go through our safety check. If someone writes `50 - 500`, safe_int will report an error at the moment of operation, rather than letting a huge number quietly flow into subspan. **We don't need to remedy this in span's boundary check, we need to eliminate the generation of wrong values at the source—the integer operation itself.** Looking back, the idea is actually simple: replace unsafe built-in integers with safe wrapper types, so errors are caught the moment they happen, not waiting for them to propagate to some boundary check. In other words—let the class truly responsible for handling handle the corresponding error, rather than letting other components cover for you. --- -# Adding Bounds Checking to span: From Manual Defense to Type Deduction +# Adding Boundary Checks to Span: From Manual Defense to Type Deduction -Array out-of-bounds access has always been a headache: it runs fast, but once you go out of bounds, the program might crash in some completely unrelated place, and then you stare at gdb for half an hour. Next, let's look at a structured approach to bounds checking for subscript access. +The problem of array out-of-bounds has always been a headache: it runs fast, but once it goes out of bounds, the program might crash in a completely unrelated place, and then you stare at gdb for half an hour. Next, let's look at a structured index out-of-bounds checking method. -## First, Let's Clarify What We Want to Do +## First, Clarify What We Want to Do -The core need is actually very simple: I have a contiguous memory region, I know how big it is, and I want to automatically check whether a subscript is out of bounds every time I use it to access the region. 
If it's out of bounds, throw an exception immediately or get blocked by the compiler, rather than waiting until memory is corrupted before I notice. +The core requirement is actually very simple: I have a contiguous memory area, I know how big it is, I want to automatically check if the index is out of bounds every time I access it with an index. If it's out of bounds, throw an exception immediately or be blocked by the compiler, rather than waiting for me to discover it after memory is corrupted. -Doesn't this sound like what `std::vector`'s `at()` does? But the difference is, I don't want to bear the cost of a dynamically allocated vector—I might just have a raw pointer plus a length, or a native array, and I want to access it in the same safe way. That's the whole point of span—it doesn't own the data, it just "views" the data, but while viewing, it can watch the boundaries for you. +Doesn't this sound like what `std::vector`'s `at()` does? But the difference is, I don't want to bear the overhead of a dynamically allocated vector, I might just have a raw pointer plus a length, or a native array, and I want to access it in the same safe way. This is the meaning of span—it doesn't own the data, it just "looks" at the data, but when looking, it can help you watch the boundaries. -## Writing a Checked Subscript Access +## Write a Checked Index Access by Hand -Let's start with the most basic scenario. Suppose I already have something of span type that internally holds data and size. What I need to do now is overload `operator[]` so that it performs a range check before executing the access. +Let's start with the most basic scenario. Suppose I already have a span-like thing, it holds data and size internally. What I need to do now is overload `operator[]` to make it check the range before executing the access. 
```cpp #include @@ -763,7 +763,7 @@ public: }; ``` -You see, the constructor here only accepts a pointer and a size—this is what we call "spanable"—anything that can provide a data pointer and element count can be used to initialize it. Then `operator[]` does one thing: if the index you give is greater than or equal to size, throw an exception directly. +You see, the constructor here only accepts a pointer and a size, this is so-called "spanable"—anything that can provide a data pointer and element count can be used to initialize it. Then inside `operator[]`, one thing is done: if the index you give is greater than or equal to size, throw an exception directly. ## Run It and See the Effect @@ -786,18 +786,18 @@ int main() { } ``` -Running it produces this output: +Running it outputs this: ```text 3 捕获到异常: 下标越界了兄弟 ``` -At this point you might think, there's nothing special about this, isn't this just what `std::vector::at()` does? Don't worry, the key points are coming up. +At this point, you might think, this isn't special, `std::vector::at()` is just like this. Don't worry, the key point is later. -## The Negative Subscript Problem—The Signed/Unsigned Trap +## The Problem of Negative Indices—The Signed/Unsigned Pit -There's an easily overlooked trap here. `operator[]` accepts a parameter of type `std::size_t`, which is an unsigned integer. If you directly pass a `-10` in, what happens? +There is an easily overlooked trap here. `operator[]` accepts a parameter of type `std::size_t`, which is an unsigned integer. If you pass a `-10` directly, what happens? ```cpp // 你以为你在传 -10,其实编译器会做隐式转换 @@ -805,9 +805,9 @@ There's an easily overlooked trap here. `operator[]` accepts a parameter of type // s[-10] 实际上变成了 s[18446744073709551606] 之类的鬼东西 ``` -But! If you change the parameter type to a signed `ptrdiff_t`, the compiler can help you catch some obvious problems at compile time. 
Or rather, if you use `std::span`'s standard implementation, it has specific requirements for the subscript type. +But! If you change the parameter type to signed `ptrdiff_t`, the compiler can help you block some obvious problems at compile time. Or, if you use the standard implementation of `std::span`, it has specific requirements for the index type. -Let me rewrite it, changing the subscript type to signed so that negative numbers can be correctly identified: +Let me change the writing to make the index type signed, so negative numbers can be correctly identified: ```cpp template @@ -857,13 +857,13 @@ Output: 捕获到异常: 负数下标,你想干嘛 ``` -What's worth noting here is that when using `size_t` as the subscript type, a negative number passed in is implicitly converted to an astronomical figure, and then either it happens not to go out of bounds and reads garbage data (which is scarier), or it goes out of bounds and throws an exception but with a completely misleading error message. After changing to `ptrdiff_t`, a negative number is just a negative number—clear and unambiguous. +Here it's worth noting that when using `size_t` as the index type, a negative number passed in is directly implicitly converted to an astronomical number, then either it luckily doesn't go out of bounds and reads garbage data (scarier), or it goes out of bounds and throws an exception but the error message is completely misleading. After changing to `ptrdiff_t`, a negative number is just a negative number, clear and simple. -However, the compiler can only catch the simplest cases like literal negative numbers. In real engineering, the problems that actually occur are often values calculated elsewhere—some function returns a -1 to indicate failure, you forget to check it and use it directly as a subscript. This can only be caught at runtime, but at least with this check, the program won't silently corrupt memory. +However, the compiler can only block the simplest cases like literal negative numbers. 
In actual engineering, the real problems are often values calculated elsewhere—some function returns a -1 to indicate failure, someone forgets to check and uses it as an index. This can only be caught at runtime, but at least with this check, the program won't silently corrupt memory. -## Using Another span's Element as a Size—A More Realistic Scenario +## Using Another Span's Element as Size—A More Realistic Scenario -The speaker mentioned a very practical example: you use an element value from one span as a size parameter for another operation. You don't actually know what that value is, but unless it's a reasonable positive integer, it should be intercepted. +The speaker mentioned a very practical example: you use a value from one span as the size parameter for another operation. You don't actually know what that value is, but unless it's a reasonable positive integer, it should be blocked. ```cpp void process_with_dynamic_size(std::span params, std::span data) { @@ -917,11 +917,11 @@ Output: 捕获到异常: params[0] 不是合法的正整数 ``` -This pattern is particularly common in real projects. You get a number from a config file, a network protocol, or user input, and then use it to decide how many elements to access. Without checking, this is a perfect security vulnerability. +This kind of writing is particularly common in real projects. You get a number from a config file, network protocol, user input, and then use it to decide how many elements to access. Without checking, this is a perfect security vulnerability. -## Type Deduction: Stop Repeating What the Compiler Already Knows +## Type Deduction: Don't Repeat What the Compiler Already Knows -At this point, every time we have to write `checked_span`, `checked_span` repeating the element type, even though the compiler can obviously deduce it from the initialization arguments. This is exactly the problem that C++17's CTAD (Class Template Argument Deduction) was introduced to solve. 
Just add a deduction guide: +At this point, every time you have to write `checked_span`, `checked_span` repeating the element type, while the compiler can obviously deduce it from the initialization parameters. This is the problem that C++17's CTAD (Class Template Argument Deduction) aims to solve. Just add a deduction guide: ```cpp template @@ -982,15 +982,15 @@ int main() { } ``` -Type deduction might seem like "syntactic sugar," but after writing hundreds of span-related code items in a project, you'll find that writing one fewer `int` isn't about saving three characters—it's about when you later change `int` to `int64_t`, you only need to change it in one place, instead of searching everywhere for where you forgot to update. +Type deduction seems like "syntactic sugar", but after writing hundreds of pieces of span-related code in a project, you'll find that writing one fewer `int` isn't about saving three characters; it's that when you change `int` to `int64_t` later, you only need to change one place, instead of searching everywhere for the spots you forgot to update. -This is a core philosophy of generic programming: don't repeat what the compiler already knows and what you already know. +This is a core philosophy of generic programming: don't repeat what the compiler already knows and you already know. -## Sub-spans and Construction from Pointers—A More Complete Toolbox +## Subspan and Construction from Pointers—A More Complete Toolbox -Having just one complete span isn't enough. In real development, you often need to slice a small piece from a large span, or construct a span from a raw pointer. +Just a complete span isn't enough. In actual development, you often need to cut a small piece from a large span, or construct a span from a raw pointer. -First, the scenario of constructing from a pointer. Since the whole point of span is safety, isn't constructing a span from a raw pointer inherently unsafe? 
There's indeed no way to check whether that pointer really points to that many elements—the compiler doesn't know, and there's no way to verify at runtime. But the key point is: **constructing a span from a pointer itself will look extremely conspicuous in code reviews and to static analysis tools**. If a project's standards require "all array access must go through span," then as soon as someone writes `span(ptr, n)` kind of code, the reviewer can see at a glance: there's an unsafe boundary here that needs close attention. This is much easier to manage than having `ptr[i]` scattered everywhere. +First, the scenario of constructing from a pointer. Since the meaning of span is safety, isn't constructing a span from a raw pointer inherently unsafe? Indeed, there's no way to check whether that pointer really points to that many elements—the compiler doesn't know, and runtime can't verify it either. But the key is: **constructing a span from a pointer itself will stand out conspicuously in code reviews and to static analysis tools**. If a project standard requires "all array access must go through span", then as soon as someone writes `span(ptr, n)`, the reviewer can see at a glance that this is an unsafe boundary that needs close attention. This is much easier to manage than having `ptr[i]` everywhere. ```cpp #include @@ -1059,11 +1059,11 @@ Output: 捕获: take_front: n 超过了 span 的大小 ``` -Note how I write the bounds check in `take_range`: `count > s.size() - offset`. I didn't use `offset + count > s.size()` here, because the latter could overflow when signed and unsigned are mixed. Although in this scenario both `offset` and `count` are `size_t` and won't overflow, developing the habit of using subtraction rather than addition for range checks will save you from pitfalls elsewhere. This is also the approach mentioned in the talk of "using numbers rather than mixing signed and unsigned." +Note the way I wrote the boundary check in `take_range`: `count > s.size() - offset`. 
I didn't use `offset + count > s.size()` here because the latter might overflow when signed and unsigned are mixed. Although in this scenario `offset` and `count` are both `size_t` and won't overflow, developing the habit of using subtraction rather than addition for range checks can save you from pitfalls elsewhere. This is also the idea mentioned in the speech of "using numbers rather than mixing signed and unsigned". -Similarly, these helper functions can also have deduction guides added, so callers don't need to write template arguments. It's just two lines of deduction guides, but the code reads completely differently—you see `take_front(full, 3)`, not `take_front(full, 3)`. The compiler knows `full` is `span`, so it can deduce that the return value is also `span`; you don't need to worry about it for the compiler. +Similarly, these helper functions can also add deduction guides, so the call site doesn't need to write template parameters. Two lines of deduction guides, but the code reads completely differently—you see `take_front(full, 3)`, not `take_front(full, 3)`. The compiler knows `full` is `span`, it can deduce the return value is also `span`, you don't need to worry for it. -At this point, span's basic safe access, type deduction, and sub-span slicing are all sorted out. The code looks quite clean, with no unnecessary repetition, and checks are in place where they should be. But we're not done yet—there are more complex scenarios ahead. +At this point, span's basic safe access, type deduction, and subspan slicing are all figured out. The code looks quite clean, no redundant repetition, and checks where needed. But things aren't over—there are even more complex scenarios later. . But the question is: how do you know whether you're paying a cost? The compiler won't proactively tell you "this abstraction has overhead." It will silently generate code. And that code is assembly. +This is tied to the core philosophy of C++. 
From its inception, C++ has pursued one thing: you don't pay for what you don't use. But the question is, how do you know if you're paying a price? The compiler won't proactively tell you "this abstraction has a cost"; it will silently generate code. And that code is assembly. -The most direct way to understand what code is generated after a template is expanded is not to read compiler error messages (though that's important too), but to look at the generated assembly. When you see that a function instantiated from a template is perfectly inlined, loops are unrolled, and registers are allocated sensibly, you truly understand what "zero-overhead abstraction" means. Conversely, when you see a bunch of unnecessary function calls and memory shuffling, you immediately know where the problem lies. +The most direct way to understand what code is generated after template expansion is not to read compiler error messages (though that is important too), but to look at the generated assembly. When you see functions instantiated from templates perfectly inlined, loops unrolled, and registers allocated reasonably, you will truly understand what "zero-overhead abstraction" means. Conversely, when you see a bunch of redundant function calls and memory shuffling, you will immediately know where the problem lies. -So don't treat assembly as some mysterious, esoteric thing. It's simply a mirror reflecting what your C++ code actually looks like. You don't need to master it, but you need the ability to read its outline and know when something looks off. +So don't treat assembly as something mysterious. It is just a mirror reflecting exactly what your C++ code looks like. You don't need to master it, but you need to be able to read its outline and know when something looks wrong. 
--- -# Starting from "Writing Code by Hand": Why We Need to Understand the Low Level +# Starting from "Hand-Coding": Why We Need to Understand the Underlying Layers -The speaker mentioned the era of the ZX Spectrum and manually typing in code. For many people learning to program, compiling, running, and seeing that line of text in the terminal feels like enough. But a question quickly arises: you don't actually know how that line of text got to the screen, or even what the code turned into after compilation. This feeling of a "black box" might not matter when writing high-level abstractions, but once a bug appears — especially a weird memory-related one — you're left with nowhere to start. +The speaker mentioned the ZX Spectrum and the era of manually entering code. For many beginners, compiling, running, and seeing that line in the terminal feels like enough. But a problem quickly becomes apparent: you don't actually know how that line got to the screen, or even what the code turned into after compilation. This "black box feeling" might not matter when writing high-level abstractions, but once a bug appears—especially those weird memory-related bugs—you are helpless. -Learning to program isn't just about learning syntax, frameworks, or APIs. C++ syntax alone is enough to give anyone a headache — rvalue references, perfect forwarding, SFINAE. Just memorizing the names of these concepts, which are quite obscure to beginners, takes time. But the deeper you go, the more you run into an awkward truth: you don't truly understand what the code you write does at the machine level. When someone asks "how does the Hello World string get from the executable file to the CPU," and you can't answer, it means your understanding of the low level isn't solid enough. +Learning programming isn't just about learning syntax, frameworks, or APIs. C++ syntax alone is enough to give a headache—rvalue references, perfect forwarding, SFINAE. 
Just memorizing the names of these obscure concepts takes time for beginners. But the deeper you go, the more you encounter an awkward fact: you don't truly understand what your code is doing at the machine level. When someone asks "How does the 'Hello World' string get from the executable file to the CPU?", if you can't answer, it means your understanding of the underlying layer isn't solid enough. ## Hands-on: What Does C++ Code Actually Become? -Compiling your own C++ code into assembly and reading it line by line is the most direct way to understand "what the code is actually doing." +Compiling your C++ code into assembly and reading it line by line is the most direct way to understand "what the code is actually doing." -Experiment environment: Arch Linux WSL, GCC 16.1.1, with the `-S -O0` parameter added to the compile command. `-S` tells the compiler to only generate assembly and not proceed further, and `-O0` disables all optimizations, because with optimizations enabled the assembly gets transformed beyond recognition, making it very difficult for beginners to map it back to the source code. +Experiment environment: Arch Linux WSL, GCC 16.1.1, with `-S -O0` added to the compile command. `-S` tells the compiler to only generate assembly and not proceed further. `-O0` turns off all optimizations, because with optimizations enabled, the assembly is altered beyond recognition, making it hard for beginners to map back to the source code. -Let's write the simplest example: +Let's write a simplest example: ```cpp // demo.cpp @@ -69,7 +69,7 @@ Compile it: g++ -S -O0 -o demo.s demo.cpp ``` -Then open `demo.s`, and you'll see a huge amount of stuff. Don't panic — most of it is auxiliary information added by the compiler. We only care about the core parts. On x86-64, the assembly for the `add` function looks roughly like this: +Then open `demo.s`. You will see a huge pile of stuff. Don't panic; most of it is auxiliary information added by the compiler. 
We only care about the core part. Under x86-64, the assembly for the `add` function looks roughly like this: ```asm add(int, int): @@ -84,7 +84,7 @@ add(int, int): ret ; 返回 ``` -The part in the `main` function that calls `add`: +The part in `main` where `add` is called: ```asm main: @@ -100,11 +100,11 @@ main: ret ``` -When you see this assembly for the first time, you'll notice that under `-O0`, the compiler dutifully moves the parameters from registers to the stack, then reads them back from the stack to do the addition. It's not efficient, but this is the raw, unoptimized form — every line is crystal clear, and you can see exactly how the data flows. +When you see this assembly for the first time, you will notice: under `-O0`, the compiler honestly moves parameters from registers to the stack first, then reads them back from the stack to do addition. It's not efficient, but this is the original look without optimizations—every line is clear, and you can see how data flows. -## An Easy Trap to Fall Into +## A Common Pitfall -There's a trap here that must be mentioned. At first, I compiled with `-O1`, only to find that the assembly for the `add` function was just two or three lines. The parameters never even hit the stack — the computation was done entirely in registers (those familiar with compiler optimizations probably won't find this surprising — after all, it's an operation that can be handled at the register level, right!). This is because even `-O1` already performs register allocation optimization — the compiler realized there was no need to store the parameters on the stack and read them back, so it just used the registers directly. So if you want to follow along with the experiment, make sure to use `-O0`, otherwise you'll see a bunch of incomprehensible output. +There is a pitfall here I must warn you about. Initially, I used `-O1` to compile, only to find that the assembly for the `add` function was just two or three lines. 
The parameters never even hit the stack; the calculation was done directly in registers. (Readers familiar with compiler optimization won't find this surprising—after all, the whole computation can be handled in registers.) This is because `-O1` starts doing register allocation optimization—the compiler realized there's no need to store parameters to the stack and read them back, so it just used registers. So if you want to follow along with the experiment, make sure to use `-O0`, otherwise you will see a bunch of incomprehensible stuff. ```asm .file "demo.cpp" @@ -133,57 +133,57 @@ main: .section .note.GNU-stack,"",@progbits ``` -Another trap is that calling conventions differ across platforms. What's shown above is the x86-64 System V ABI, where the first two integer arguments are placed in `%edi` and `%esi` respectively, and the return value goes in `%eax`. If you compile with MSVC on Windows, the parameter passing method is different (it uses `%rcx`, `%rdx`). So if your results look different, check your platform and compiler first. +Another pitfall is that calling conventions differ by platform. The example above shows the x86-64 System V ABI, where the first two integer arguments are placed in `%edi` and `%esi`, and the return value is in `%eax`. If you compile on Windows with MSVC, the way parameters are passed is different (it uses `%rcx`, `%rdx`). So if the results look different, check your platform and compiler first. ## Why Understanding Assembly Helps You Understand C++ -After seeing this assembly, many things that previously seemed mystical become clear. For example, why is the performance difference between passing by value and passing by reference in C++ so large? Passing by value means copying data. If the object is large, the overhead of copying at the assembly level is instruction after instruction of `mov`, laid out right there in front of you. What about passing by reference? 
You're only passing an address — an 8-byte pointer. No matter how large the object is, you only pass 8 bytes. You might have "known" these principles before, but after seeing the assembly, you truly "understand" them. +After seeing this assembly, many things that previously seemed mysterious become clear. For example, why is the performance difference between passing by value and passing by reference in C++ so huge? Passing by value means copying data. If the object is large, the cost of copying at the assembly level is line after line of `mov` instructions, laid out clearly. Passing by reference? You just pass an address, an 8-byte pointer. No matter how big the object is, you pass 8 bytes. You might have "known" these principles before, but after seeing assembly, you "understand" them. -Take another example: why can inline functions improve performance? The `call` instruction itself has overhead — you need to save the return address, jump, and then jump back after the function returns. If the compiler expands the function body directly at the call site, all that overhead disappears. In the assembly, you won't see `call` or `ret` at all; the code just executes sequentially. +Another example is why inline functions improve performance: the `call` instruction itself has overhead—saving the return address, jumping, and jumping back after the function returns. If the compiler expands the function body directly at the call site, this overhead disappears completely. In the assembly, you won't see `call` or `ret`; the code just executes sequentially. -When you can see the machine instructions corresponding to every line of code, the concept of "performance" is no longer an abstract "fast" or "slow," but concrete: "these few instructions can be eliminated," or "these memory accesses can be merged." 
+When you can see the machine instructions corresponding to every line of code, the concept of "performance" is no longer an abstract "fast" or "slow", but concrete "these instructions can be saved" or "this memory access can be merged". ## Directions to Dig Deeper -Once you understand this layer, you'll naturally want to know: how does the linker stitch multiple object files together? What actually happens when a shared library is loaded? How does an operating system's system call switch from user mode to kernel mode? These aren't topics from "compiler theory" and "operating systems" textbooks that are irrelevant to your application code — they are the foundation. If the foundation isn't solid, everything built on top will wobble. +After figuring out this layer, you will naturally wonder: how does the linker stitch multiple object files together? What actually happens when a dynamic library is loaded? How do operating system system calls switch from user mode to kernel mode? These things aren't irrelevant content in "Compilers" and "Operating Systems" textbooks—they are the foundation. If the foundation is unstable, everything built on top will wobble. -If you've also had a vague sense about the low level, I suggest starting with "looking at assembly." You don't need to learn it deeply, and you don't need to be able to write assembly by hand. As long as you can "look at C++ code and roughly guess what the assembly looks like," your programming intuition will level up. +If you also have a vague feeling about the low-level, I suggest starting with "looking at assembly". You don't need to learn very deeply; you don't need to be able to write assembly by hand. As long as you can "see C++ code and roughly guess what the assembly looks like", your programming intuition will move up a level. 
-## What Exactly Is Assembly — Starting from the Birth of Compiler Explorer +## What Exactly is Assembly—Starting with the Birth of Compiler Explorer -Before diving into "digging deeper," there's a basic question worth answering: what exactly do we mean when we keep saying "assembly"? +Before figuring out "digging deeper", there is a basic question worth answering: what exactly do we mean by "assembly"? -The speaker was writing C++ at a company where the boss was very conservative and didn't allow any new C++ features. How conservative, exactly? They were debating whether they could use range-based for loops to replace the most primitive `for (int i = 0; i < sizeof(array); ...)` syntax. They had recently been burned by another programming language where the two approaches were indeed not equivalent, so the boss was particularly sensitive to "syntactic sugar." They ran a benchmark, and the results were ambiguous. The boss slammed the table: don't touch it. +The speaker was writing C++ at a company where the boss was very conservative and didn't allow using any new C++ features. How conservative? They were arguing whether they could use range-based for loops to replace the most primitive `for (int i = 0; i < sizeof(array); ...)` style. They had just been burned by another programming language where these two styles were indeed not equivalent, so the boss was very sensitive to "syntactic sugar". They ran a benchmark, but the results were ambiguous. The boss slammed the table: don't touch it. -The speaker didn't give up. He casually wrote a shell script that toggled compiler flags in the terminal, causing the assembly output to continuously refresh. Then he felt it was too messy, so he used regular expressions to do some replacement and formatting, and piped it through `c++filt` to demangle the symbol names that had been mangled beyond recognition. 
After finishing, he realized he could edit C++ code on the left in Vim and see the corresponding assembly output in real time on the right. +The speaker didn't give up. He casually wrote a shell script, switching compile options in the terminal, causing the assembly output to refresh continuously. Then he thought it was too messy, so he used regex to do some replacement and formatting, and piped it through `c++filt` to restore those symbol names mangled by name mangling. After finishing, he discovered: he could edit C++ code on the left in Vim and see the corresponding assembly output on the right in real-time. -This tool was the prototype of what later became the famous Compiler Explorer (godbolt.org). This story reveals a key insight: **even though we've been pursuing higher abstractions in C++, assembly remains super important to this language and to us.** Many developers feel that once they use C++17, `std::optional`, and `std::variant`, they no longer need to look at assembly — the compiler is smarter than they are, and the code it generates must be fine. But once they actually start looking at assembly, they discover that while the compiler is indeed smart, what it does often isn't what they assumed. +This tool was the prototype of the later famous Compiler Explorer (aka godbolt.org). This story reveals a key realization: **even though we constantly pursue higher abstractions in C++, assembly is still super important to this language and to us.** Many developers think that using C++17, `std::optional`, and `std::variant` means they don't need to look at assembly; the compiler is smarter than them, so the generated code must be fine. But only after actually looking at assembly do they realize that while the compiler is indeed smart, what it does is often different from what they assumed. -So what exactly is "assembly"? 
The dictionary meaning of "assembly" has several layers: it's a set of parts working together; it's the act or process of assembling a set of parts; it's a group of people gathered in one place for a purpose; it's a legislative body with ominous political connotations; in military terms, it's a drum signal calling troops to gather. And finally, the meaning we actually care about — it's the shortened form of assembly language. +So what exactly is "assembly"? The dictionary definition of "assembly" has several layers: it is a set of parts working together; it is the act or process of assembling parts together; it is a group of people gathered for a purpose; it is a legislature with ominous political overtones; in the military, it is a drum signal calling an army to gather. Finally, there is the meaning we actually care about—it is the shorthand form of assembly language. -In other words, when we keep saying "look at assembly," strictly speaking, we've been using the wrong term. We should say "look at assembly language." This might sound like a boring word game, but think about it — it actually makes sense. "Assembly" itself is an action, a process — putting parts together. "Assembly language" is the thing with concrete syntax, an instruction set, and opcodes. What the compiler does is indeed "assembly" — assembling the various parts of C++ (variables, functions, template instantiations) into the final machine code. And what we look at is that "assembly language" — the blueprint produced during the assembly process. +In other words, when we say "look at assembly", strictly speaking, we are using the wrong term. We should say "look at assembly language". This sounds like a boring word game, but think about it—it actually makes sense. "Assembly" itself is an action, a process—putting parts together. "Assembly language" is the thing with specific syntax, an instruction set, and opcodes. 
What the compiler does is indeed "assembly"—assembling the various parts of C++ (variables, functions, template instantiations) into the final machine code. What we look at is that "assembly language", the blueprint produced during the assembly process. -Once you understand this distinction, it becomes clear: what we're looking at is assembly language, the human-readable form of instructions that the CPU can understand, not some abstract "assembly process." And the reason assembly language is important to C++ programmers is that C++ abstractions have a cost (which is somewhat contradictory — we might be pursuing abstractions without cost, but that's the goal, not the actual result...), and this cost is completely invisible unless you look at it through assembly language. +Once this distinction is clear, we can understand: we are looking at assembly language, the human-readable form of instructions that the CPU understands, not some abstract "assembly process". The reason assembly language is important to C++ programmers is that C++ abstractions have a cost (paradoxically, we might be pursuing abstractions with no cost, but that is the goal, not the actual result...), and this cost is invisible without looking at assembly language. -Take the simplest example: a function on a hot path uses a `std::function` because you figure "the compiler will optimize it anyway." The result is a performance drop. Fire up Compiler Explorer and look at the assembly — the `std::function` call involves a virtual function dispatch, a heap allocation check, and a bunch of indirect jumps from type erasure. If you use a template parameter instead, the compiler inlines it directly — there isn't even a function call. You'd never know what happened if you didn't look at the assembly language. A benchmark can tell you "it got slower," but only assembly language can tell you "why it got slower." 
+Here is the simplest example: using a `std::function` in a function on a hot path, thinking "the compiler will optimize it anyway". The result was a performance drop. Looking at the assembly in Compiler Explorer—the call to `std::function` involved a virtual function dispatch, a heap allocation check, and a bunch of type-erased indirect jumps. If a template parameter was used directly, the compiler inlined it directly, with no function call at all. Without looking at assembly language, you would never know what happened. A benchmark can tell you "it got slower", but only assembly language can tell you "why it got slower". --- # From Assembly to C: A Forced Paradigm Jump -The talk mentioned a very representative experience: someone, without any computer science education, wrote a program entirely in assembly that included reference counting and even invented mark-sweep on their own. This isn't about some profound theory — it's a real person genuinely stumbling into problems, discovering them, and then "inventing" something that had already been invented. This process helps us understand where the concepts we later encounter in C++ actually came from. +The talk mentioned a very representative experience: someone, without any computer science background, wrote a program purely in assembly that included reference counting and even invented mark-sweep garbage collection themselves. This isn't about high theory; it's a real person stepping into real pitfalls, discovering problems, and then "inventing" something that had already been invented. This process helps us understand how the concepts we later encounter in C++ came to be. ## That "Monster" Written in Pure Assembly -Imagine this scenario: a person studying physics who knows nothing about computer science wants to write a fully windowed chat program. 
Not the kind where you type text and press Enter in a command line — one with a windowed interface, communicating over TCP, able to pause and send messages, formatting complex strings, and even supporting direct file transfers between clients. And it had a built-in scripting language of his own invention, inspired by BASIC, that supported dynamic allocation. +Imagine this scene: a person studying physics, knowing nothing about computer science, wants to write a full-windowed chat program. Not the kind where you type text and hit enter in a command line, but one with a windowed interface, communicating via TCP, capable of pausing to send messages, formatting complex strings, and supporting direct file transfer between clients. It even has a built-in scripting language of his own invention, inspired by BASIC, which supports dynamic allocation. -Many beginners' impression of assembly is writing interrupt handlers or startup code — a few dozen or a few hundred lines at most. But this program was page after page of assembly code, all posted on GitHub, with tag names so absurd they made you lose all sense of meaning. The most classic one was called `WombleLoopJedi` — you had no idea what it meant, but you could feel that the person writing the code had entered some kind of transcendent state. +Many beginners' impression of assembly is writing interrupt handlers or startup code, maybe dozens or hundreds of lines at most. But this program is page after page of assembly code, all hosted on GitHub, with tag names so ridiculous they lose all meaning—the most classic one being `WombleLoopJedi`—no idea what it means, but you can feel the person writing the code was in some kind of metaphysical state. -The most interesting part is what came next: he added dynamic allocation to the scripting language, then thought "reference counting is a good idea" and implemented it. Then he discovered the circular reference problem. 
So he came up with a complete line of reasoning — find the things that are no longer referenced and manually delete them. Years later, he mentioned this to a friend, and his friend said, "Oh, so you invented mark-sweep garbage collection." +The most interesting part is this: he added dynamic allocation to the scripting language, then thought "reference counting is a good idea", so he implemented reference counting. Then he discovered the circular reference problem. Then he came up with a complete idea—find those things that are no longer referenced and manually delete them. Years later, chatting with a friend, the friend said, "Oh, so you invented mark-sweep garbage collection." -This is pure thinking without the constraints of textbooks. He didn't know it was called mark-sweep, but starting from the problem, he step by step deduced the correct solution. Mark-sweep wasn't an algorithm someone pulled out of thin air — it's the natural deduction for solving the specific problem of "reference counting can't handle circular references." +This is pure thinking without the constraints of textbooks. He didn't know it was called mark-sweep, but starting from the problem, he step-by-step derived the correct solution. Mark-sweep wasn't an algorithm someone came up with out of thin air; it is the natural derivation for solving the specific problem "reference counting can't handle circular references". -We can use a simplified pseudocode to reconstruct this thought process, which is much clearer than just explaining the concept: +We can use a simplified pseudocode to reconstruct this thought process, which is much clearer than just explaining concepts: ```cpp // 第一阶段:引用计数(能想到的第一步) @@ -211,7 +211,7 @@ void release(Object* obj) { // 它们永远不会被释放 —— 这就是循环引用 ``` -Since the reference count can never reach zero, let's switch perspectives — instead of starting from "how many things reference me," start from "can anything still reach me?" 
If it can be reached, it's alive; if it can't be reached, it's dead. Delete the dead ones. This is the core idea of mark-sweep: mark is for tagging what can be reached, and sweep is for cleaning up what can't be reached. +Since reference counting can't reach zero, let's change the angle—instead of starting from "how many things reference me", start from "is there anything that can still reach me". Those that can be reached are alive; those that cannot are dead, and the dead ones are deleted. This is the core idea of mark-sweep. Mark marks the reachable, sweep sweeps away the unreachable. ```cpp // 第二阶段:他"发明"的 mark-sweep(概念还原) @@ -264,39 +264,39 @@ void garbage_collect() { } ``` -The logic really isn't complicated. Garbage collection might look like black magic, but when you还原 it to this scenario — a person writing a scripting language who needs to manage memory, finds that reference counting isn't enough, and so switches to a different approach — it becomes very natural. The key isn't how elegant the algorithm is, but whether you can get there starting from a real problem. +Logically, it's really not complex. Garbage collection looks like black magic, but reducing it to this scenario—a person writing a scripting language, needing to manage memory, reference counting isn't enough, so change the angle—it becomes very natural. The key isn't how clever the algorithm is, but whether you can get to this point starting from a real problem. -## From Assembly to C: A Forced Turning Point +## From Assembly to C: A Forced Turn -This person had been writing everything in assembly, and assembly had been his companion all along. Until one day, he wanted to run a multi-user dungeon — a MUD. +This person kept writing things in assembly, and assembly stayed with him all the way. Until one day, he wanted to run a Multi-User Dungeon, a MUD. -A MUD is a purely text-based multiplayer online RPG with no graphical interface. Everything is described in text. 
When you log in, you see things like "You stand at a crossroads. To the north is a castle, to the east is a forest." Type "go north" to go north, type "attack goblin" to fight a goblin. You can team up with friends, fight monsters, and cast spells. It's essentially an online multiplayer text version of Dungeons & Dragons. +A MUD is a purely text-based multiplayer online RPG with no graphical interface; everything is described in text. You log in and see "You are standing at a crossroads. To the north is a castle, to the east is a forest." You type "go north" to go north, "attack goblin" to hit a goblin. You can team up with friends, fight monsters, cast spells—essentially it's the online multiplayer version of "Dungeons & Dragons" in text. -The problem was, he couldn't write an entire MUD from scratch by himself. It was too big — even for someone who could write thousands of pages of assembly. So he found some source code circulating online, with a permissive license, ready to use. There's an important historical context to note here: there was no GitHub, or anything like it, back then. The way people shared code was by passing around tarballs — `.tar.gz` compressed archives, usually on IRC, directly from person to person. You'd shout in an IRC channel, "Does anyone have the MUD source code?", and someone would send you a compressed file via DCC. You'd get the archive and start tinkering. No version control, no issue tracker, no pull requests — just raw code files. +The problem was, he couldn't write a whole MUD from scratch by himself. It was too big, even for someone who could write thousands of pages of assembly. So he found some source code online, the license was fine, and he could use it directly. Note the historical context here: there was no GitHub then, nor any similar platform. The way people shared code was passing tarballs—those `.tar.gz` compressed archives, usually on IRC, transferring files directly from person to person. 
Shouting in an IRC channel "Who has the MUD source code?", then someone sends a compressed file via DCC, and you get the archive and start tinkering. No version control, no issue tracker, no pull requests, just naked code files. -And those MUD source codes were written in a programming language called C. This was the turning point. A person who had written thousands of pages of assembly was now facing C source code. He had to learn C, otherwise he couldn't modify that MUD. This wasn't the motivation of "I want to learn a new language" — it was the motivation of "I must understand this code to do what I want to do." +And those MUD source codes were written in a programming language called C. This was the turning point. A person who had written thousands of pages of assembly was now facing a piece of C language code. He had to learn C, otherwise he couldn't modify that MUD. This wasn't the motivation of "I want to learn a new language", but "I must understand this code to do what I want to do". -Jumping from assembly to C might not seem like a big deal today, but at the time, it was actually a huge paradigm jump. In assembly, you manipulate registers, memory addresses, and interrupts. In C, you start using abstract concepts like variables, functions, and structs. For someone who had only used assembly, the idea that "the compiler handles the stack frame for you" was something that required adjustment. But on the flip side, precisely because he came from assembly, his intuitive understanding of how C code runs at the low level might have been better than many people with formal CS degrees — because he knew exactly what kind of machine instructions those C statements would ultimately become. +Jumping from assembly to C might not seem like much today, but at the time, it was a huge paradigm jump. In assembly, you manipulate registers, memory addresses, and interrupts. In C, you start using abstract concepts like variables, functions, and structs. 
For someone who always used assembly, the idea that "the compiler handles the stack frame for you" required adaptation. But conversely, because he came from assembly, his intuitive understanding of how C code runs at the bottom level might be better than many CS graduates—because he knows what machine instructions those C statements eventually turn into. -Sometimes what drives us forward isn't a systematic study plan, but a project you really want to build that your current toolchain simply can't handle. +Sometimes what drives us forward is not a systematic study plan, but a specific project we really want to do but can't handle with our current toolchain. --- # From Assembly to C++: Why We Need High-Level Languages -The speaker mentioned writing programs in pure assembly at age 15 to submit to magazines for money. From this background, we can understand one thing: why the C++ language is designed the way it is, and why it has so many "seemingly redundant" layers of abstraction. +The speaker mentioned he wrote programs in pure assembly at 15 to submit to magazines for money. From this background, we can understand one thing: why the C++ language is designed the way it is, and why it has so many "seemingly superfluous" layers of abstraction. -If you look back from the perspective of assembly, many design decisions aren't "deliberately obscure" — they were "forced into existence." +If you look back from the perspective of assembly, many design decisions aren't "deliberately mysterious", but "forced out". -## The Real Experience of Assembly Programming +## The Practical Experience of Assembly Programming -Writing a program that "reads two numbers from standard input and adds them" takes nearly 50 lines in x86 assembly. You have to manage stack alignment yourself, set up system call numbers yourself, and handle buffers yourself. The speaker said the programs he wrote at 15 were published in magazines as 20 densely packed pages of small print.
Type one punctuation mark wrong, and the program blows up. Then you have to find that error across 20 pages of printed text. +Writing a program that "reads two numbers from standard input and adds them" takes nearly 50 lines in x86 assembly, plus you manage stack alignment yourself, fiddle with system call numbers yourself, and handle buffers yourself. The speaker said the programs he wrote at 15 were published in magazines, 20 pages of tiny text densely packed. Type one punctuation mark wrong, the program crashes, and then you have to find that error in 20 pages of print. -Once you understand many of C++'s mechanisms, your mindset completely changes. It's no longer "yet another piece of syntax to memorize," but "look at how much trouble this thing saves me." +Understanding many of C++'s mechanisms completely changes your mindset. It's not "another syntax to memorize", but "how much trouble this thing saved me". -## The Same Logic: How Much Difference Between Assembly and C++? +## How Different Are Assembly and C++ for the Same Logic? -Let's look at a particularly simple example — calling a function, passing a parameter, and getting a return value. This operation is trivial in C++, but a lot happens at the assembly level. +Let's look at a very simple example—calling a function, passing a parameter, and getting a return value. This operation is nothing in C++, but a lot happens at the assembly level. ```cpp // simple_call.cpp @@ -311,13 +311,13 @@ int main() { } ``` -Compile it and look at the assembly output (I'll describe my environment later): +Compile and look at the assembly output (I'll discuss my environment later): ```bash g++ -O0 -S simple_call.cpp -o simple_call.s ``` -`-O0` disables all optimizations, because with optimizations enabled the compiler will fold the entire thing into a constant, and we won't be able to see the function call process. 
Open `simple_call.s`, and you'll see something like this (I've extracted the key parts, AT&T syntax): +`-O0` turns off all optimizations, because with optimizations on, the compiler will fold the whole thing into a constant, and we won't see the function call process. Open `simple_call.s`, and you will see something like this (I've captured the key part, AT&T syntax): ```asm add(int, int): @@ -344,11 +344,11 @@ main: ret ``` -For just one `add(3, 4)`, at the assembly level you need to worry about: how the stack frame is set up, which register the parameter is passed through (the x86-64 System V calling convention uses rdi/rsi/rdx/rcx/r8/r9 for the first six integer arguments), where the return value is placed, and how the stack is restored after the call. In C++, writing one line of code handles all of this — the compiler does it all for you. +Just for one `add(3, 4)`, at the assembly level you have to care about: how the stack frame is built, which register the parameter is passed through (x86-64 System V calling convention is rdi/rsi/rdx/rcx/r8/r9 for the first six integer arguments), where the return value is placed, and how the stack is restored after the call. In C++, writing one line of code handles all this; the compiler does it all for you. -## Going Further: When the Parameter Isn't a Simple Integer +## Going Further: When Parameters Aren't Simple Integers -The example above is too simple, so let's try passing a string. This involves pointers and memory layout. +The example above is too simple. Let's try passing a string. This involves pointers, memory layout, and such. ```cpp // string_call.cpp @@ -376,53 +376,53 @@ int main() { } ``` -This C++ code looks straightforward. But to write this logic in assembly by hand, you'd have to calculate the address offsets of `src` and `dst` yourself, handle the loop counter yourself, determine character ranges yourself, and append the null terminator yourself. 
And the most fatal part — if you miscalculate an offset, the program won't tell you "you have an array out-of-bounds error." It will either silently corrupt other data or simply crash with a segfault. +This C++ code looks straightforward. But to write this logic in assembly by hand, you have to calculate address offsets for `src` and `dst` yourself, handle loop counters yourself, judge character ranges yourself, and append the null terminator yourself. And the most deadly thing is—if you calculate an offset wrong, the program won't tell you "your array access is out of bounds"; it will either silently corrupt other data or just segfault and crash. -So looking at these C++ designs again, you get a moment of sudden clarity: +So looking at these designs in C++ again, you get an epiphany: -**Why do references exist?** Because passing pointers is too error-prone — null pointers, dangling pointers, miscalculated offsets. Semantically, a reference means "this thing definitely points to a valid object," and the compiler helps you hold that baseline. +**References** Why do they exist? Because passing pointers is too error-prone: null pointers, dangling pointers, miscalculating offsets. References semantically mean "this thing definitely points to a valid object", and the compiler helps you guard this bottom line. -**Why does `std::string` exist?** Because raw character arrays with manual length management are a breeding ground for the kind of disaster described above. You don't have to use `std::string`, but then you have to guarantee that every single place correctly handles length, null terminators, copying, and destruction. +**`std::string`** Why does it exist? Because bare char arrays plus manual length management are the breeding ground for the disaster above. You don't have to use `std::string`, but then you have to guarantee that every single place correctly handles length, terminators, copying, and destruction.
-**Why was `std::string_view` added in C++17?** Because sometimes you just want to read a string without copying it, but passing a `const std::string&` to a `const char*` triggers implicit construction of a `std::string` temporary object. `string_view` is a lightweight "look but don't touch" view — under the hood it's just a pointer plus a length, but its semantics are much clearer than a raw `const char*` + `size_t`. +**`std::string_view`** Why did C++17 add it? Because sometimes you just want to read a string without copying, but passing a `const char*` to a `const std::string&` parameter triggers an implicit `std::string` temporary object construction. `string_view` is a lightweight "I look but don't touch" view; underneath it's just a pointer plus a length, but the semantics are much clearer than bare `const char*` + `size_t`. -If you've never written assembly or been tortured by pointers and memory layout, you might think these things are "unnecessary." But if you have been tortured by them, you think "thank goodness someone figured this out for us." +If you haven't written assembly and haven't been tortured by pointers and memory layout, you might think these are "gilding the lily". But if you have been tortured, you think "thank god someone figured this out for me".
-## Environment Notes +## Environment Description The environment for running these examples is as follows, for easy reproduction: - Environment: Arch Linux WSL, GCC 16.1.1 -- Assembly syntax: GCC's default AT&T syntax (the one where operand order is reversed compared to Intel syntax, `%rax` instead of `rax`, `movq 源, 目的` instead of `mov 目的, 源`) +- Assembly syntax: GCC's default AT&T syntax (the one where operand order is reversed from Intel syntax, `%rax` instead of `rax`, `movq src, dst` instead of `mov dst, src`) - If you want to see Intel syntax, just add the `-masm=intel` parameter: `g++ -O0 -S -masm=intel simple_call.cpp` ## Why Someone Would Write an IRC Client -The speaker mentioned that he later switched to an Archimedes computer, with an ARM processor, and there was no ready-made IRC client, so he wrote one himself. +The speaker mentioned he later switched to an Archimedes computer, with an ARM processor, and there was no ready-made IRC client, so he wrote one himself. -This mindset of "I need a tool, there's no ready-made one available, so I'll build one myself" is very common in practical programming learning. Because only when you really need to "build something" do you encounter problems that tutorials won't tell you about: `std::getline` behaving inconsistently in certain terminals; `std::ofstream` handling newlines differently across platforms; using `std::string` to store Chinese characters, where `length()` returns the number of bytes, not characters. If you're just following a tutorial typing "Hello World," you'll never run into these things. But when you really want to write "something that works," they all pop up. The 15-year-old who wrote the IRC client in the talk was the same way. He didn't learn all the network programming knowledge first and then start coding. He thought, "I want to get on IRC, but I don't have a client, so I'll write one." Knowledge doesn't come from textbooks — it grows from the desire to "do this thing."
+This mindset of "I need a tool, but there isn't one, so I'll build one" is very common in actual programming learning. Because when you really need to "build something", you encounter problems tutorials won't tell you about: `std::getline` behaves inconsistently under certain terminals; `std::ofstream` handles newlines differently on different platforms; using `std::string` to store Chinese, `length()` returns bytes not characters. If you just follow tutorials typing "Hello World", you'll never hit these. But when you really want to write "something that works", they all pop up. The 15-year-old who wrote the IRC client in the talk was the same. He didn't learn all network programming knowledge before starting; he thought "I want to get on IRC, but I don't have a client, so I'll write one". Knowledge doesn't come from textbooks; it grows from the desire of "I want to do this". -## From "Hand-Writing Everything" to "Leveraging Abstractions" +## From "Hand-Coding Everything" to "Leveraging Abstractions" -C++ is essentially a language that "lets you choose which level to work at." +C++ is essentially a language that "lets you choose which level to work at". -Want to control memory manually? Go ahead — pointers, `new`/`delete`, placement new, and memory alignment attributes are all wide open to you. Want the compiler to manage it for you? Go ahead — smart pointers, RAII, containers, `std::string`, no need to worry about deallocation. Want to compute things at compile time? Go ahead — `constexpr`, templates, and concepts let you shift runtime overhead to compile time. Want to write generic code? Go ahead — templates let you write one piece of code for various types, and concepts let you check type constraints at compile time. +Want to control memory manually? You can—pointers, `new`/`delete`, placement new, memory alignment attributes, all open to you. Want the compiler to manage it for you? 
You can—smart pointers, RAII, containers, `std::string`, with no need to worry about deallocation. Want to compute things at compile time? You can—`constexpr`, templates, and concepts let you move runtime overhead to compile time. Want to write generic code? You can—templates let you write one piece of code for many types, and concepts let you check type constraints at compile time. -These levels don't replace each other; they can be mixed. In the same program, you can use raw pointers for high-performance memory operations at the low level, and `std::vector` and `std::string` for safe data management at the high level. This kind of flexibility was unimaginable in the pure assembly era — back then there was only one level: "do everything yourself." +These levels aren't mutually exclusive; they can be mixed. In the same program, you can use raw pointers at the low level for high-performance memory operations, and `std::vector` and `std::string` at the high level for safe data management. This flexibility was unimaginable in the pure assembly era—back then there was only one level: "do everything yourself". -This explains C++'s design philosophy — "you don't pay for what you don't use." Because the origin of this language was a group of people who had been tortured enough by assembly and wanted a language that "could control the low level without hand-writing every low-level detail." It didn't fall from the sky — it was forced into existence by need. Once you connect this historical thread with the language design, many previously "baffling" designs suddenly make perfect sense. +This explains C++'s design philosophy—"you don't pay for what you don't use". The language was created by people who had been tortured enough by assembly and wanted a language that "could control the low level but didn't require hand-writing every low-level detail". It didn't fall from the sky; it was forced into existence by need.
Connecting this history with language design, many designs that previously seemed "baffling" suddenly become logical. --- -# From "Assembly Is the Only Solution" to "The Compiler Can Actually Do Work" +# From "Assembly is the Only Solution" to "The Compiler Can Actually Do the Work" -The talk mentioned the experience of "every time you switch computers, it's a different OS and a different architecture." After the MUD was banned by the admin and he was forced to switch machines, what did that mean in that era? It meant that your hand-written assembly code couldn't run a single line on a completely different CPU. The reason for writing the MUD in C instead of assembly was very pragmatic — rewriting assembly every time you switched machines was simply not feasible. Although C compilers on different machines in that era might themselves behave differently, C was still vastly superior to assembly because the benefits were too great. In his words, "rewriting it in assembly was simply not feasible" — this isn't some profound software engineering theory, it's the instinctive choice after being beaten down by reality. +The talk mentioned the experience of "every time I switch computers, it's a different OS and architecture". Back when the MUD was banned by the admin and he was forced to switch machines, what did that mean in that era? It meant your hand-written assembly code wouldn't run a single line on a completely different CPU. Writing the MUD in C instead of assembly was for a very simple reason—rewriting assembly every time you switched machines was simply impossible. Although C compilers on different machines in that era might behave differently, C was still way better than assembly because the benefits were huge. In his words, "rewriting in assembly is simply impossible"—this isn't some high software engineering theory, just an instinctive choice after being beaten by reality. 
-## Hands-on Verification: How Much Difference in Cross-Platform Cost Between Assembly and C for the Same Logic? +## Hands-on Verification: How Much Difference is There in Cross-Platform Costs Between Assembly and C for the Same Logic? -Let's write a minimal example to feel this difference. Suppose we want to implement a feature: reverse data in a block of memory byte by byte. This operation is actually quite common in game development, for example when handling cross-platform little-endian/big-endian data. +Let's write a minimal example to feel this difference. Suppose we want to implement a feature: reverse data in a segment of memory by byte. This operation is actually common in game development, for example, handling cross-platform little-endian/big-endian data. -First, using the pure assembly approach (taking x86_64 as an example, with GCC inline assembly): +First, let's write it using pure assembly thinking (taking x86_64 as an example, using GCC inline assembly): ```cpp // reverse_asm.cpp @@ -476,7 +476,7 @@ int main() { } ``` -The inline assembly above has a classic register conflict error — `rdx` is used simultaneously as a pointer and temporary storage. This is the most typical pitfall in hand-written assembly. Even if you fix this bug, this code can only compile in an x86_64 + System V ABI environment. Want to run it on ARM? Sorry, the instruction set is completely different, the register names are different, and the calling convention is different — it's like starting from scratch. +The inline assembly above has a classic register conflict error—`rdx` is used as both a pointer and temporary storage, which is the most typical pitfall of hand-written assembly. Even if you fix this bug, this code can only compile in an x86_64 + System V ABI environment. If you want to run it on ARM? Sorry, the instruction set is completely different, register names are different, and the calling convention is different—start writing from scratch. 
Now let's write the same logic in pure C++: @@ -522,13 +522,13 @@ int main() { } ``` -This C++ code looks too simple — what's there to compare? But that's exactly the key point — choosing C over assembly isn't because C can write more complex algorithms, but because for this kind of "simple logic," when switching platforms, the C version only needs to be recompiled, while the assembly version needs to be rewritten. When a project has hundreds of these "simple logics," this gap is the fundamental difference between "portable" and "not portable." +This C++ code looks too simple, what is there to compare? But the key point is here—choosing C over assembly isn't because C can write more complex algorithms, but because this "simple logic" only needs recompiling when switching platforms, whereas the assembly version needs rewriting. When a project has hundreds of these "simple logics", this gap is the fundamental difference between "portable" and "not portable". -## In the 90s, Compilers Weren't Good Enough, So You Had to Write Assembly by Hand — But It's 2026 Now +## Compilers in the 90s Were Bad, So You Had to Write Assembly by Hand—But Now It's 2026 -The talk mentioned a crucial piece of historical context: in the 90s and early 2000s, compilers weren't smart enough. CPUs had many special instructions for games (like the PS2's VU instructions, or the Dreamcast's SH4 extensions), and compilers had no idea how to generate these instructions, so you had to write assembly by hand. This logic still holds today, just in different forms. For example, writing NEON instructions on ARM for SIMD acceleration, or writing GPU kernels with CUDA, is essentially "the compiler (still) can't automatically generate optimal code for you, so you have to specify it manually." The difference is that these scenarios are far fewer today than back then, and compilers are improving rapidly. 
+The talk mentioned a very key historical background: in the 90s and early 2000s, compilers weren't smart enough. CPUs had many special instructions for games (like PS2's VU instructions, Dreamcast's SH4 extensions), and compilers didn't know how to generate these instructions at all, so you had to write assembly by hand. This logic still holds today, just the form has changed. For example, writing NEON instructions on ARM for SIMD acceleration, or writing GPU kernels in CUDA, is essentially "the compiler (still) can't automatically generate optimal code for you, so you have to specify it manually". The difference is that these scenarios are much rarer today than back then, and compilers are improving rapidly. -Let's look at a comparison experiment: the same matrix multiplication, run with both a pure C++ loop and hand-written AVX2 inline assembly: +Let's look at a comparison experiment, the same matrix multiplication, running with pure C++ loops versus hand-written AVX2 inline assembly: ```cpp // matmul_test.cpp @@ -615,10 +615,10 @@ int main() { } ``` -On an x86_64 machine (GCC 16.1, `-O3 -mavx2 -mfma`), the results are roughly: the scalar version around 15ms, the manual AVX2/FMA version around 3ms, with a speedup of about 5x. But here's the key: if the scalar version is also compiled with `-O3 -mavx2 -mfma`, GCC's auto-vectorization can optimize it to about 4ms. In other words, after all that effort writing AVX2/FMA intrinsics by hand, it was only about 25% faster than what the compiler generated automatically. +On an x86_64 machine (GCC 16.1, `-O3 -mavx2 -mfma`), the result is roughly: scalar version about 15ms, AVX2/FMA manual version about 3ms, speedup about 5x. But the key is, if the scalar version is also compiled with `-O3 -mavx2 -mfma`, GCC's auto-vectorization can optimize it to about 4ms. That is, hand-writing AVX2/FMA intrinsics for a long time only yielded about a 25% speedup over the compiler's auto-generated code. 
-::: details Actual verification results (Arch Linux WSL, GCC 16.1.1, -O3 -mavx2 -mfma) -In the verification environment, because GCC 16.1's auto-vectorization capability is already very strong, the scalar version was automatically optimized by the compiler to near the level of manual AVX2/FMA, with an actual speedup of only about 1.16x: +::: details Actual Verification Results (Arch Linux WSL, GCC 16.1.1, -O3 -mavx2 -mfma) +In the verification environment, due to GCC 16.1's strong auto-vectorization capabilities, the scalar version was automatically optimized by the compiler to close to the manual AVX2/FMA level, with an actual speedup of only about 1.16x: ```text scalar: 1.09 ms @@ -627,16 +627,16 @@ speedup: 1.16x max_diff: 0.000000e+00 ``` -This further reinforces the article's core argument: modern compilers' auto-vectorization is getting stronger and stronger, and the benefits of hand-writing SIMD are shrinking. Specific numbers vary by hardware and compiler version, but the trend is consistent. +This further confirms the article's core point: modern compilers' auto-vectorization is getting stronger, and the benefits of hand-writing SIMD are shrinking. Specific numbers vary by hardware and compiler version, but the trend is consistent. -Verification code: `code/volumn_codes/vol10/cppcon/2025/02-some-assembly-required/02-00-matmul-test.cpp` +Verification code: [02-00-matmul-test.cpp](https://github.com/Awesome-Embedded-Learning-Studio/Tutorial_AwesomeModernCPP/blob/main/code/volumn_codes/vol10/cppcon/2025/02-some-assembly-required/02-00-matmul-test.cpp) ::: -This is the difference between 2026 and the 90s. In the 90s, compilers had no idea what SIMD was, and hand-written assembly might be 10x faster. Today, compilers are already quite smart, the benefits of hand-writing are shrinking, but the costs (readability, maintainability, portability) remain just as high. +This is the difference between 2026 and the 90s. 
In the 90s, compilers had no idea what SIMD was, and hand-writing assembly might be 10x faster; today, compilers are quite smart, and the benefits of hand-writing are getting smaller, but the cost (readability, maintainability, portability) remains huge. -## The Tools Change, But the Pattern of "Being Driven to Learn by Reality" Never Does +## Tools Change, But the "Learning Driven by Reality" Mode Has Never Changed -Returning to the talk's core thread: from assembly to C, from C to C++, none of these steps happened because "the new language is cooler," but because "the old approach couldn't hold up under new constraints." C was chosen because of the need for cross-platform portability. C++ was embraced because it turned out C could do far more than just serve as a "fancy macro assembler." From this historical thread, we can draw a simple realization: **the choice of tool depends on what the current biggest pain point is.** The pain point was "having to rewrite everything every time you switch machines," so C was chosen. Later, the pain point became "wanting to do more complex things but C was too cumbersome to express them in," so C++ was embraced. The tools change, but the pattern of "being driven to learn by reality" never does. +Returning to the core thread of the talk: from assembly to C, from C to C++, every step wasn't because "the new language is cooler", but because "the old solution couldn't hold up under new constraints". Choosing C was for cross-platform compatibility. Accepting C++ was discovering that C could do much more than just "macro assembler" work. From this historical thread, we get a simple realization: **the choice of tool depends on what the current biggest pain point is**. The pain point was "rewriting every time I switch machines", so we chose C. Later the pain point became "wanting to do more complex things but C is too hard to express", so we accepted C++. 
Tools change, but the mode of "being driven to learn by reality" has never changed. , we quickly find that assembly can actually be "half-read, half-guessed." We don't need to know how to write it to get the gist of it. +Faced with a screen full of ``mov``, ``add``, and ``jmp`` paired with a bunch of unintelligible register names, a beginner's first reaction is often to close the tab. When a template errors out, we can at least search Stack Overflow, but assembly output looks like gibberish—it is hard to know where to start. However, with some targeted experiments using Compiler Explorer, we discover that assembly can actually be understood by "half-reading, half-guessing"—we don't need to truly know how to write it. -## Setting the Stage +## Clarifying the Environment First -All the experiments below were done on Compiler Explorer (godbolt.org). For compilers, we used GCC 16.1.1 for x86-64, the aarch64 version of GCC 16.1.1 for ARM64, and the riscv64 version of GCC 16.1.1 for RISC-V. The operating system is consistently set to Linux, because the calling convention on Windows is different, which leads to different assembly output—we'll discuss this in detail later. We primarily look at the ``-O2`` optimization level, occasionally switching to ``-O0`` for comparison, and we'll explain why later. +All experiments below are performed on Compiler Explorer (godbolt.org). Regarding compilers, x86-64 uses GCC 16.1.1, ARM64 uses the aarch64 version of GCC 16.1.1, and RISC-V uses the riscv64 version of GCC 16.1.1. The operating system is uniformly set to Linux because the calling conventions on Windows are different, which leads to variations in assembly output—this will be discussed in detail later. The optimization level primarily focuses on ``-O2``, occasionally switching to ``-O0`` for comparison, the reasons for which will be explained later. 
-## Starting with the Simplest Possible Function +## Start with the Simplest Function -To understand what assembly actually looks like across different architectures, we start with the most basic ``square`` function—taking an input integer, multiplying it by itself, and returning the result. The more plain the function, the better it is for observing compiler behavior: the logic is simple, the assembly is short, and the purpose of every single instruction is obvious. +To understand what assembly actually looks like under different architectures, we start with the simplest ``square`` function—taking an input integer, multiplying it by itself, and returning it. The simpler the function, the more suitable it is for observing compiler behavior, because the logic is simple and the assembly is short, making the role of each instruction clear at a glance. -```cpp +````cpp int square(int x) { return x * x; } -``` +```` -Intuitively, regardless of the CPU architecture, since they all do the exact same thing, the compiled assembly should look roughly the same. But when we place the three architectures side by side in Compiler Explorer, we find they look completely different—the instruction formats, register names, and even the implementation of multiplication are all different. But upon closer inspection, a key pattern emerges: although they look different, the skeleton is exactly the same—fetch the parameter from somewhere, perform the operation, and put the result in an agreed-upon location to return. Once we understand this skeleton, reading assembly is no longer intimidating. +Intuitively, regardless of the CPU architecture, since the same task is being performed, the compiled assembly should be roughly similar. However, when we place the three architectures side-by-side in Compiler Explorer, we find they look completely different—instruction formats, register naming, and even the implementation of multiplication vary. 
But upon closer observation, a key pattern emerges: although their "appearances" differ, their skeleton is actually the same—fetch parameters from somewhere, perform an operation, and place the result in an agreed-upon location for return. Once we understand this skeleton, reading assembly is no longer intimidating. ## The x86-64 Version -Let's look at x86-64 first, since most development machines run this architecture. Under ``-O2`` optimization, GCC generates the following code: +Let's look at x86-64 first, as most development machines run this architecture. Under ``-O2`` optimization, GCC generates the following code: -```asm +````asm square(int): imul edi, edi mov eax, edi ret -``` +```` -Seeing this code for the first time might raise a question: shouldn't the parameter be on the stack? Why is it being read directly from ``edi``? This is dictated by the System V AMD64 ABI (the calling convention for x86-64 on Linux)—the first few integer parameters of a function are passed via registers, with the first parameter in ``edi`` and the return value in ``eax``. So the meaning of these three instructions is quite clear: ``imul edi, edi`` is the two-operand multiplication form in x86— the left operand is both the source and the destination, multiplying the value in ``edi`` by itself and writing the result back to ``edi``. Then it moves the result to ``eax`` as the return value, and finally ``ret`` returns. +Seeing this code for the first time might raise a question: aren't arguments supposed to be on the stack? Why are they fetched directly from ``edi``? This is stipulated by the System V AMD64 ABI (the calling convention for x86-64 on Linux)—the first few integer arguments of a function are passed via registers, with the first argument in ``edi`` and the return value in ``eax``. So the meaning of these three instructions is clear: ``imul edi, edi`` is the two-operand multiplication form of x86—the left operand is both source and destination. 
It takes the value in ``edi``, multiplies it by itself, writes the result back to ``edi``, moves it to ``eax`` as the return value, and finally ``ret`` returns. -A natural follow-up question is: why not let the result of ``imul`` land directly in ``eax``, avoiding the extra ``mov``? In reality, the two-operand form of ``imul`` writes the result back to the first operand (which is ``edi``), and the calling convention requires the return value to be in ``eax``, so that ``mov`` is unavoidable. If we had the compiler use ``imul eax, edi`` (multiplying ``edi`` into ``eax``), we could indeed skip the ``mov``, but that would require moving ``edi`` into ``eax`` before doing the multiplication. The instruction count would be the same, and GCC chose the former strategy. +A natural question is: why not let the result of ``imul`` land directly in ``eax``, avoiding the extra ``mov``? In reality, the two-operand form of ``imul`` writes the result back to the first operand (i.e., ``edi``), while the calling convention requires the return value to be in ``eax``, so this ``mov`` is unavoidable. If we let the compiler use ``imul eax, edi`` (multiplying ``edi`` into ``eax``), we could save the ``mov``, but that would require moving ``edi`` to ``eax`` first before multiplying, resulting in the same instruction count. GCC chose the former strategy. -Another easy pitfall: if we compile the same code on Windows, the parameter would be placed in ``ecx`` instead of ``edi``, though the return value is still in ``eax``. This is one of the biggest differences between Windows x64 and Linux x86-64—different calling conventions. If we read an assembly listing on Linux and then compile it with MSVC on Windows, we'll find all the registers have changed. We didn't read it wrong; it's just a difference in calling conventions. So when reading assembly, the first step is to confirm the platform and calling convention. This saves a lot of confusion. 
+Another easy pitfall: if you compile the same code on Windows, the arguments will be in ``ecx`` instead of ``edi``, though the return value is still in ``eax``. This is one of the biggest differences between Windows x64 and Linux x86-64—different calling conventions. If you understand an assembly snippet on Linux and then compile it with MSVC on Windows, you will find the registers have completely changed. This isn't a mistake; it's a difference in calling conventions. So, when reading assembly, the first step is to confirm the platform and calling convention—this saves a lot of confusion. ## The ARM64 Version -Next, let's look at ARM64, also known as AArch64. For the same function, GCC aarch64 at ``-O2`` gives this output: +Next, let's look at ARM64, also known as AArch64. For the same function, GCC aarch64 gives the following output under ``-O2``: -```asm +````asm square(int): mul w0, w0, w0 ret -``` +```` -This code has only two instructions, even cleaner than x86-64. ``w0`` is the register in ARM64 that holds the first integer parameter and the return value (the 32-bit version; the 64-bit version is called ``x0``). Because the parameter is ``int``, 32 bits are sufficient, so the compiler uses the ``w`` register instead of the ``x`` register. The ``mul`` instruction directly puts the result of ``w0`` multiplied by ``w0`` back into ``w0``, then returns. There's no redundant ``mov``—ARM64's instruction design allows the result to be flexibly placed in any of the operand positions. +This code consists of only two instructions, even cleaner than x86-64. ``w0`` is the register in ARM64 that holds the first integer argument and return value (32-bit version; the 64-bit version is called ``x0``). Since the argument is ``int``, 32 bits are sufficient, so the compiler uses the ``w`` register instead of the ``x`` register. The ``mul`` instruction directly places the result of ``w0`` multiplied by ``w0`` back into ``w0``, then returns—no redundant ``mov``. 
ARM64 instruction design allows the result to be flexibly placed in any operand position. -It's worth noting that ARM64's register naming is much more regular than x86-64's. On the x86-64 side, ``eax``, ``edi``, and ``rsi`` are all different, requiring rote memorization of each register's special purpose. In ARM64, it's simply ``x0`` through ``x30`` plus a stack pointer ``sp``, with the 32-bit versions uniformly getting a ``w`` prefix. It's very neat. This regular naming scheme lowers the barrier to entry—there's no need to memorize a bunch of legacy names; we just need to know that ``x0``/``w0`` are for parameters and return values. +It is worth noting that ARM64 register naming is much more regular than x86-64. In x86-64, ``eax``, ``edi``, and ``rsi`` are all different, requiring rote memorization of each register's specific purpose. In ARM64, it is simply ``x0`` to ``x30`` plus a stack pointer ``sp``, with 32-bit versions uniformly adding a ``w`` prefix. It is very neat. This regular naming lowers the barrier to reading—no need to remember a pile of legacy names, just knowing that ``x0``/``w0`` are for arguments and return values is enough. ## The RISC-V Version -Finally, we have RISC-V (the V stands for the Roman numeral five, so it's pronounced "risk-five"). Its assembly looks like this: +Finally, there is RISC-V (V represents the Roman numeral five, so it is pronounced "Risk-Five"). Its assembly looks like this: -```asm +````asm square(int): mul a0, a0, a0 ret -``` +```` -Wait, this is almost identical to ARM64? Indeed it is. ``a0`` in RISC-V is the register that holds the first parameter and the return value (``a`` stands for argument). ``mul`` does the multiplication, the result goes back into ``a0``, and then it returns. Two instructions, clean and simple. +Wait, isn't this almost identical to ARM64? Indeed it is. 
``a0`` in RISC-V is the register holding the first argument and return value (``a`` stands for argument), ``mul`` performs the multiplication, the result is placed back in ``a0``, and then it returns. Two instructions, clean and crisp. -As the youngest instruction set architecture, RISC-V's design drew on the lessons of its predecessors. Its integer registers are simply called ``x0`` through ``x31``, and the ABI assigns them aliases: ``a0``-``a7`` are argument/return value registers, ``t0``-``t6`` are temporary registers, and ``s0``-``s11`` are callee-saved registers. What we see in the assembly are the aliases, but fundamentally they are just ``x`` numbers. This design of "unified underlying numbers + upper-level semantic aliases" is much easier to understand than x86-64's approach where every register has a unique, quirky name. +As the youngest instruction set architecture, RISC-V's design draws on past experience. Its integer registers are simply named ``x0`` to ``x31``, and the ABI assigns them aliases: ``a0``-``a7`` are argument/return value registers, ``t0``-``t6`` are temporary registers, and ``s0``-``s11`` are callee-saved registers. What we see in assembly are the aliases, but fundamentally they are ``x`` numbers. This design of "unified underlying numbering + upper-level semantic aliases" is much easier to understand than the x86-64 approach where every register has a unique name. -## Looking Back: They're Actually Saying the Same Thing +## Looking Back: They Are Actually Saying the Same Thing -Placing the three architectures side by side reveals an interesting phenomenon: although the instruction names, register names, and instruction counts are all different, the "semantics" they express are exactly the same—"fetch parameter → multiply → store return value → return." We don't need to recognize every single instruction to read assembly. 
As long as we grasp which registers the data flows between and what operation is being performed, we can roughly guess what it's doing. +Placing the three architectures side-by-side reveals an interesting phenomenon: although instruction names, register names, and instruction counts differ, the "semantics" they express are exactly the same—"fetch argument → multiply → place return value → return". Reading assembly doesn't require recognizing every instruction; as long as we grasp which registers data flows between and what operation is performed, we can roughly guess what it is doing. -It's like reading a poem written in a language we're not entirely familiar with. We don't need to look up every single word; just by observing word placement and repetition patterns, we can feel its rhythm and general meaning. Assembly is the same way—seeing ``mul`` or ``imul`` tells us a multiplication is happening, seeing ``ret`` tells us the function is about to return, and seeing data move from one register to another tells us something is being passed along. This ability to "half-read, half-guess" is far more practical than rote memorization of every instruction's exact semantics. +It is like reading a poem written in an unfamiliar language. You don't need to look up every word; you can feel its rhythm and gist through the position of words and repetitive patterns. Assembly is similar—seeing ``mul`` or ``imul`` tells you a multiplication is happening; seeing ``ret`` tells you the function is about to return; seeing data move from one register to another tells you something is being passed. This ability to "half-read, half-guess" is far more practical than rote memorization of the exact semantics of every instruction. -## A Crucial Reminder: Optimization Levels Radically Change What You See +## A Key Reminder: Optimization Levels Radically Change What You See -Everything shown above was output at ``-O2``. 
If we turn off optimization (``-O0``), we see a completely different picture—massive amounts of ``push``, ``pop``, and memory reads/writes. Parameters get stored to the stack and read back, and intermediate results get repeatedly written to memory. The reason ``-O0`` assembly is so verbose is that ``-O0`` aims to let debuggers precisely map every C++ statement to assembly instructions. Therefore, it performs no optimization, and all variables are dutifully placed in memory. ``-O2`` is the code the compiler "really" wants to generate. If the goal is to understand the compiler's optimization behavior and the actual performance of the code, we must look at the output from ``-O2`` or higher. ``-O0`` will only lead us astray. +The outputs shown above are all under ``-O2``. If optimization is turned off (``-O0``), the scene is completely different—massive amounts of ``push``, ``pop``, and memory reads/writes. Arguments are stored to the stack and read back, and intermediate results are repeatedly written to memory. ``-O0`` assembly is so verbose because ``-O0`` aims to allow the debugger to precisely map every C++ statement to assembly instructions, so it performs no optimization, keeping all variables obediently in memory. ``-O2`` is the code the compiler "truly" wants to generate. If the goal is to understand compiler optimization behavior and actual code performance, we must look at ``-O2`` or higher optimization levels; ``-O0`` will only lead us astray. -At this point, we've walked through the assembly of the simplest function across three mainstream architectures. Even though it's just a ``square`` function, it helped us establish an important cognitive framework: knowing where parameters come from, where results go, and which instruction performs the core operation. With this framework in place, we won't be completely lost when looking at more complex function assembly later. Now, let's take this foundation and look at some more realistic scenarios. 
+At this point, we have reviewed the assembly of the simplest functions across three mainstream architectures. Although it is just a ``square`` function, it establishes an important cognitive framework: knowing where parameters come from, where results go, and in which instruction the core computation is completed. With this framework, we will not be completely at a loss when looking at more complex function assembly later. Next, with this foundation in hand, let's look at some more realistic scenarios. --- -# What Exactly Is the Relationship Between Machine Code and Assembly? +# What is the Relationship Between Machine Code and Assembly? -Many people use "machine code" and "assembly code" interchangeably, figuring they're both just incomprehensible gibberish anyway. But if we look closely at objdump output, the column of ``0f af ff`` on the left and the column of ``imul edi, edi`` on the right actually have a very straightforward one-to-one mapping relationship—something we rarely stop to think about. +Many people use "machine code" and "assembly code" interchangeably, thinking they are just unintelligible stuff. But looking closely at objdump output, the column of hex on the left (``0f af ff``) and the column of text on the right (``imul edi, edi``) actually has a very straightforward one-to-one mapping, though we rarely think about it seriously. -## Clarifying the Concepts: Machine Code Is for Machines, Assembly Is for Humans +## Clarify Concepts First: Machine Code is for Machines, Assembly is for Humans -That pile of hex numbers on the left—things like ``0f``, ``af``, and ``ff``—that's machine code. Essentially, it's a string of bytes in memory. The CPU reads these bytes directly and interprets them according to rules hardwired into the hardware: when it reads ``0f af``, it knows this is a multiplication instruction, and the subsequent bytes tell it where the operands are. The CPU doesn't know what ``imul`` means; it only understands numbers. 
+That pile of hexadecimal numbers on the left—``0f``, ``af``, ``ff``, etc.—is machine code. Essentially, it is a string of bytes in memory. The CPU reads these bytes directly and interprets them according to rules hardwired into the hardware: reading ``0f af`` tells it this is a multiplication instruction, and subsequent bytes tell it where the operands are. The CPU doesn't know what ``imul`` is; it only recognizes numbers. -The column of ``imul edi, edi`` on the right is the assembly code, the human-readable version. It has an almost strictly one-to-one mapping with machine code—one assembly instruction corresponds to a fixed-format sequence of machine code bytes. So we can "assemble" assembly code into machine code (which is what an assembler does), and we can "disassemble" machine code back into assembly code (which is what tools like objdump and IDA do). Of course, when disassembling back, the comments are gone, the variable names are gone, and semantic information like ``int x = n * n`` is completely lost. All that's left to see are cold, hard instructions. +The column of text on the right, ``imul edi, edi``, is assembly code, the version for humans. It has an almost one-to-one mapping with machine code—one assembly instruction corresponds to a fixed-format sequence of machine code bytes. Therefore, we can "assemble" assembly code into machine code (what an assembler does) and "disassemble" machine code back into assembly code (what tools like objdump and IDA do). Of course, when disassembling back, comments are lost, variable names are lost, and semantic information like ``int x = n * n`` is completely gone—only cold instructions remain. -But this two-way conversion path exists, and it's very direct. Assembly is not a "high-level language" that requires a compiler to do complex translation—it's almost just another way of writing machine code. +But this bidirectional conversion path exists and is very direct. 
Assembly is not a "high-level language" requiring a compiler to perform complex translation—it is almost just another way of writing machine code. -## Let's Write the Simplest Square Function and See What the Assembly Looks Like +## Write a Simple Square Function and See What the Assembly Looks Like -To figure out what's going on with registers, we start with the most plain square function: +To figure out the register situation, let's start with the most basic square function: -```cpp +````cpp // square.cpp int square(int n) { return n * n; } -``` +```` -Then we compile it into an object file with gcc, without linking, just to look at the assembly: +Then compile it with gcc into an object file, without linking, just to see the assembly: -```bash +````bash # 我的环境:Arch Linux WSL, x86-64, gcc 16.1.1 g++ -c -O0 square.cpp -o square.o objdump -d -M intel square.o -``` +```` -Adding ``-M intel`` is because AT&T syntax (where operands come after the instruction and have ``%`` prefixes) isn't very intuitive. Intel syntax at least has an operand order that matches our intuition. ``-O0`` turns off all optimization so the compiler won't rewrite the code in any way, letting us see the most raw translation result. +Adding ``-M intel`` is because AT&T syntax (operands at the end, with ``%`` prefixes) is not very intuitive, while Intel syntax at least has operand order consistent with intuition. ``-O0`` turns off all optimizations so the compiler doesn't rewrite the code, allowing us to see the most raw translation result. The output looks roughly like this (GCC 16, -O0): -```asm +````asm 0000000000000000 <_Z6squarei>: 0: 55 push rbp 1: 48 89 e5 mov rbp,rsp @@ -148,280 +148,280 @@ The output looks roughly like this (GCC 16, -O0): a: 0f af c0 imul eax,eax d: 5d pop rbp e: c3 ret -``` +```` -Our first reaction upon seeing this might be: wait, shouldn't the input parameter be "passed in" from somewhere? A C++ function has a parameter list, but assembly has no such thing. 
Where did the parameter go? +The first reaction to seeing this might be: wait, shouldn't input parameters be "passed in" from somewhere? C++ functions have parameter lists, but assembly has no such thing. Where did the parameters go? -## Registers Are the CPU's Built-in "Global Variables," But Their Use Has Rules +## Registers are the CPU's Built-in "Global Variables", But Their Use Has Rules -Inside the CPU, there is a small batch of extremely fast storage units called registers. We can think of them as "ultra-high-speed global variables"—they live directly inside the CPU, require no memory access, and have nearly zero-latency reads and writes. But unlike global variables, the number of registers is extremely limited. On x86-64, there are only a dozen or so general-purpose registers (RAX, RBX, RCX, RDX, RSI, RDI, R8-R15), so it's impossible to fit all data into them. +Inside the CPU, there is a small batch of extremely fast storage units called registers. We can understand them as a kind of "ultra-high-speed global variable"—directly inside the CPU, no memory access required, and read/write latency is nearly zero. But unlike global variables, the number of registers is extremely limited. In x86-64, there are only a dozen or so general-purpose registers (RAX, RBX, RCX, RDX, RSI, RDI, R8-R15), so it is impossible to stuff all data into them. -The key question is: who dictates which register does what? If compiler A thinks parameters should go in RAX, and compiler B thinks they should go in RDI, then code compiled by the two couldn't possibly call each other. We write a library, someone else writes a program, and because the register usage is inconsistent, the call fails. +The key question is: who dictates which register does what? If compiler A thinks arguments go in RAX, and compiler B thinks they go in RDI, then the code they compile cannot call each other. You write a library, someone else writes a program, and if register usage doesn't match, the call fails. 
-So there must be a set of "traffic rules" that everyone follows, so that code can interoperate. This set of rules is the ABI (Application Binary Interface). The ABI specifies many things, but the most fundamental rule for our purposes is: during a function call, which registers hold the parameters, which register holds the return value, which registers can be freely clobbered after the call, and which must be preserved exactly as they were. +Therefore, there must be a set of "traffic rules" that everyone follows for code to interoperate. This set of rules is the ABI (Application Binary Interface). The ABI specifies many things, one of the most basic being: when a function is called, which register holds arguments, which register holds the return value, and which registers can be freely modified after a call versus which must be restored to their original state. -Linux uses the System V AMD64 ABI, while Windows uses Microsoft's own x64 ABI. The two sets of rules are different. This is one of the reasons why Linux and Windows binaries can't be directly mixed (there are of course more reasons, but the difference in register conventions is the most direct layer). +Linux uses the System V AMD64 ABI, while Windows uses Microsoft's own x64 ABI. The two sets of rules are different. This is one of the reasons why binaries from Linux and Windows cannot be directly mixed (of course, there are more reasons, but the register convention difference is the most immediate layer). -## Parameters Come In via EDI, Results Must Go Out via EAX +## Parameters Enter via EDI, Results Must Exit via EAX -Returning to our square function. Under the System V ABI rules, the first integer parameter is placed in the RDI register. Note that we wrote RDI (64-bit), but our parameter is ``int``, which is only 32 bits. So in practice, we're using the lower 32 bits of RDI, which is EDI. The same logic applies to RAX/EAX: RAX is the 64-bit version, and EAX is the 32-bit version. 
+Returning to our square function. Under System V ABI rules, the first integer argument is placed in the RDI register. Note I wrote RDI (64-bit), but our parameter is ``int``, only 32 bits, so it actually uses the low 32 bits of RDI, which is EDI. The same applies to RAX/EAX; RAX is the 64-bit version, EAX is the 32-bit version. -So the moment the function is entered, the value of ``n`` is already in EDI. We don't need to "fetch" it from anywhere; it's already there. +So when the function starts, the value of ``n`` is already in EDI; you don't need to "fetch" it from somewhere, it is already there. -Then look at the instruction sequence: ``push rbp; mov rbp, rsp`` is the standard stack frame setup, and ``mov DWORD PTR [rbp-0x4], edi`` stores the parameter from EDI onto the stack—this is typical ``-O0`` behavior. The compiler performs no optimization, so all variables are dutifully placed in memory. Next, ``mov eax, DWORD PTR [rbp-0x4]`` reads it back from the stack into EAX, ``imul eax, eax`` squares it, ``pop rbp`` restores the stack frame, and finally ``ret`` returns. The verbosity of ``-O0`` perfectly illustrates why we recommended looking at ``-O2`` output earlier—three extra stack frame manipulation instructions end up drowning out the core logic. +Then look at the instruction sequence: ``push rbp; mov rbp, rsp`` is the standard stack frame setup, ``mov DWORD PTR [rbp-0x4], edi`` stores the parameter from EDI onto the stack—this is typical ``-O0`` behavior; the compiler performs no optimization and obediently places all variables in memory. Then ``mov eax, DWORD PTR [rbp-0x4]`` reads it back from the stack into EAX, ``imul eax, eax`` performs the square, ``pop rbp`` restores the stack frame, and finally ``ret`` returns. The verbosity of ``-O0`` precisely illustrates why we recommended looking at ``-O2`` output earlier—three extra stack frame instructions drown out the core logic. -Then ``imul eax, eax`` multiplies EAX by EAX, storing the result back into EAX. 
This is a very distinctive design feature of x86: most instructions only accept two operands, and the left operand is both the source and the destination. This is the same idea as ``a *= a`` in C++—read the value on the left, perform the operation with the value on the right, and write the result back to the left. It's a "destructive" operation; once it's done, the original value on the left is overwritten. If we need the original value later, we have to save it beforehand. +Next, ``imul eax, eax`` multiplies EAX by EAX, storing the result back in EAX. This is a distinctive design of x86: most instructions accept only two operands, and the left operand is both source and destination. This is the same meaning as ``a *= a`` in C++—read the value on the left, operate with the value on the right, and write back to the left. It is a "destructive" operation; after it is done, the original value on the left is overwritten. If the original value is needed later, it must be saved in advance. -Finally, ``ret`` returns, handing control back to the caller. At this point, EAX holds the squared result, and the caller knows to grab it from EAX—because that's what the ABI dictates. +Finally, ``ret`` is the return, handing control back to the caller. At this point, EAX holds the square result, and the caller knows to fetch it from EAX—because the ABI so stipulates. ## Register Names Are Not Arbitrary -Beginners seeing a bunch of names like RAX, EAX, AX, and AL might easily assume they are different registers. In reality, they are different "views" of the same physical register: RAX is the full 64 bits, EAX is the lower 32 bits, AX is the lower 16 bits, and AL is the lowest 8 bits. Writing data to EAX overwrites the upper 32 bits of RAX (zeroing them), while writing to AL only changes the lowest byte, leaving the remaining bits unaffected. +Beginners seeing RAX, EAX, AX, AL might think they are different registers. 
In reality, they are different "views" of the same physical register: RAX is the full 64 bits, EAX is the low 32 bits, AX is the low 16 bits, and AL is the lowest 8 bits. Writing to EAX overwrites the high 32 bits of RAX (zeroing them), while writing to AL only changes the lowest byte, leaving the rest unaffected. -This characteristic is particularly prone to causing confusion during debugging. While staring at the register window, we might notice that the value of RAX doesn't match EAX, and think the debugger is glitching. Actually, it's because a certain instruction only modified the lower 32 bits, and the upper 32 bits are dirty data left over from a previous operation. So when looking at registers, we must be clear about which "view" we are currently looking at. +This characteristic is particularly prone to causing confusion during debugging. Staring at the register window, you might notice that the value of RAX doesn't match AX or AL and suspect the debugger is broken, but actually, it is because a certain instruction only wrote the 16-bit or 8-bit view, and the upper bits are stale data left over from a previous operation (a write to EAX, as noted above, always zeroes the high 32 bits). So when looking at registers, be sure to clarify which "view" you are looking at. -At this point, the assembly face of the simplest C++ function on x86-64 is clear: parameters are passed in via registers (not the stack—at least the first few parameters aren't), computation happens between registers, and results are returned via registers. The whole process involves no memory access and is extremely fast. Of course, this is just the simplest case. With more parameters, local variables, and optimizations turned on, things get much more complicated, but the foundational framework remains the same. +At this point, the assembly face of a simple C++ function under x86-64 is clear: parameters are passed in via registers (not the stack, at least for the first few), computation is done between registers, and results are returned via registers.
The whole process involves no memory access and is extremely fast. Of course, this is the simplest case; with more parameters, local variables, and optimizations enabled, things get much more complex, but the basic framework remains this set. --- # Understanding Register Parameter Passing from a Single MOV Instruction: ARM and RISC-V Calling Conventions -In the previous section, we looked at that square function. After compilation, the core was just a single multiplication instruction. When the function returns, control goes back to the caller. The caller had previously stuffed the parameter into the EDI register (per the x86-64 calling convention), and now it expects to get the return value from the EAX register—this is the x86-64 rule: integer return values go through EAX (or RAX). So what that ``imul edi, edi`` does is very straightforward: it multiplies the value in EDI by itself, writes the result back to EDI, then moves it to EAX, and finally returns. The caller grabs it from EAX, and we're done. +The previous section discussed that square function. After compilation, the core is a single multiplication instruction. When the function returns, control is handed back to the caller. The caller previously stuffed the parameter into the EDI register (the x86-64 calling convention) and now expects to get the return value from the EAX register—this is the x86-64 rule: integer return values go via EAX (or RAX). So that ``imul edi, edi`` instruction does something very straightforward: multiply the value in EDI by itself, write the result back to EDI, then mov to EAX, and finally ret. The caller fetches it from EAX, done. -So the question arises: across different architectures, how big is the "felt" difference when doing the exact same thing? If we compile the same function under all three architectures and compare the assembly instruction by instruction, the differences are very pronounced. 
+So the question is: under different architectures, how big is the "perceptual" difference in doing the same thing? Compiling the same function under three architectures and comparing the assembly line by line reveals very obvious differences. -## The Cleanliness of ARM64 +## The Simplicity of ARM64 -Let's look at ARM64 (AArch64) first. Some people might think ARM assembly is roughly the same as x86, just with different instruction names. But actually opening the objdump output reveals that the differences far exceed expectations. +First, look at ARM64 (AArch64). Some might think ARM assembly is similar to x86, just with different instruction names. Actually opening objdump reveals differences far beyond expectations. -```cpp +````cpp // square.cpp —— 就这么个简单函数 int square(int value) { return value * value; } -``` +```` -Let's run it with a cross-compilation toolchain: +Run it with a cross-compilation toolchain: -```bash +````bash # ARM64 aarch64-linux-gnu-g++ -O2 -c square.cpp -o square_arm64.o aarch64-linux-gnu-objdump -d square_arm64.o -``` +```` -The output looks like this: +The output is like this: -```asm +````asm square: mul w0, w0, w0 ret -``` +```` -That's it. Two instructions, clean as a whistle. One particularly nice thing is that W0 is both the input and the output. In ARM's calling convention, W0 (32-bit) or X0 (64-bit) serves as both the carrier for the first parameter and the return value. So ``mul w0, w0, w0`` reads as "multiply w0 by w0, put the result back in w0." All three operands are the exact same register, which is visually extremely consistent. +That's it. Two instructions, clean and crisp. One particularly comfortable aspect is: W0 is both input and output. In ARM's calling convention, W0 (32-bit) or X0 (64-bit) serves as the carrier for the first argument and also for the return value. So ``mul w0, w0, w0`` reads as "multiply w0 by w0, put the result back in w0". All three operands are the same register; visually, it is extremely unified. 
Next, let's look at the machine code for these instructions. This reveals an important design difference. -```bash +````bash aarch64-linux-gnu-objdump -d -j .text square_arm64.o | grep mul # 0: 1b007c00 mul w0, w0, w0 -``` +```` ``1b007c00``, four bytes. Now look at that ``ret``: -```asm +````asm # 4: d65f03c0 ret -``` +```` -``d65f03c0``, also four bytes. Two instructions, both exactly four bytes. This means the instruction decoder's job is particularly simple: the instruction fetch stage simply grabs four bytes at a time, without needing to do any length determination. The elegance of this design becomes even more obvious when contrasted with x86. +``d65f03c0``, also four bytes. Two instructions, both exactly four bytes. This means the instruction decoder's job is very simple; the fetch stage fetches a fixed four bytes each time without any length judgment. This design is elegant, especially when contrasted with x86. -## x86's Variable-Length Instructions +## x86 Variable-Length Instructions -The same function, compiled for x86-64: +The same function compiled under x86-64: -```bash +````bash g++ -O2 -c square.cpp -o square_x64.o objdump -d square_x64.o -``` +```` -```asm +````asm square(int): 0: 0f af ff imul edi,edi 3: 89 f8 mov eax,edi 6: c3 ret -``` +```` -The focus here is on the byte length of the instructions: +The focus is on the byte length of the instructions: - ``imul`` instruction: ``0f af ff``, three bytes - ``mov`` instruction: ``89 f8``, two bytes - ``ret`` instruction: ``c3``, one byte -Three instructions, three different lengths: 3, 2, and 1. If we use a different multiplication variant, like ``imul eax, edi``, its machine code is ``0f af c7``, still three bytes, but with a different suffix than the imul above (``ff`` vs ``c7``) because the operand encoding is different. Change the scenario again—if the multiplier is an immediate value, the instruction length changes yet again. +Three instructions, three lengths: 3, 2, 1. 
Change the multiplication method, say ``imul eax, edi``, its machine code is ``0f af c7``, still three bytes, but the suffix differs from the imul above (``ff`` vs ``c7``) because the operand encoding is different. Change the scenario again, and if the multiplier is an immediate, the instruction length changes again. -"Variable-length instructions" aren't just a textbook concept. Counting bytes against a hex dump reveals that every time the CPU's front end fetches an instruction, it has to read the first few bytes to determine exactly how long that instruction is, and only then can it decide where the next instruction starts. x86 decoders are notoriously complex. To solve this problem, Intel packed massive amounts of pre-decode logic and micro-op caches into their CPUs—essentially using hardware brute force to compensate for the historical baggage of the instruction set design. +"Variable-length instructions" is not just a textbook concept. Counting bytes against a hex dump reveals that every time the CPU front-end fetches an instruction, it must read the first few bytes to judge how long the instruction actually is before it can decide where the next instruction starts. x86 decoders are notoriously complex; to solve this, Intel stuffed a large amount of pre-decoding logic and micro-op caches into the CPU, essentially using hardware brute force to compensate for the historical baggage of instruction set design. -## RISC-V's Fixed-Length Instructions +## RISC-V Fixed-Length Instructions -Now let's look at RISC-V (rv64gc): +Now look at RISC-V (rv64gc): -```bash +````bash riscv64-linux-gnu-g++ -O2 -c square.cpp -o square_rv64.o riscv64-linux-gnu-objdump -d square_rv64.o -``` +```` -```asm +````asm square: 0: 02b50533 mul a0, a0, a0 4: 8082 ret -``` +```` -Just like ARM, a0 is both the first parameter and the return value, and the ``mul a0, a0, a0`` semantics are completely identical. 
However, there's a detail here: the ``mul`` instruction is four bytes (``02b50533``), but the ``ret`` instruction is only two bytes (``8082``). RISC-V's base instructions are fixed-length four bytes, but it supports the 16-bit Compressed Instruction extension (RVC), so common instructions like ``ret`` get compressed into two bytes. This can be seen as a compromise between fixed-length and variable-length—still much more disciplined than x86's "completely unpredictable" variable length. +Like ARM, a0 is both the first argument and the return value, and ``mul a0, a0, a0`` semantics are identical. However, there is a detail: the ``mul`` instruction is four bytes (``02b50533``), but the ``ret`` instruction is only two bytes (``8082``). RISC-V base instructions are fixed four-byte, but it supports a 16-bit compressed instruction extension (RVC), so common instructions like ``ret`` are compressed into two bytes. This is a compromise between fixed-length and variable-length, much more disciplined than x86's "completely unpredictable" variability. -## Operand Count: Not All Instructions Are So Neat +## Number of Operands: Not All Instructions Are So Neat -At this point, we might think that instructions are just "opcode + a few operands"—pretty neat and tidy. But after looking through more assembly, we find that reality is far from this rosy. +At this point, you might think instructions are just "opcode + a few operands", quite neat. But looking through more assembly reveals that reality is far less pretty. -The ``mul`` and ``imul`` we saw above are typical three-operand instructions (destination + source1 + source2) or two-operand instructions (where the destination is also source1). But there are many instructions that don't play by these rules at all. Zero-operand instructions are the simplest, like ``ret`` and ``nop``, which don't need any extra information. Single-operand instructions are also common, like various jump instructions. 
We just saw double and triple operands. +The ``mul`` and ``imul`` seen above are typical three-operand instructions (destination + source1 + source2), or two-operand (destination is also source1). But many instructions don't follow the pattern at all. Zero-operand instructions are simplest, like ``ret`` and ``nop``, needing no extra information. Single-operand is also common, like various jump instructions. Two- and three-operand we just saw. -But what's truly confusing is "implicit operands." For example, x86 has an instruction ``rep stosb`` that "repeatedly writes the value of the AL register into the memory pointed to by RDI (or EDI), automatically incrementing RDI/EDI after each write, with the repeat count controlled by RCX (or ECX)." AL, RDI/EDI, RCX/ECX—not a single one of these three operands is visible in the instruction text. They are all implicit, hardcoded into the instruction definition. Anyone reading the assembly must remember which registers this instruction uses by default. The "operand count" of such an instruction is actually very hard to define. +What is truly confusing is "implicit operands". For example, in x86 there is a ``rep stosb`` instruction that functions to "write the value of the AL register repeatedly to the memory pointed to by RDI (or EDI), incrementing RDI/EDI after each write, with the repeat count controlled by RCX (or ECX)". AL, RDI/EDI, RCX/ECX—none of these three operands are visible in the instruction text; they are all implicit, hardcoded in the instruction definition. The person reading the assembly must remember which registers this instruction uses by default. The "number of operands" for such instructions is actually hard to define. ## Intel's Historical Baggage -The problem of implicit operands makes x86 a veritable "disaster zone." The reason isn't complicated: the x86 instruction set started with the 8086 in 1978 and evolved all the way to today's x86-64, spanning over 40 years. 
Each new generation of CPU had to add new things on top of the old instruction set, and it had to maintain backward compatibility—8086 machine code written in 1985 will still run perfectly on a CPU in 2026. This constraint sounds wonderful, but the price is that the instruction set became increasingly bloated and irregular. The encoding space for new instructions was already occupied by old instructions, so they could only use various prefix bytes to extend it, leading to increasingly complex decoding logic. +The implicit operand problem makes x86 a "hard-hit zone". The reason isn't complex: the x86 instruction set evolved from the 8086 in 1978 all the way to today's x86-64, spanning more than 40 years. Each new generation of CPUs had to add new things on top of the old instruction set while maintaining backward compatibility—8086 machine code written in 1985 will still run on a CPU in 2026. This constraint sounds wonderful, but the cost is that the instruction set becomes increasingly bloated and irregular. The encoding space for new instructions is occupied by old instructions, so prefix bytes must be used for expansion, leading to increasingly complex decoding logic. -Does this situation sound familiar? C++'s backward compatibility issues are almost exactly the same—when writing C++26 code today, the compiler still has to handle C89-style declarations, C-style casts, and various legacy features. Every time someone proposes "let's delete such-and-such old feature," the answer is always "no, it will break existing code." So we carry this baggage and keep moving forward. +Does this situation sound familiar? C++'s backward compatibility issues are almost exactly the same—writing C++26 code today, the compiler still has to handle C89-style declarations, C-style casts, and various legacy features. Every time someone proposes "deleting some old feature", the answer is always "no, it will break existing code". So we move forward carrying this baggage. 
-By comparison, ARM and RISC-V are much cleaner. ARM64 was designed around 2011 (AArch64), making it a "clean-room implementation"—not burdened with 32-bit ARM's historical legacy, it redesigned a completely new instruction encoding. RISC-V is even more of an academic project started from scratch in 2010, with excellent instruction orthogonality: the same opcode format can be used just by swapping the register number. There are no maddening rules like "this instruction implicitly uses EAX, that instruction implicitly uses EDX." +In contrast, ARM and RISC-V are much cleaner. AArch64 (ARM64) was announced around 2011 as a clean-slate design—rather than carrying 32-bit ARM's historical baggage, it redesigned the instruction encoding from scratch. RISC-V goes even further: it is an academic project started from scratch in 2010, with excellent instruction orthogonality—the same opcode format works for any register number, and there are no maddening rules like "this instruction implicitly uses EAX, that instruction implicitly uses EDX". ## Register Naming: The Origin of the A Register -We've been talking about names like EAX, W0, and a0, but have you ever wondered why x86 registers have these weird names? There is historical meaning behind these names. +We've been talking about EAX, W0, a0, but have you ever thought about why x86 registers have these strange names? There is historical meaning behind these names. -In x86, there is a register called A (Accumulator). In the 8080 era or even earlier with the 8008, the A register was simply "the default register"—many operations defaulted to acting on A, without needing to specify it in the instruction. For example, the instruction encoding for "add a certain value to A" is shorter than the encoding for "add a certain value to B," because A is the "default destination," saving the few bits needed to specify the destination register. +In x86, there is a register called A (Accumulator).
In the 8080 or even earlier 8008 era, the A register was "the default register"—many operations defaulted to acting on A, without needing to specify it in the instruction. For example, the instruction encoding for "add a value to A" is shorter than that for "add a value to B", because A is the "default target", saving the bits needed to specify the target register. -This design philosophy carried all the way through to x86. Today, if we write ``imul edi, edi`` and change it to ``imul ebx, ebx``, the machine code might be longer (depending on the specific encoding), because EAX (or rather RAX) is still a "privileged register" in many instructions—it's the default destination for implicit instructions and a fixed participant in certain special operations (like the high bits of ``mul``'s double-precision result being placed in EDX). +This design philosophy continued into x86. Even today, ``add eax, 0x12345678`` has a dedicated short form (opcode ``05`` plus the immediate, five bytes), while ``add ebx, 0x12345678`` needs an extra ModRM byte (``81 c3`` plus the immediate, six bytes), because EAX (or RAX) is still a "privileged register" in many instructions—it is the implicit default target for many instructions, and a fixed participant in certain special operations (like the high half of the double-width result of ``mul`` being placed in EDX). -Many tutorials always say "try to use EAX." This isn't some mystical optimization trick; it's a "privilege" granted at the instruction set encoding level—using the A register might make instructions shorter and decoding faster. Of course, on modern CPUs, this difference has been largely smoothed over by various microarchitectural optimizations, but once we understand this background, those instructions with implicit operands no longer seem so baffling. +Many tutorials say "try to use EAX". This isn't some mystical optimization trick; it's a "privilege" given at the instruction set encoding level—using the A register can make instructions shorter and decoding faster.
Of course, on modern CPUs this difference has been smoothed out by many microarchitectural optimizations, but understanding this background makes those implicit operand instructions seem less baffling. -At this point, we've walked through "what a simple function call actually looks like at the assembly level" from start to finish: from how parameters are passed and return values are placed, to the instruction encoding differences across architectures, to the historical origins of register naming. No single step is complicated, but when we put them all together and look at the big picture, the entire system connects. +At this point, "what a simple function call looks like at the assembly level" has been thoroughly worked through: from how parameters are passed and return values placed, to instruction encoding differences across architectures, to the historical origins of register naming. Each step isn't complex, but when pieced together, the whole system connects. --- --- -# Figuring Out Where Parameters Go During Function Calls—From Register Naming to the ABI +# Figuring Out Where Parameters Go During Function Calls—From Register Naming to ABI -When looking at assembly code generated by Compiler Explorer, the biggest psychological barrier is often not the instructions themselves, but those messy register names. RAX, EAX, AX, AL, AH—are these one thing or four things? Once we clarify x86's register layout, this problem is easily solved. +When looking at assembly code generated by Compiler Explorer, the biggest psychological barrier is often not the instructions themselves, but the messy register names. RAX, EAX, AX, AL, AH—are these one thing or four things? Once we understand the x86 register layout, this problem is solved. -## First, Let's Clarify the Relationship Between RAX, EAX, and AX +## First, Clarify the Relationship Between RAX, EAX, and AX -Going back to the most fundamental question: what is a register? 
We can think of it as a small row of ultra-high-speed storage slots inside the CPU, extremely limited in number. In the 8-bit era, the most core register was called the A register, short for Accumulator. Most arithmetic operations revolved around it. Later, as CPUs evolved from 8-bit to 16-bit, 32-bit, and 64-bit, this register's width grew along with them, but its "status" never changed—it has always been the general-purpose register bearing the primary computational workload. +Back to the most fundamental question: what is a register? We can understand it as a small row of ultra-high-speed storage cells inside the CPU, extremely limited in quantity. In the 8-bit era, the most core register was the A register, or Accumulator, around which most arithmetic operations revolved. Later, CPUs evolved from 8-bit to 16-bit, 32-bit, and 64-bit. The width of this register grew, but its "status" remained unchanged—it is always the general-purpose register bearing the main computational load. -The key point is: when we see RAX, we're looking at a 64-bit value. But when we see EAX, we're not looking at another register; we're looking at **the lower 32 bits of the exact same register**. Similarly, AX is the lower 16 bits, AL is the lowest 8 bits, and AH is the second-to-lowest 8 bits (that is, bits 8-15). They all point to the exact same physical storage, just "slicing" it with different names. +The key is: when you see RAX, you are seeing a 64-bit value. But when you see EAX, you are not seeing another register, but **the low 32 bits of the same register**. Similarly, AX is the low 16 bits, AL is the lowest 8 bits, and AH is the second lowest 8 bits (bits 8-15). They all point to the same physical storage, just "sliced" by different names. 
-Let's use a simple diagram to illustrate: +A simple diagram illustrates this: -```text +````text 63 31 15 7 0 +--------------------------------+----------+----+----+ | RAX | EAX | AX | | | +----+----+ | | | AH | AL | +--------------------------------+----------+----+----+ -``` +```` -So when we see code like this in assembly, there's no need to panic: +So when you see code like this in assembly, don't panic: -```asm +````asm mov rax, rdi ; 把 64 位参数放进 rax 做计算 shr rax, 32 ; 右移 32 位 mov eax, eax ; 只保留低 32 位作为返回值 -``` +```` -Here, switching from rax to eax doesn't mean data is being shuffled between two registers. Rather, the compiler is saying "the calculation is done, and now we only care about the lower 32 bits." Type information from the C++ source code (for example, if the parameter is int64_t but the return value is int32_t) is directly reflected in the assembly's use of different names for the same register. After type information disappears, this is how it "lingers" in the assembly. +Here, switching from rax to eax doesn't mean data is moving between two registers; it is the compiler saying "calculation is done, now we only care about the low 32 bits". Type information from the C++ source (e.g., the parameter is int64_t but the return value is int32_t) is directly reflected in the assembly's use of different names for the same register. After type information disappears, it "lingers" in the assembly in this way. -## Those Weirdly Named Registers, and Their Easy-to-Remember New Friends +## Those Weirdly Named Registers, and Easy-to-Remember New Friends -Once we understand the naming pattern of RAX, we might wonder: what about the rest? RAX, RCX, RDX, RSP, RBP, RSI, RDI... these names seem to follow no pattern at all. They are all legacy names inherited from ancient times: A is for Accumulator, C is for Counter, D is for Data, SP is for Stack Pointer, BP is for Base Pointer, and SI and DI are for Source Index and Destination Index, respectively. 
Knowing the historical background makes them slightly easier to remember, but to a large extent, it still relies on muscle memory built through repeated use. +Once you understand the naming pattern of RAX, you might wonder about the others. RAX, RCX, RDX, RSP, RBP, RSI, RDI... these names seem completely lawless. They are all legacy names inherited from ancient times: A is Accumulator, C is Counter, D is Data, SP is Stack Pointer, BP is Base Pointer, SI and DI are Source Index and Destination Index. Knowing the historical background makes them slightly easier to remember, but largely it relies on muscle memory formed through repeated use. -There is good news, however: when AMD extended the architecture from 32-bit to 64-bit, the eight new general-purpose registers were simply named R8 through R15. Clean and simple. So today, x86-64 has a total of 16 general-purpose registers—eight with weird legacy names, and eight with clean numeric designations. +However, there is good news: when AMD extended the architecture from 32-bit to 64-bit, the 8 new general-purpose registers were directly named R8 to R15. Clean and simple. So x86-64 now has 16 general-purpose registers, 8 with weird legacy names and 8 with clean numeric names. -There are also SIMD/multimedia registers (like XMM/YMM/ZMM), but those are a whole other topic. Today, we'll focus on general-purpose registers and function calls. +Of course, there are SIMD/multimedia registers (XMM/YMM/ZMM, etc.), but that is another large topic; today we focus on general-purpose registers and function calls. -## Which Register Are Function Parameters Actually In? +## Which Register Are Function Arguments In? -One of the biggest confusions when reading assembly is: we write a function, pass three parameters into it, and the assembly turns into a bunch of mov instructions shuffling data between registers. Where do the parameters actually come from? This brings us to the ABI (Application Binary Interface). 
+One of the biggest confusions in reading assembly is: you write a function, pass three arguments in, and the assembly turns into a bunch of mov instructions shuffling data between registers. Where did the arguments come from? This involves the ABI (Application Binary Interface). -The ABI specifies many things, but from the perspective of reading assembly, the one thing we care about most is: **which registers hold the first few parameters of a function**. As long as we know this, we can track what a C++ variable turned into in the assembly. +The ABI specifies many things, but from the perspective of reading assembly, the one concern is: **which registers hold the first few arguments of a function**. As long as we know this, we can trace what C++ variables became in the assembly. -Take Linux (System V AMD64 ABI) as an example. The first six integer parameters (including pointers) are placed in these registers in order: +Take Linux (System V AMD64 ABI) as an example. The first six integer arguments (including pointers) are placed in these registers in order: -```text +````text 第 1 个参数 → RDI 第 2 个参数 → RSI 第 3 个参数 → RDX 第 4 个参数 → RCX 第 5 个参数 → R8 第 6 个参数 → R9 -``` +```` -Any parameters beyond the first six must be pushed onto the stack and accessed via stack pointer offsets. When using ``std::forward`` for perfect forwarding, if there are many parameters, we'll see a lot of stack operations in the assembly, because forwarding might "unroll" the parameters, causing the count to suddenly exceed the capacity of the six registers. +Arguments exceeding six must be pushed onto the stack, accessed via stack pointer offsets. When using ``std::forward`` for perfect forwarding, if there are many parameters, the assembly will show a lot of stack operations because forwarding may "expand" the parameters, suddenly exceeding the capacity of six registers. -Return values are simpler: they uniformly go in RAX (if it's a 128-bit return value, RDX:RAX are combined). 
+Return values are simpler, uniformly placed in RAX (if the return value is 128 bits, RDX:RAX are combined). -Floating-point parameters are slightly more complex, traveling through a separate set of registers (XMM0 through XMM7), but the basic idea is the same—the first few go in registers, and any extras go on the stack. +Floating-point arguments are slightly more complex, using a separate set of registers (XMM0 to XMM7), but the basic idea is the same—the first few go in registers, the rest go on the stack. -## Windows Has Different Rules +## Windows Rules Are Different -If we use MSVC on Windows, the situation is different. The Windows x64 ABI only provides four registers for passing parameters: +If using MSVC on Windows, the situation is different. The Windows x64 ABI allocates only four registers for passing arguments: -```text +````text 第 1 个参数 → RCX 第 2 个参数 → RDX 第 3 个参数 → R8 第 4 个参数 → R9 -``` +```` -Note that both the order and the names are different from Linux. This means that for the exact same function, the first six parameters on Linux all go through registers, but on Windows, the fifth and sixth parameters already have to be pushed to the stack. When debugging cross-platform performance issues, the exact same C++ code produces completely different assembly on the two sides, and this ABI difference is often the culprit. +Note the order and names differ from Linux. This means the same function on Linux passes the first six arguments via registers, while on Windows the fifth and sixth are already pushed to the stack. When debugging performance issues across platforms, the same C++ code looks completely different in assembly on both sides, often caused by ABI differences. -This difference actually has a subtle impact on API design. If we know that only four registers are available on Windows, we'll be more inclined to limit the number of parameters when designing frequently called interfaces. 
But we'll expand on this topic when we encounter specific scenarios later. +This difference actually has a subtle impact on API design. If you know only four registers are available on Windows, you tend to control the number of parameters when designing high-frequency interfaces. But we will expand on this topic later in specific scenarios. -## Let's Verify This Hands-On +## Verify It Yourself -Talk is cheap, let's write the simplest function and throw it into Compiler Explorer: +Talk is cheap, let's write a simple function and throw it into Compiler Explorer: -```cpp +````cpp // 编译选项:-O1 -m64 // 平台:x86-64 Linux (GCC) long add_three(long a, long b, long c) { return a + b + c; } -``` +```` The corresponding assembly looks roughly like this (GCC 16, -O1): -```asm +````asm add_three(long, long, long): add rdi, rsi ; rdi(a) += rsi(b) lea rax, [rdi + rdx*1] ; rax = rdi + rdx(c) ret -``` +```` -See? a is in RDI, b is in RSI, and c is in RDX, perfectly matching the rules we discussed. The return value is in RAX. Clean. +See, a is in RDI, b is in RSI, c is in RDX, completely consistent with our rules. The return value is in RAX. Clean. -Let's try one with more than six parameters: +Try one with more than six arguments: -```cpp +````cpp long sum_seven(long a, long b, long c, long d, long e, long f, long g) { return a + b + c + d + e + f + g; } -``` +```` -The assembly becomes this: +The assembly becomes: -```asm +````asm sum_seven(long, long, long, long, long, long, long): lea rax, [rdi + rsi] ; a + b add rax, rdx ; + c @@ -430,10 +430,22 @@ sum_seven(long, long, long, long, long, long, long): add rax, r9 ; + f add rax, QWORD PTR [rsp+8] ; + g,从栈上取!注意偏移 +8,因为 [rsp] 是 call 压入的返回地址 ret -``` +```` -The first six parameters are in RDI, RSI, RDX, RCX, R8, and R9, respectively. The seventh parameter, g, ends up on the stack, accessed via ``[rsp+8]`` (the ``call`` instruction pushed the return address into ``[rsp]``, so the first stack parameter requires an 8-byte offset). 
Once we know the ABI rules, reading assembly is like having a map—it's no longer a screen full of gibberish. +The first six arguments are in RDI, RSI, RDX, RCX, R8, R9, and the seventh argument g ends up on the stack, accessed via ``[rsp+8]`` (the ``call`` instruction pushed the return address onto ``[rsp]``, so the first stack argument needs an offset of 8 bytes). Knowing the ABI rules makes reading assembly like having a map—no longer a screen of gibberish. -## A Quick Note on ARM64 +## A Quick Note on ARM64 -If we've worked with ARM64 (like Apple Silicon or embedded development), things are much cleaner over there. The general-purpose registers are simply called X0 through X30, with no historical baggage. Function parameters are just X0, X1, X2, and so on, and the return value is in X +If you have worked with ARM64 (like Apple Silicon or embedded development), it is much cleaner over there. General-purpose registers are directly called X0 to X30, no historical baggage. Function arguments are X0, X1, X2... in order, return value in X0. If you want to see the 32-bit version, just replace X with W, e.g., W0 is the low 32 bits of X0. The naming logic is the same as x86's RAX/EAX, but the names are much easier to remember. + +At this point, register naming and parameter passing rules are thoroughly cleared up. The confusion of seeing rax and then eax in assembly goes away once you know they are just different-width slices of the same register. Understanding this brings peace of mind. Next, with this foundation, let's look at more complex assembly patterns. + +--- + +# RISC-V Register Naming—From Numbers to Semantics + +When reading RISC-V assembly, opening the disassembly window reveals a screen full of ``t0``, ``a7``, ``s1``, ``ra``. It looks similar to x86's ``rax``, ``rbx``, ``rcx``, seemingly a pile of letter abbreviations to memorize.
But once you truly understand it, you realize RISC-V register naming isn't arbitrary abbreviation—it directly tells you what the register **should do**. Understanding the calling convention semantics behind the naming allows you to deduce these names yourself. + +## Start with the Most Basic Numbers + +RISC-V has 32 general-purpose registers, numbered ``x0`` to ``x31``. Note, it is 32, not 31—``x0`` is indeed an existing register, but it is hardwired to 0; writes to it are discarded, and reading it always yields 0. This design may seem superfluous at first, but when writing inline assembly, you find having a constant zero directly usable as an operand saves many instructions. diff --git a/documents/en/vol10-open-lecture-notes/cppcon/2025/02-some-assembly-required/03-compiler-explorer-and-ai-assisted.md b/documents/en/vol10-open-lecture-notes/cppcon/2025/02-some-assembly-required/03-compiler-explorer-and-ai-assisted.md index f605e80da..c40ba5dfd 100644 --- a/documents/en/vol10-open-lecture-notes/cppcon/2025/02-some-assembly-required/03-compiler-explorer-and-ai-assisted.md +++ b/documents/en/vol10-open-lecture-notes/cppcon/2025/02-some-assembly-required/03-compiler-explorer-and-ai-assisted.md @@ -1,5 +1,5 @@ --- -title: In-Depth Compiler Explorer Usage and AI Assistance +title: Deep Dive into Compiler Explorer and AI Assistance description: 'CppCon 2025 Talk Notes — C++: Some Assembly Required by Matt Godbolt' conference: cppcon conference_year: 2025 @@ -20,22 +20,22 @@ chapter: 2 order: 3 translation: source: documents/vol10-open-lecture-notes/cppcon/2025/02-some-assembly-required/03-compiler-explorer-and-ai-assisted.md - source_hash: 998efe1947a8d4eff43deb19a147424f2d005a97df292dd7e4053086daf01d76 - translated_at: '2026-05-26T11:16:07.588463+00:00' + source_hash: 5a14336bd024756b91e3e64d1885670a7ba5430d36c0640dcd128e7137290163 + translated_at: '2026-06-13T11:48:02.597949+00:00' engine: anthropic - token_count: 4887 + token_count: 4945 --- -# Reading Assembly with
Compiler Explorer: From Gibberish to Comprehensible +# Reading Assembly with Compiler Explorer: From "Greek" to "Intelligible" -Many C++ developers have an instinctive aversion to reading assembly, assuming it is something only compiler theory courses or low-level engineers need to deal with. However, when template error messages become incomprehensible, performance optimization hits a wall, or the ``inline`` keyword seems to have no effect, learning to read assembly is no longer optional—it becomes a necessary skill. Among the many tools available, Compiler Explorer (commonly known as godbolt) is one of the most practical starting points. This section introduces a method for reading assembly from scratch, aiming to help readers transition from "completely lost" to "able to spot the patterns." +Many C++ developers have an instinctive resistance to reading assembly, viewing it as something relevant only to compiler theory courses or low-level engineers. However, when template error messages become incomprehensible, performance optimization hits a wall, or the `[[likely]]` attribute seems to have no effect, learning to read assembly is no longer optional—it becomes a necessary skill. Among the many tools available, Compiler Explorer (commonly known as godbolt) is one of the most practical entry points. This section introduces a method for reading assembly from scratch, aiming to help readers transition from "completely lost" to "able to see the patterns." ## Environment: Toolchain Configuration -Before we begin, let's clarify the experimental environment used in this article so readers can reproduce the results. We use Chrome to open ``godbolt.org``, select GCC 16.1.1 as the compiler, and default to the ``-O0`` optimization level (to observe the logical mapping from code to assembly). When we need to inspect optimization effects, we switch to ``-O2`` or ``-O3``, and set the language standard to C++20. 
Since godbolt uses a split-pane layout (C++ source on the left, assembly output on the right), we recommend using a 1920x1080 or higher resolution screen to prevent the assembly area from being squeezed and affecting readability. +Before we begin, let's outline the experimental environment used here so you can reproduce it. Open Chrome to visit godbolt.org, select GCC 16.1.1 as the compiler, and set the optimization level to `-O0` by default (to observe the logical mapping from code to assembly). Switch to `-O2` or `-O3` when checking optimization effects, and select C++20 as the language standard. Since godbolt uses a split-pane layout (C++ source on the left, assembly output on the right), a screen resolution of 1920x1080 or higher is recommended to prevent the assembly area from being squeezed and affecting readability. -## Core Idea: Assembly Correspondence +## Core Concept: The Assembly Correspondence -A common misconception when reading assembly is trying to read every instruction from start to finish, attempting to understand each line as if it were source code. In reality, the core purpose of reading assembly is to establish "correspondence"—finding out which machine instructions each line of C++ code is translated into. Readers do not need to understand the meaning of every assembly instruction; they only need to be able to locate "the few lines of assembly that correspond to this line of C++." +A common misconception when reading assembly is trying to understand every single instruction sequentially, just like reading source code. In reality, the core purpose of looking at assembly is to establish a "correspondence"—finding which machine instructions each line of C++ code is translated into. You don't need to understand the meaning of every assembly instruction; you only need to be able to locate "where those few lines of assembly corresponding to this line of C++" are. 
Let's take a simple square function as an example: @@ -45,206 +45,191 @@ int square(int x) { } ``` -Pasting this code into godbolt, for readers just starting to learn assembly reading, we recommend checking Directives, Labels, and Comments in the Filter options to get more complete information. Under ``-O0``, you will see output similar to this: +Putting this code into godbolt, for those just starting to learn assembly, I suggest checking Directives, Labels, and Comments in the Filter options to get more complete information. Under `-O0`, you will see output similar to this: ```asm -// GCC 16.1.1, -O0 -std=c++20 (AT&T 语法) square(int): - pushq %rbp - movq %rsp, %rbp - movl %edi, -4(%rbp) - movl -4(%rbp), %eax - imull %eax, %eax - popq %rbp - ret + push rbp + mov rbp, rsp + mov DWORD PTR [rbp-4], edi + mov eax, DWORD PTR [rbp-4] + imul eax, DWORD PTR [rbp-4] + pop rbp + ret ``` -Under ``-O0``, the compiler's behavior is very straightforward: it first stores the parameter from ``edi`` (the first integer argument register on x86-64) onto the stack at ``-4(%rbp)``, then reads it from the stack to perform multiplication, and finally leaves the result in ``eax`` (the return value register). Among these, ``pushq %rbp`` / ``movq %rsp, %rbp`` form the function prologue, and ``popq %rbp`` / ``ret`` form the function epilogue. These are fixed patterns present in every function, and once familiar with them, you can quickly skip over them. The truly core operations are only the middle three lines: store parameter, load parameter, multiply. +Under `-O0`, the compiler's behavior is very straightforward: it first stores the parameter from `edi` (the first integer argument register in x86-64) onto the stack at `[rbp-4]`, then reads it back from the stack to perform the multiplication, and finally leaves the result in `eax` (the return value register). 
`push rbp` / `mov rbp, rsp` is the function prologue, and `pop rbp` / `ret` is the epilogue; these are fixed patterns present in every function that you can quickly skip once familiar. The truly core operations are just the three middle lines: store parameter, load parameter, multiply. -If we switch the optimization level to ``-O2``, the code generated by GCC 16.1.1 is ``imull %edi, %edi; movl %edi, %eax; ret``—it first multiplies ``edi`` by itself, then moves the result into the return value register ``eax``, which is very concise. Although it is not strictly a single instruction (it requires ``movl`` to move the result from ``edi`` to ``eax``), the core computation is indeed just one ``imul`` instruction. +If you switch the optimization level to `-O2`, the code generated by GCC 16.1.1 is `imul edi, edi; mov eax, edi; ret`—multiplying `edi` by itself and then moving the result into the return value register `eax`. It is very concise. Although not strictly a single instruction (requiring `mov` to move the result from `edi` to `eax`), the core computation is indeed just one `imul`. -One point to note here: when reading assembly, always rely on the actual compiler output rather than inferring from memory. The output from different compiler versions and different optimization levels can vary significantly, and manual verification is the key step to avoid misjudgments. +A reminder here: when reading assembly, always rely on the actual compiler output rather than memory or inference. Output can vary significantly between different compiler versions and optimization levels; manual verification is a key step to avoid misjudgment. -## Hands-on Practice: Analyzing a Real Function +## Hands-on: Analyzing a Real Function -Next, let's look at a slightly more complex example.
The following function checks whether a ``std::string_view`` is a valid hexadecimal identifier, where the identifier length is fixed at 16 characters, and each character can only be ``0-9`` or ``A-F``: +Next, let's look at a slightly more complex example. The following is a function that checks if a `std::string_view` is a valid hexadecimal identifier. The identifier length is fixed at 16 characters, and each character can only be `0-9` or `A-F`: ```cpp -#include - -bool is_valid_hex_id(std::string_view sv) { - if (sv.size() != 16) - return false; - for (char c : sv) { - if (c >= '0' && c <= '9') continue; - if (c >= 'A' && c <= 'F') continue; - return false; +bool is_hex_id(std::string_view s) { + if (s.size() != 16) return false; + for (char c : s) { + if (!((c >= '0' && c <= '9') || (c >= 'A' && c <= 'F'))) { + return false; + } } return true; } ``` -This implementation is clearly not optimal—we could use ``std::all_of``, a lookup table, or ``find_first_not_of`` to improve it. But here we deliberately use the most straightforward approach to observe how the compiler translates logic containing branches and loops. +This implementation is obviously not optimal—one could use `std::all_of`, a lookup table, or `switch` to improve it. But here, the most straightforward approach is used intentionally to observe how the compiler translates logic containing branches and loops. -Pasting this code into godbolt, the assembly under ``-O0`` will be quite long, so we won't list it all here. The key technique is: hover the mouse over a specific line of C++ code (such as ``if (sv.size() != 16)``), and the corresponding instructions in the assembly on the right will highlight; conversely, hovering over a line of assembly will highlight the corresponding C++ code on the left. This hover-highlight feature is one of godbolt's most practical features, as it directly solves the core problem of "finding the correspondence between C++ code and assembly instructions." 
+Putting this code into godbolt, the assembly under `-O0` will be quite long, so I won't list it all here. The key technique is: hover your mouse over a specific line of C++ code (e.g., `if (s.size() != 16)`), and the corresponding instructions in the assembly on the right will highlight; conversely, hovering over a line of assembly will highlight the corresponding C++ code on the left. This hover-highlight feature is one of godbolt's most practical capabilities; it directly solves the core problem of "finding the correspondence between C++ code and assembly instructions." -Under ``-O0``, the call to ``sv.size()`` is expanded into a set of instructions (because the ``size()`` of ``string_view`` is inline, which essentially means reading a member variable), and then compared with 16. If they are not equal, it jumps to the location that returns ``false``. The two ``if`` calls in the loop body are similar, with each condition check corresponding to a set of compare and jump instructions. The characteristic of ``-O0`` assembly is "faithful to the point of clumsiness": every C++ operation is translated literally, storing variables to the stack when they should be stored, and reading from the stack when they should be read. +Under `-O0`, the call to `s.size()` is expanded into a sequence of instructions (because `std::string_view`'s `size()` is `inline`, essentially reading a member variable), which is then compared with 16. If they are not equal, it jumps to the location returning `false`. The two conditions inside the loop body are similar; each conditional judgment corresponds to a set of comparison and jump instructions. The characteristic of `-O0` assembly is "faithful to the point of clumsiness": every C++ operation is translated faithfully, variables are stored to the stack if needed, and read from the stack if needed. ## Switching to -O2 to Observe Compiler Optimization -After switching the optimization level to ``-O2``, the assembly code shrinks significantly. 
The compiler does multiple things: the function prologue and epilogue may be simplified, the loop may be unrolled or optimized, and branches may be rearranged. Specifically in this example, the compiler inlines the call to ``size()``, directly compares the length, and the loop body is processed in a completely different way than under ``-O0``. +After switching the optimization level to `-O2`, the assembly code shortens significantly. The compiler does a lot of work: function prologues and epilogues may be simplified, loops may be unrolled or optimized, and branches may be rearranged. Specifically in this example, the compiler will inline the `s.size()` call, directly compare the length, and the loop body's handling will be completely different from under `-O0`. -We recommend that readers try this themselves in godbolt, as the output may differ across compiler versions and optimization levels. An important principle when reading assembly is: always rely on the actual compiler output, and do not jump to conclusions about uncertain results—let the compiler's output speak for itself. +I encourage readers to try this personally in godbolt, as output may differ between compiler versions and optimization levels. An important principle when reading assembly is: take the actual compiler output as the truth; don't jump to conclusions about uncertain results—let the compiler's output speak for itself. -## Common Pitfalls and Notes +## Common Questions and Considerations -In the process of reading assembly, there are a few common issues worth noting. First, godbolt filters out some assembly instructions by default through the Filter options. At the beginner stage, we recommend turning off all filters to view the complete output, and only enabling filters once you are familiar with which information constitutes "noise." 
Second, you need some understanding of the x86-64 calling convention—at a minimum, you should know that integer arguments are passed in order through the ``rdi``, ``rsi``, ``rdx``, ``rcx``, ``r8``, and ``r9`` registers, and the return value is placed in ``rax``. You don't need to deliberately memorize these conventions; you will naturally remember them after reading enough assembly. Third, while the position of parameters in simple functions can usually be inferred, if the function logic is complex and registers are repeatedly reused, you cannot rely on guessing—you need to diligently track the data flow. +There are a few common issues worth noting when reading assembly. First, godbolt filters out some assembly instructions by default via the Filter options. In the beginner stage, I suggest turning off all filters to see the full output, and only turn filters on once you are familiar with what information counts as "noise." Second, you need some understanding of the x86-64 calling convention—at least know that integer arguments are stored sequentially in the `rdi`, `rsi`, `rdx`, `rcx`, `r8`, and `r9` registers, and the return value is in `rax`. You don't need to memorize these deliberately; you will naturally remember them after reading enough assembly. Third, while parameter positions in simple functions can be inferred, if the function logic is complex and registers are reused heavily, you cannot rely on guessing; you must track the data flow diligently. -Once you have mastered the correspondence between C++ and assembly, godbolt's hover-highlight feature lowers the learning barrier to a minimum. Later, you can try using this method to analyze more complex scenarios—the code shape after template instantiation, the degree to which ``constexpr`` functions are optimized, and differences in ``std::string`` across different standard library implementations. These are the scenarios where reading assembly truly delivers value. 
+Once you have mastered the correspondence between C++ and assembly, godbolt's hover-highlight feature lowers the learning barrier to the minimum. You can subsequently try using this method to analyze more complex scenarios—the form of code after template instantiation, the degree to which `constexpr` functions are optimized, and differences in `std::string` implementations across different standard libraries. These are the scenarios where reading assembly truly provides value. --- -# Seeing string_view's True Colors Through Assembly +# Reading the True Face of string_view from Assembly -When faced with a large block of assembly output, many developers instinctively want to close the window. But in reality, once you understand "what the compiler is doing," assembly is not that intimidating. This section discusses a very specific scenario: what actually happens at the low level when passing a ``std::string_view`` by value to a function. +Faced with a large block of assembly output, many developers instinctively want to close the window. But in reality, once you understand "what the compiler is doing," assembly isn't that terrifying. This section discusses a very specific scenario: what actually happens at the low level when passing a `std::string_view` by value to a function. -First, let's clarify the experimental environment: GCC 16.1.1, running on x86-64 Linux, with libstdc++ as the standard library, and the optimization level set to O1. Why not O0? Because O0's output is too literal—if you write ``int x = 0; return x;``, the compiler will literally write 0 to memory first, then read it back from memory into the return value register. While this is friendly for debugging, if the goal is to understand the logical flow of the code, O0's output is actually a distraction: the screen is full of meaningless stack operations, and "not seeing the forest for the trees" perfectly describes this situation. 
O1 is much better—redundancy has been eliminated, but it hasn't reached the aggressive inlining and transformations of O2, making it ideal for the learning stage. +First, the experimental environment: GCC 16.1.1, running on x86-64 Linux, standard library is libstdc++, optimization level `-O1`. Why not `-O0`? Because `-O0` output is too literal—if you write `int x = 0; return x;`, the compiler will actually write 0 to memory first, then read it back into the return value register. While this is friendly for debugging, if the goal is to understand the logical flow of the code, `-O0` output is actually interference: the screen is full of meaningless stack operations—"seeing the trees but not the forest." `-O1` is much better; redundancy is eliminated, but it hasn't reached the level of aggressive inlining and transformation seen in `-O2`, making it suitable for the learning stage. -Let's look at a simple test code snippet: +Let's look at a simple piece of test code: ```cpp -#include - -bool check_length(std::string_view sv) { - if (sv.size() == 16) { - // 做一些更复杂的事情 - return true; - } - return false; +bool check_len(std::string_view s) { + return s.size() == 16; } ``` -The function itself is very simple. We use ``g++ -O1 -S -o - test.cpp`` to output the assembly for analysis. A common question is: isn't ``std::string_view`` just a "read-only view of a string"? What's the difference between it and ``const std::string&``? This question becomes very concrete after looking at the assembly. +This function is very simple in itself. We use `-O1` to output assembly for analysis. A common question is: isn't `std::string_view` just a "read-only view of a string"? What's the difference between it and `const std::string&`? After looking at the assembly, this question becomes very concrete. -Under the hood, ``string_view`` has only two members: a pointer (pointing to the character data) and a ``size_t`` (representing the length). Essentially, it is just a struct with two members. 
A common misconception is that when passing a struct to a function, no matter how small, it will always be placed on the stack, or the compiler will implicitly convert it to pass-by-reference. This is not the case. The System V ABI for x86-64 (the C/C++ function calling convention on Linux) specifies that if a struct's total size can fit into two registers, and each member is a "simple type" (pointer, integer, etc.), it can be passed directly through registers, exactly like passing two ordinary variables. +Underlying `std::string_view` are only two members: a pointer (pointing to character data) and a `size_t` (representing the length). Essentially, it is just a struct with two members. A common misconception is that when passing a struct to a function, regardless of how small it is, it will be placed on the stack, or the compiler will implicitly convert it to pass-by-reference. This is not true. The x86-64 System V ABI (the convention for C/C++ function calls on Linux) stipulates that if a struct's total size fits in two registers and each member is a "simple type" (pointer, integer, etc.), it can be passed directly via registers, exactly like passing two ordinary variables. -It is worth noting that the member layout of ``string_view`` may differ across standard library implementations. GCC's libstdc++ puts ``size_t`` first (``{size_t _M_len; const char* _M_str;}``), so when the function is entered, **the length portion is in ``RDI``, and the pointer portion is in ``RSI``**. This is the opposite of the "pointer first" intuition found in many documents. Clang's libc++, on the other hand, is ``{const char* __data; size_t __size;}``, with the pointer first. The assembly output in this article is based on GCC/libstdc++; if readers use Clang/libc++, the register allocation will be reversed. +Note that the member layout of `std::string_view` may differ across standard library implementations. 
GCC's libstdc++ puts `size` first (`_M_len`), so when the function is entered, **the length part is in `rdi` and the pointer part is in `rsi`**. This is the opposite of the intuition many documents have that "pointer comes first." Clang's libc++ is the opposite, with the pointer first. The assembly output here is based on GCC/libstdc++; if you use Clang/libc++, the register allocation will be reversed. -The corresponding assembly output is as follows (GCC 16.1.1, ``-O1 -std=c++20``, with ``.cfi_*`` instructions and irrelevant labels removed): +The corresponding assembly output is as follows (GCC 16.1.1, `-O1`, with `nop` instructions and irrelevant labels removed): ```asm -// GCC 16.1.1, -O1 -std=c++20 -check_length(std::string_view): - cmpq $16, %rdi ; 比较 size(在 RDI 中)是否等于 16 - sete %al ; 相等则 AL=1,否则 AL=0 +check_len(std::basic_string_view<char, std::char_traits<char> >): + cmp rdi, 16 + sete al ret ``` -GCC optimizes this logic very cleanly even at O1: ``cmpq $16, %rdi`` compares the immediate value 16 with the value in the ``RDI`` register. Since the first member of ``string_view`` in libstdc++ is ``size_t _M_len`` (placed in the first integer argument register ``RDI`` per the System V ABI), ``RDI`` holds ``sv.size()``. Next, ``sete %al`` is a clever instruction—if the result of the previous comparison is "equal," it sets ``%al`` to 1, otherwise to 0. This directly produces the ``bool`` return value (0 is ``false``, 1 is ``true``), completely without any branch jumps. +GCC optimizes this logic very cleanly at `-O1`: `cmp rdi, 16` compares the immediate value 16 with the value in the `rdi` register. Since in libstdc++, the first member of `std::string_view` is `size` (placed in the first integer argument register `rdi` according to the System V ABI), `rdi` holds the length. Next, `sete al` is a clever instruction—if the result of the previous comparison is "equal," it sets `al` to 1, otherwise to 0. 
This directly produces the `bool` return value (0 is `false`, 1 is `true`), completely without branches. -It is worth noting that GCC chose the ``sete`` branchless approach, rather than the more intuitive branching pattern of "compare → jump if not equal → set return values separately." This shows that even at O1 (a not particularly aggressive optimization level), the compiler will prioritize branch elimination strategies—the penalty of a branch misprediction is typically much higher than a few straight-line instructions. +It is worth noting that GCC chose this branchless method (`sete`) rather than the more intuitive branch pattern of "compare → jump if not equal → set return value separately." This shows that even at `-O1` (not a very aggressive optimization level), the compiler will prioritize strategies that eliminate branches—the cost of a branch prediction failure is usually much higher than a few straight-line instructions. -Another detail worth paying attention to: when analyzing more complex functions, if you scroll down in the assembly, you may find that the highlight colors suddenly disappear—the correspondence between source code and assembly breaks. This is not a browser rendering issue, but rather because the function internally calls STL helper functions (such as member functions of ``string_view``), which the compiler inlines under O1 optimization. After inlining, this code no longer corresponds to any line of user-written source code, so the highlight correspondence breaks. +Another detail worth attention: when analyzing more complex functions, if you scroll down in the assembly, you may find the highlight colors suddenly disappear—the correspondence between source code and assembly breaks. This isn't a browser rendering issue, but because the function internally calls STL helper functions (e.g., member functions of `std::string_view`), which the compiler inlines at `-O1` optimization. 
After inlining, this code no longer corresponds to any line of user-written source code, so the highlighting correspondence breaks. -This is a great learning point: inlining doesn't only happen when you manually write the ``inline`` keyword. The compiler will inline small functions at O1 based on its own judgment (especially functions defined in STL headers), directly expanding them at the call site. After expansion, the assembly becomes longer, but the function call overhead is eliminated, and the compiler gains more context for further optimizations. In the future, when reading assembly, if you notice the highlight correspondence suddenly breaking, your first reaction should be: inlining most likely happened here. +This is a good learning point: inlining doesn't always require manually writing the `inline` keyword. The compiler will inline small functions at `-O1` based on its own judgment (especially functions defined in headers within the STL). After inlining, the assembly becomes longer, but the function call overhead is eliminated, and the compiler gains more context for further optimization. In the future, when reading assembly, if you find the highlight correspondence suddenly breaks, your first reaction should be: inlining probably happened here. -To summarize this section's analysis: ``string_view`` is a struct with two members, and when passed by value, it is passed through registers (under GCC/libstdc++, ``RDI`` is the length and ``RSI`` is the pointer). The ``size()`` check corresponds to a single ``cmp`` instruction, and GCC uses ``sete`` to return the result branchlessly at O1. The key is to correlate "ABI conventions" with "standard library member layouts"—different STL implementations can lead to completely different register allocations, so always rely on the actual compiler output. +To summarize this section's analysis: `std::string_view` is a struct with two members. 
When passed by value, it is passed via registers (in GCC/libstdc++, `rdi` is length, `rsi` is pointer). The `s.size() == 16` check corresponds to a `cmp` instruction, and GCC returns the result branchlessly at `-O1` using `sete`. The key is to map "ABI conventions" and "standard library member layout" together—different STL implementations can lead to completely different register allocations, so always rely on the actual compiler output. --- -# Dissecting find_first_not_of Assembly Optimization Level by Level in Compiler Explorer +# Disassembling find_first_not_of by Optimization Level in Compiler Explorer -Many C++ developers treat ``std::string::find_first_not_of`` as a black box—pass in parameters, get the return value, and never care about what the compiler expands it into. But by switching the optimization level from O0 to O3 step by step in Compiler Explorer and observing the results, we can see that the compiler's handling of this function varies significantly across different optimization levels. +Many C++ developers treat `std::string::find_first_not_of` as a black box—pass parameters, get a return value, never caring what the compiler compiles it into. But by switching optimization levels from `-O0` to `-O3` step-by-step in Compiler Explorer, we can see significant differences in how the compiler handles this function at different optimization levels. ## Experimental Environment -The experiment uses Compiler Explorer (godbolt.org), with GCC 16.1.1 as the compiler, targeting the x86-64 architecture, and using libstdc++ as the standard library. The test code is simple: given a hexadecimal string, find the first position that does not belong to the "0123456789ABCDEF" character set. +The experiment uses Compiler Explorer (godbolt.org), compiler GCC 16.1.1, target architecture x86-64, standard library libstdc++. 
The test code is simple: given a hexadecimal string, find the position of the first character that does not belong to the "0123456789ABCDEF" character set. ```cpp -#include - -int find_non_hex(const std::string& s) { - // 找第一个不是十六进制字符的位置 - // 如果全是合法十六进制字符,返回 std::string::npos - return static_cast(s.find_first_not_of("0123456789ABCDEF")); +size_t find_first_hex_invalid(std::string_view s) { + return s.find_first_not_of("0123456789ABCDEF"); } ``` -This function looks unremarkable, but the compiler's handling of it varies greatly across different optimization levels. +This function looks plain, but the compiler's handling of it varies greatly across different optimization levels. -## At O1: The Appearance of a memchr Call +## Under -O1: The Appearance of memchr -Opening the assembly view at O1 optimization, the first noteworthy phenomenon is that Compiler Explorer does not display the inlined expansion of STL source code by default, so all code inside the standard library appears in white (with no source code highlight correspondence), and you can only see raw assembly instructions. +Opening the assembly view under `-O1` optimization, the first phenomenon worth noting is: Compiler Explorer does not display STL source code inlining by default, so internal standard library code is all white (no source code highlighting correspondence), and only bare assembly instructions are visible. -Even more surprisingly, a call to ``memchr`` appears in the middle of the assembly. The source code clearly calls ``find_first_not_of``—"find the first character not in the set"—so what does this have to do with ``memchr`` ("find the first occurrence of a specific byte")? +Even more surprisingly, a call to `memchr` appears in the middle of the assembly. The source code clearly calls `find_first_not_of`—"find the first character not in the set." What does this have to do with `memchr` ("find the first occurrence of a specific byte")? 
-Upon careful thought, the logic actually makes perfect sense: the most direct way to determine if a character is "not in" a set is to call ``memchr`` once for each element in the set. If none of them are found, then the character is indeed not in the set. The parameter string "0123456789ABCDEF" happens to be exactly 16 characters, so the compiler's implementation becomes querying "is this character in the input string?" separately for each candidate character. +After thinking carefully, the logic is actually quite smooth: to determine if a character is "not in" a set, the most direct way is to call `memchr` for each element in the set. If `memchr` doesn't find any of them, then the character is indeed not in the set. The parameter string "0123456789ABCDEF" happens to be 16 characters long, so the compiler's implementation becomes querying "is this character in the input string" for each candidate character. -## At O2: Finding Loop Structures and Vectorization +## Under -O2: Looking for Loop Structures and Vectorization -After switching to O2, the amount of assembly code decreases somewhat, but the overall structure remains basically the same as O1. There are some boundary checks and preprocessing at the beginning, and the core logic still revolves around ``memchr``. +After switching to `-O2`, the amount of assembly code is reduced somewhat, but the overall structure remains basically consistent with `-O1`. There are some boundary checks and preprocessing at the beginning, and the core logic still revolves around `memchr`. -When analyzing compiler output, an effective strategy is to first locate loop structures. The specific method is to look for the pattern of a label followed by a backward jump instruction—for example, after the ``.L4:`` label, there is a ``jne .L4`` at the end of the loop body, which constitutes a complete loop. 
This method is particularly important when determining whether vectorization optimization is being used (whether SIMD instructions are being used): by observing how many bytes the pointer advances per iteration and how many elements are processed at once, you can determine whether the compiler has transformed it into SIMD instructions. +When analyzing compiler output, an effective strategy is to first locate loop structures. The specific method is to look for the pattern of a label plus a backward jump instruction—for example, after a `.L` label, if there is a `jmp` or `jne` at the end of the loop body, that constitutes a complete loop. This method is particularly important when judging vectorization optimizations (whether SIMD instructions are used): by observing how many bytes the pointer advances per iteration in the loop and how many elements are processed at once, we can judge if the compiler has transformed it into SIMD instructions. -However, in the O2 output of this example, there is no such loop structure. The compiler does not "use a loop to iterate over each character of the input string," but instead repeatedly calls ``memchr``. Intuitively, ``find_first_not_of`` should iterate over the input string and check whether each character is in the set; but the logic presented in the assembly is exactly the opposite—for each character in the set, it searches the input string. These two directions differ greatly in algorithmic complexity, but in this specific scenario (where the set has only 16 elements), the compiler chose the latter. +However, in the `-O2` output of this example, there is no such loop structure. The compiler didn't "use a loop to iterate through every character of the input string," but rather repeatedly calls `memchr`. 
Intuitively, `find_first_not_of` should iterate through the input string and check if each character is in the set; but the logic presented in assembly is exactly the opposite—for each character in the set, it looks it up in the input string. The algorithmic complexity of these two directions is very different, but in this specific scenario (the set has only 16 elements), the compiler chose the latter. -## At O3: The Loop Disappears, Fully Unrolled +## Under -O3: The Loop Disappears, Fully Unrolled -After switching to O3, the loop structure disappears entirely, replaced by ``memchr`` calls being heavily duplicated—sixteen nearly identical ``memchr`` call sequences are laid out flat in the assembly. +After switching to `-O3`, the loop structure disappears completely, replaced by the call to `memchr` being duplicated a massive amount—sequences of nearly identical `memchr` calls are laid out flat in the assembly 16 times. -The underlying logic is already clear when combined with the previous analysis. For each character in the input string (the compiler now knows the string length is 16 because of the preceding length check), it separately queries: is this character in the range "0" to "9"? Is it in the range "A" to "F"? If all these checks answer "not found," then this character is definitely not in the valid hexadecimal character set, and it is the target position. +The underlying logic is already clear combined with the previous analysis. For each character in the input string (the compiler now knows the string length is 16 because of the length check), it queries separately: is this character in the range '0' to '9'? Is it in the range 'A' to 'F'? If all these checks answer "not found," then this character is definitely not in the valid hexadecimal character set, and it is the target position. -In other words, O3 fully unrolls the logic of "calling memchr once for each of the 16 candidate characters." 
There is no loop overhead, no indirect jumps from function calls—just 16 ``memchr`` calls lined up in a row. +In other words, `-O3` fully unrolls the logic of "calling memchr once for each of the 16 candidate characters." No loop overhead, no indirect jumps of function calls, just 16 `memchr` calls lined up in a row. -## A Noteworthy Cognitive Bias +## A Notable Cognitive Bias -Before reading this assembly, many people might assume that ``find_first_not_of`` is implemented by iterating over the input string and using some efficient method (such as a lookup table) to determine whether each character is in the set. This intuition might be correct when "the set is large," but when the set is small, libstdc++'s implementation takes a different path—reversing the problem to search the input for each character in the set. +Before reading this assembly, many might assume the implementation of `find_first_not_of` is: iterate through the input string and use some efficient method (like a lookup table) for each character to judge if it is in the set. This intuition might be right when "the set is large," but when the set is small, libstdc++'s implementation takes another path—reversing the problem to look up each character in the set in the input. This discovery illustrates an important fact: the actual implementation logic of the standard library may be completely different from intuition, and the only way to verify is to look directly at the assembly output. -To summarize ``find_first_not_of``'s behavior across different optimization levels: O1 introduces preliminary ``memchr`` calls, O2 maintains the same structure but trims redundancy, and O3 performs brute-force unrolling. At each level, the compiler does the transformation it considers "most cost-effective"—it's just that its standard for "cost-effective" doesn't necessarily align with human intuition. 
+To summarize the behavior of `find_first_not_of` at different optimization levels: `-O1` sees the initial appearance of `memchr` calls, `-O2` maintains the same structure but simplifies redundancy, and `-O3` performs brute-force unrolling. At each level, the compiler is doing the transformation it thinks is "most cost-effective," but the standard of "cost-effective" is not necessarily consistent with human intuition. --- -# Observing Clang's Different Loop Handling Strategies on Compiler Explorer +# Observing Clang's Different Processing Strategies for Loops on Compiler Explorer -Compiler optimization is often treated as a black box—turn on O2 or O3, and the generated code will be faster somehow, but exactly where it's faster is rarely a concern. However, by comparing the output of different optimization levels and different compiler versions in Compiler Explorer, we can see that the assembly shape of the same loop code varies enormously under different conditions. +Compiler optimization is often viewed as a black box—turn on `-O2` or `-O3`, and the generated code is faster, but we don't care much where specifically it's faster. But by comparing outputs from different optimization levels and compiler versions in Compiler Explorer, we can see that the assembly form of the same loop code varies significantly under different conditions. ## Test Environment -The experiment uses Compiler Explorer (godbolt.org), with Clang as the compiler, targeting the x86-64 architecture, and the CPU model set to skylake (a typical modern desktop architecture). The test code is a naive loop that calls ``memchr`` to scan a 16-byte buffer segment by segment, returning an error immediately upon finding an invalid character. The logic itself is not complex, but the compiler's handling of this code is worth studying in depth. 
+The experiment uses Compiler Explorer (godbolt.org), compiler Clang, target architecture specified as x86-64, CPU model selected as skylake (a typical modern desktop architecture). The test code is a naive loop that internally calls `std::char_traits::find` to scan a 16-byte buffer segment by segment, returning an error immediately if an invalid character is found. The logic itself isn't complex, but the compiler's handling of this code is worth deep study. -## A Correct Understanding of Loop Unrolling +## Correct Understanding of Loop Unrolling -A common misconception is that loop unrolling is simply a brainless copy-paste of the loop body N times, and that more unrolling is better, which is O3's advantage over O2. But the reality is not that simple. +A common misunderstanding is: loop unrolling is just blindly copying the loop body N times, the more unrolling the better, and the advantage of `-O3` over `-O2` lies here. But the reality is not that simple. -This loop has only 16 iterations, and the loop body contains a ``memchr`` call. If the compiler unrolls all 16 iterations, it means continuously generating 16 segments of code containing ``memchr`` calls and conditional jumps. Once all this code enters the instruction cache, it may actually cause performance degradation due to cache pressure. The compiler needs to balance "unrolling to reduce branch overhead" against "not blowing out the instruction cache," and this balance point is not easy to find. +This loop only has 16 iterations, and the loop body contains a call to `std::char_traits::find`. If the compiler unrolls all 16 times, it means generating 16 consecutive segments of code containing `std::char_traits::find` calls and conditional jumps. After all this code enters the instruction cache, performance might actually degrade due to cache pressure. The compiler needs to balance "unrolling to reduce branch overhead" and "don't blow up the instruction cache," and this balance point isn't easy to find. 
## Comparing on Compiler Explorer -Paste the code into Compiler Explorer, first compile with Clang trunk (the latest development version), and compare O2 and O3. A noteworthy phenomenon is that the trunk version of Clang may not behave as expected. The aggressive unrolling behavior previously observed on a specific version may have become more "restrained" on trunk. +Paste the code into Compiler Explorer, first compile with Clang trunk (the latest development version), and compare `-O2` and `-O3`. A phenomenon worth noting is: the behavior of the trunk version of Clang might not be as expected. Aggressive unrolling behavior observed on a fixed version might have become more "restrained" on trunk. -Using the trunk version for experiments easily leads to irreproducibility issues, because new commits can change optimization strategies at any time. If you want to reproduce experimental results, we recommend locking to a specific version number, such as Clang 21, rather than using trunk. +Using the trunk version for experiments can easily lead to unreproducible problems, as new commits can change optimization strategies at any time. To reproduce experimental results, it is recommended to lock a specific version number, such as Clang 21, rather than using trunk. ## Analysis Results After Locking the Version -Switch the compiler to Clang 21, keep the target architecture as skylake, and enable O2. The assembly output this time is well worth studying. +Switch the compiler to Clang 21, target architecture remains skylake, enable `-O2`. This time the output assembly is very valuable for study. -First, the call to ``memchr`` disappears—it's not deleted, but inlined. The compiler embeds the core logic of ``memchr`` directly into the loop body, eliminating the function call overhead (pushing to the stack, jumping, returning). Then you'll see some rather complex instructions—not simple ``cmp`` plus ``je``, but AVX2-related vector comparison instructions. 
The compiler recognized that this code is doing byte scanning and directly used SIMD instructions to accelerate it, comparing multiple bytes at once. +First, the call to `std::char_traits::find` disappears—it's not deleted, but inlined. The compiler embeds the core logic of `std::char_traits::find` directly into the loop body, saving the function call overhead (pushing stack, jumping, returning). Then you see some complex instructions, not simple `cmp` plus `jmp`, but AVX2-related vector comparison instructions—the compiler recognizes this code is doing byte scanning and directly uses SIMD instructions to accelerate, comparing multiple bytes at once. -This discovery shows that Clang has special built-in knowledge of standard library functions: it understands the semantics of ``memchr``, and rather than treating it as an ordinary external function call, it can perform further transformations after inlining, including auto-vectorization. +This discovery shows that Clang has special built-in knowledge for standard library functions: it understands the semantics of `std::char_traits::find`, not treating it as a normal external function call, but can do further transformations after inlining, including automatic vectorization. -## A Detail Pending Confirmation +## A Detail to Be Confirmed -In the assembly output, notice a strange immediate number appearing in offset calculations or mask operations. The specific source of this number still needs further confirmation—it might be some kind of alignment-related mask, because when ``memchr`` handles unaligned starting addresses, it needs to process the unaligned head portion first, and then use vector instructions to process the aligned main body. Exactly how this constant is calculated needs to be verified against the implementation of ``memchr`` in glibc. +In the assembly output, notice a strange immediate number appearing in offset calculation or mask operations. 
The specific source of this number needs further confirmation—it might be some mask related to alignment, because `std::char_traits::find` needs to process the unaligned head part first when handling unaligned start addresses, and then use vector instructions for the aligned main body. Specifically how this constant is calculated needs to be verified against the implementation of `memchr` in glibc, which `std::char_traits::find` delegates to for `char`. -However, this doesn't affect the core conclusion of this section: the transformations Clang applies to this code at O2 go far beyond simply "unrolling the loop a few times." It combines ``memchr`` inlining, vectorization, and possible loop strength reduction, generating code that looks completely different from the original C++ code but is semantically equivalent. +However, this doesn't affect the core conclusion of this section: the transformation Clang does on this code at `-O2` goes far beyond just "unrolling the loop a few times." It combines `std::char_traits::find` inlining, vectorization, and possible loop strength reduction. The generated code looks completely different from the original C++ code, but the semantics are equivalent. -## Notes +## Considerations -When switching compiler versions, note that Compiler Explorer's interface sometimes has caching issues—after switching, it may still be using the old version. We recommend checking the full compiler version string displayed in the top-left corner after each switch to confirm it has actually changed. Additionally, specifying ``-march=skylake`` is very important—if you don't specify it, the default is ``-march=x86-64``, and the compiler won't use AVX2 instructions, making the generated assembly much more naive and preventing you from observing the transformations described above. +When switching compiler versions, note that Compiler Explorer's interface sometimes has cache issues; after switching, it might still be using the old version. 
It is recommended to check the full compiler version string displayed in the top left corner after each switch to confirm it has actually switched. Also, specifying `-march=skylake` is very important—if not specified, the default is `-march=x86-64`, and the compiler won't use AVX2 instructions, making the generated assembly much more primitive and unable to observe the transformations mentioned above. -Through this experiment, we can see that the process of compiler loop optimization is no longer a complete black box—at the very least, we can observe what decisions it's making. Next, we'll continue to analyze more complex situations. +Through this experiment, we can see that the process of compiler loop optimization is no longer a complete black box—at least we can observe what decisions it is making. Next, we continue analyzing more complex situations. --- @@ -252,149 +237,115 @@ Through this experiment, we can see that the process of compiler loop optimizati # Using LLMs to Assist Reading Assembly in Compiler Explorer -The traditional way of reading assembly is usually counting instructions one by one—getting nervous when you see a loop, and skipping over instructions you don't recognize. This state of "half-understanding" exists among many developers. Compiler Explorer recently added a feature that submits assembly output to an LLM for it to help explain. This section introduces the experience of using this feature, while also discussing how to systematically read assembly without AI assistance. +Traditional assembly reading is usually instruction by instruction—nervous when seeing loops, skipping when encountering unknown instructions. This state of "half-understanding" exists among many developers. Compiler Explorer recently added a feature: submitting assembly output to an LLM to let it assist in explanation. This section introduces the experience of using this feature and also discusses how to systematically read assembly without AI assistance. 
## Experimental Environment -The experiment uses Chrome to open Compiler Explorer (godbolt.org), with GCC 16.1.1 as the compiler, optimization level -O2, and the C++20 language standard. The assembly generated under different compilers and optimization levels varies greatly, so the results readers see may not be identical to this article, but the overall approach is the same. +The experiment uses Chrome to open Compiler Explorer (godbolt.org), with GCC 16.1.1 as the compiler, optimization level `-O2`, and the C++20 language standard. Generated assembly varies greatly under different compilers and optimization levels, so what readers see might not be exactly the same as here, but the overall approach is similar. ## Starting with an Unfamiliar Instruction -When analyzing some bit-manipulation-related code, an uncommon instruction appeared in the compiler output. Hovering the mouse over it, Compiler Explorer's tooltip was very vague, only stating that it "looks very much like a bitmask," but offering no explanation of what it actually does. +When analyzing a piece of bit-manipulation code, an uncommon instruction appeared in the compiler output. Hovering the mouse over it showed a very vague Compiler Explorer tooltip that only stated it "looks very much like a bitmask," without explaining what the instruction actually does. -Compiler Explorer's hover tooltips are very useful for common instructions (``mov``, ``add``, ``cmp``, etc.)—clicking shows the corresponding source code line. But for this particular instruction, the tooltip was almost empty, or just a very generic description that was no help in understanding the actual logic. +Compiler Explorer's hover tooltips are very useful for common instructions (`mov`, `cmp`, `jmp`, etc.), and clicking one shows the corresponding source line. But for the instruction encountered this time, the tooltip was almost empty, or offered just a very generic description, which was no help in understanding the actual logic.
-Faced with this situation, you can try repeatedly adjusting the compiler's optimization level—switching from -O0 to -O1 and then to -O2, observing whether this instruction transforms into a more understandable form at different optimization levels. In this example, at -O0 it became a more verbose but straightforward instruction sequence, and at -O2 it was folded back into that single incomprehensible instruction. This provided an important clue: this instruction is likely the compiler "compressing" a certain piece of logic into a single processor-native bit manipulation instruction at higher optimization levels. +Facing this situation, you can try repeatedly adjusting the compiler's optimization level—from `-O0` to `-O1` to `-O2`, observing whether this instruction becomes a more understandable form at different optimization levels. In this example, under `-O0` it turned into a much longer but more straightforward instruction sequence, and under `-O2` it was folded back into that single unintelligible instruction. This provides an important clue: this instruction is likely the compiler "compressing" a certain logic into a processor-native bit operation instruction at a higher optimization level. ## Assembly Reading Method Without AI Assistance Without AI assistance, you can build an overall understanding of the assembly output through the following steps. -First, turn off visually distracting display items. Compiler Explorer shows a lot of information by default—instruction addresses, opcode byte representations, source line number annotations, and so on. These are very useful for debugging, but if the goal is to "understand what this code is doing," they actually make the screen cluttered. We recommend turning off "Show instruction addresses" and "Show machine code" in the settings, keeping only the instruction mnemonics and the highlight correspondence with source line numbers. +First, turn off distracting display items. 
Compiler Explorer displays a lot of information by default—instruction addresses, opcode byte representations, source code line number annotations, etc. These are useful when debugging, but if the goal is "understanding what this code is doing," they just make the screen cluttered. It is recommended to turn off "Show instruction addresses" and "Show machine code" in settings, keeping only instruction mnemonics and the highlighting correspondence of source line numbers. -Then, count the loops. This is the fastest way to build assembly intuition. When you see ``jmp`` jumping backward, you know there's a loop here; when you see ``call``, you mark an external function call here; when you see ``ret``, you know this is the end of a function. Through this approach, even without recognizing every instruction, you can make a rough judgment about the code's structure: are there any unexpected loops? Are there calls to unknown functions? How large is the function's stack frame roughly? +Then, count loops. This is the fastest way to build assembly intuition. Seeing `jmp` jumping back, you know there is a loop here; seeing `call`, you mark that an external function is called here; seeing `ret`, you know this is the end of the function. In this way, even without knowing every instruction, you can make a rough judgment of the code structure: are there unexpected loops? Are there calls to unknown functions? How big is the function's stack frame roughly? -Returning to that incomprehensible instruction. An effective strategy is to switch to a different compiler—for example, from GCC to Clang 18, keeping the same source code and optimization level. In Clang's generated assembly, the same logic might use a different instruction sequence. While it still might not be immediately understandable, at least the hover tooltip for each instruction might be more detailed. 
When you're stuck on a particular instruction, comparing with a different compiler often opens up new perspectives—different compilers have different "translation styles" for the same C++ code, and when compiler A uses an instruction you can't understand, compiler B might express the same logic in a more straightforward way. +Back to that unintelligible instruction. An effective strategy is to switch compilers—for example, from GCC to Clang 18, keeping the same source code and optimization level. In Clang's generated assembly, the same logic might use a different instruction sequence. Although still not instantly understandable, at least the hover tooltip for each instruction might be more detailed. When stuck on a certain instruction, switching compilers to compare often opens up ideas—different compilers have different "translation styles" for the same C++ code; if compiler A uses an instruction you don't understand, compiler B might use a more straightforward way to express the same logic. ## Confirming the Meaning of the BT Instruction -Returning to GCC's output, hover the mouse over that instruction again, and the tooltip reveals it is the ``BT`` instruction, short for "Bit Test," which selects and tests a single bit in a bit string. +Returning to GCC's output, re-hover the mouse over that instruction. The tooltip information shows this is the `bt` instruction, short for "Bit Test," which selects a bit in a bit string for testing. -Understanding this explanation, the logic of the entire assembly block clicks into place. The C++ source code does indeed have a bit test operation like ``(1ULL << n) & mask``, and the compiler at -O2 directly mapped it to the x86 ``BT`` instruction, rather than actually doing a shift and then an AND operation. This is a classic compiler optimization: recognizing a bit manipulation pattern in the source code and replacing it with a processor-native instruction, which both reduces the instruction count and improves execution speed. 
+Understanding this explanation, the logic of the entire assembly passage becomes clear. The C++ source code indeed has a bit test operation like `if (flags & (1UL << n))`. Under `-O2`, the compiler maps it directly to the x86 `bt` instruction, rather than actually doing a shift and then an AND operation. This is a classic compiler optimization: recognizing a bit operation pattern in the source code and replacing it with a processor-native instruction, which both reduces the number of instructions and increases execution speed. -This illustrates an important principle: reading assembly doesn't require recognizing every instruction—you only need to grasp the key ones, figure out which source code operation they correspond to, and just glance at the rest of the filler instructions (such as stack frame setup and teardown, parameter passing). +This illustrates an important principle: reading assembly doesn't require knowing every instruction, just grabbing the key few, figuring out which operation in the source code they correspond to, and glancing over the rest of the filler instructions (like stack frame setup and teardown, parameter passing). ## Compiler Explorer's LLM Explanation Feature -Compiler Explorer recently added an option in its interface that submits both the source code and the corresponding assembly output to an LLM, asking it to explain "what's happening here." +Compiler Explorer recently added an option in the interface to submit source code and corresponding assembly output to an LLM together, letting it explain "what happened here." -The LLM's explanation approach is not a line-by-line instruction translation—if it did that, there would be no fundamental difference from manual reading. Instead, it does something more valuable: it divides the assembly into several logical blocks and describes the function of each block. 
For example, it might point out "this is doing pre-loop initialization," "this is a loop body that checks one bit per iteration," or "this is collecting results." This kind of high-level summarization is precisely what's easily overlooked when reading assembly manually—developers tend to get bogged down in the details of individual instructions and forget to step back and look at the overall structure. +The LLM's way of explaining is not translating instruction by instruction—if it did that, there would be no essential difference from manual reading. It does something more valuable: it divides the assembly into several logical blocks and describes the function of each block. For example, it might point out "this is doing initialization before the loop," "this is a loop body, checking one bit per iteration," "this is collecting results." This high-level summary is exactly what manual assembly reading easily misses—developers often get bogged down in the details of individual instructions and forget to step back and look at the overall structure. -## Caveats for Using the LLM Feature +## Considerations for Using the LLM Feature -Although the experience of LLM-assisted explanation is quite good, there are a few key points that need special attention. +Although the experience of AI-assisted explanation is good, there are a few key points to pay special attention to. -First, this feature is currently in beta. The speaker explicitly stated that if it proves too costly or misleading, it may be taken offline. Therefore, don't over-rely on it—just treat it as an auxiliary tool. +First, this feature is currently in beta. The speaker explicitly stated that if it proves too costly or misleading, it might be taken offline. So don't over-rely on it; treat it as an auxiliary tool. -Second, the LLM's explanations are not necessarily correct. 
When testing with assembly containing SIMD instructions (instructions related to ``xmm`` registers), we found that the LLM's explanations for certain instructions were clearly wrong—claiming floating-point instructions were integer operations. Without independent verification, you might accept incorrect explanations. We recommend treating the LLM's explanations as "leads" rather than "answers"—they provide a general direction, but the specifics still need manual confirmation. +Second, the LLM's explanation is not necessarily correct. After testing with assembly containing SIMD instructions (instructions related to `ymm` registers), it was found that the LLM made obvious errors in explaining some instructions—claiming floating-point instructions were integer operations. If not verified oneself, one might accept the wrong explanation. It is recommended to treat the LLM's explanation as a "lead" rather than an "answer"; it provides a general direction, but the specific correctness still needs manual confirmation. -Third, for scenarios involving sensitive code, do not use this feature. The source code and assembly will be sent to an external service. +Third, for scenarios involving sensitive code, do not use this feature. Source code and assembly will be sent to an external service. ## Recommended Assembly Reading Workflow -Combining the above experience, the recommended assembly reading workflow is as follows: first, do a quick manual scan—count loops, find ``call``, look at function boundaries, and build an overall impression; when you encounter an unfamiliar instruction, first hover to see the tooltip, then try comparing with a different compiler; if you still can't figure it out, consider using the LLM-assisted explanation, but be sure to cross-verify its conclusions. 
+Synthesizing the above experience, the recommended assembly reading flow is: first scan through quickly yourself, count loops, find `call`, check function boundaries, build an overall impression; when encountering an unknown instruction, hover to see the tooltip, switch compilers to compare; if still confused, consider using the LLM assist feature, but be sure to cross-verify its conclusions. -Reading assembly doesn't require memorizing an instruction manual, nor does it require understanding the meaning of every byte. The key is to build a "pattern recognition" ability—seeing a certain pattern and knowing roughly what it's doing. Compiler Explorer's tools (source code highlight correspondence, instruction hover tooltips, LLM explanations) are all there to help build this intuition faster. +Reading assembly doesn't require memorizing instruction manuals or understanding the meaning of every byte; the key is to establish a "pattern recognition" capability—seeing a pattern and knowing roughly what it is doing. Compiler Explorer's tools (source highlighting correspondence, instruction hover tooltips, LLM explanation) are all there to help build this intuition faster. --- -# When an AI Points Out a "Clever" Path to You +# When AI Points Out a "Smart" Path to You -Compiler Explorer's Claude Explain feature can directly explain tricks in assembly—for example, "the compiler uses a clever bit manipulation here to pack character validity into a 64-bit value, then uses shifting to look up bits." This level of explanation is indeed very helpful. However, confident expression and correctness are two different things, which we'll discuss in detail shortly. +Compiler Explorer's Claude Explain feature can directly explain tricks in assembly—for example, "the compiler used a clever bit operation here to pack character validity into a 64-bit value, then checked bits by shifting." This level of explanation is indeed very helpful. 
However, confident expression and correctness are two different things, which will be discussed in detail shortly. -Let's first look at the bit manipulation trick itself. The principle is not mysterious—you can see similar techniques in the source code of many string parsing libraries. Below is a hand-written simplified version that can be used to verify your understanding. +Let's first look at the bit operation trick itself. The principle isn't mysterious—similar techniques can be seen in the source code of many string parsing libraries. Below is a manually written simplified version to verify understanding. ## Principle of the Bit Lookup Table Trick -The core idea is: to determine whether an ASCII character belongs to a valid character set (such as "digits 0-9"), the most intuitive way to write it is ``if (c >= '0' && c <= '9')``. But the compiler sometimes won't generate two comparisons plus an AND; instead, it will use a 64-bit lookup table, representing the "validity" of each ASCII character with a single bit, and then querying it through shifting. +The core idea is: to judge if an ASCII character belongs to a valid character set (e.g., "digits 0-9"), the most intuitive way to write it is `(c >= '0' && c <= '9')`. But the compiler sometimes won't generate two comparisons plus an AND; instead, it might use a 64-bit lookup table, representing the "validity" of each ASCII character with one bit, then querying by shifting. ```cpp -// bit_lookup_demo.cpp -#include -#include - -// 手工构造一个查找表:只有 '0'-'9' 对应的位被置1 -// '0' 的 ASCII 值是 48,'9' 是 57 -// 所以我们在 bit 48 到 bit 57 这一段填 1,其余填 0 -constexpr uint64_t make_digit_table() { - uint64_t table = 0; - for (int i = '0'; i <= '9'; ++i) { - table |= (uint64_t{1} << i); - } - return table; +bool is_digit(char c) { + // Assumes ASCII. '0' is 48, '9' is 57. + // We use a 64-bit integer as a bitset. + // Set bits 48-57 to 1. + unsigned long long digit_bits = 0x3FF000000000000ULL; + // Shift 1 into position c (0-127). 
+ return static_cast<unsigned char>(c) < 64 && ((digit_bits >> c) & 1); } +``` -constexpr uint64_t kDigitTable = make_digit_table(); - -// 判断字符是否为数字:把字符值作为位移,看对应位是否为1 -bool is_digit_bitlookup(char c) { - // 注意 c 是 char,可能是有符号的,先转成 unsigned - unsigned char uc = static_cast(c); - // 位移量 >= 64 是未定义行为(C++ 标准 [expr.shift]) - // x86 硬件会将移位量掩码为 6 位,导致 uc=112('p') 实际移位 48 - // 恰好命中 bit 48('0'),产生假阳性:'p'~'y' 被误判为数字 - if (uc >= 64) return false; - return (kDigitTable >> uc) & 1; -} +After compiling and running, the output is fully as expected, and the judgment results for all printable characters are consistent with the naive version. This conclusion has a premise: a version without the range guard—whether it shifts by the raw character value and relies on x86 hardware masking the shift amount to the lower 6 bits (undefined behavior under the C++ standard) or masks it explicitly with `c & 63`—would misjudge 'p' to 'y' (ASCII 112-121) as digits, because the shift amount wraps around to bits 48-57. Adding the range guard `static_cast<unsigned char>(c) < 64`, which rejects every value outside 0-63 before the shift, solves the problem. The advantage of this technique is converting "range judgment" into "one shift plus one AND operation," which can reduce branch prediction pressure on some architectures. Moreover, this technique can be extended—if judging "letters plus digits," just set a few more bits in the table; one 64-bit integer can cover ASCII 0-63, and two can cover up to 127. -// 传统写法,作为对照 -bool is_digit_naive(char c) { - return c >= '0' && c <= '9'; -} +Note: if you use `c` directly for shifting without such a guard, negative `char` values (bytes above 127 on platforms where `char` is signed, as in certain extended character sets) will cause issues because shifting by a negative amount is undefined behavior. Be sure to convert to `unsigned char` first, which is also a point mentioned in the C++ Core Guidelines. Similarly, a shift amount greater than or equal to the bit width (`>= 64`) is also undefined behavior; do not rely on x86's masking behavior.
-int main() { - // 测试所有可打印 ASCII 字符 - bool all_match = true; - for (int i = 32; i < 127; ++i) { - char c = static_cast(i); - if (is_digit_bitlookup(c) != is_digit_naive(c)) { - printf("Mismatch at '%c' (ASCII %d): bitlookup=%d, naive=%d\n", - c, i, is_digit_bitlookup(c), is_digit_naive(c)); - all_match = false; - } - } - if (all_match) { - printf("All printable ASCII chars match!\n"); - } +## Environment Description - // 再测几个边界情况 - printf("'5' is digit: %d\n", is_digit_bitlookup('5')); - printf("'a' is digit: %d\n", is_digit_bitlookup('a')); - printf("NUL is digit: %d\n", is_digit_bitlookup('\0')); - return 0; -} +The experimental environment is Arch Linux WSL LTS (WSL2), compiler GCC 16.1.1, compile command: + +```bash +g++ -O2 -std=c++20 bit_trick.cpp ``` -After compiling and running, the output is completely as expected, and the judgment results for all printable characters are consistent with the naive implementation. This conclusion has a prerequisite: the original version at ``uc >= 64`` relies on x86 hardware's masking behavior for shift amounts (truncating the shift amount to ``shift & 63``), which is undefined behavior under the C++ standard—in practice, 'p' through 'y' (ASCII 112-121) would be misjudged as digits because the shift amounts wrap around to the bit positions of 48-57. Adding the ``uc >= 64`` range guard resolves the issue. The advantage of this technique is that it turns a "range check" into "one shift plus one AND operation," which can reduce branch prediction pressure on some architectures. Furthermore, this technique can be extended—if you need to check "letters plus digits," you just need to set a few more bits in the table; one 64-bit integer can cover ASCII 0-63, and two can cover up to 127. +Using `-O2` is to observe whether the compiler will perform further optimization on the hand-written bit lookup. Interested readers can add `-S` to view the assembly output, then use Compiler Explorer's Claude Explain feature to analyze it. 
-It should be noted that if you directly use ``char c`` for shifting, negative ASCII values (such as those in certain extended character sets) will cause problems, because the behavior of signed right shifts is implementation-defined. You must first convert to ``unsigned char``, which is also a point mentioned in the C++ Core Guidelines. Similarly, shift amounts exceeding the bit width (``uc >= 64``) are also undefined behavior, and you cannot rely on x86's masking behavior. +## Don't Blindly Believe AI Explanations -## Environment Description +The previous part was about understanding the bit operation trick; now comes the warning about AI assistance. -The experimental environment is Arch Linux WSL LTS (WSL2), with GCC 16.1.1 as the compiler, using the following build command: +The speaker shared a personal navigation accident: in a neighborhood where he had lived for 15 years and even delivered newspapers door-to-door for six or seven of them, he decided to detour to the next village to turn around and come back because the main road was blocked by a delivery truck. The core moral of this story is clear: **your domain knowledge of the problem might be more reliable than any "optimal solution" given by an intelligent system—provided you actually have that domain knowledge.** -```bash -g++ -std=c++20 -O2 -Wall -Wextra bit_lookup_demo.cpp -o bit_lookup_demo && ./bit_lookup_demo -``` +Mapping to the programming field, AI tools—whether code completion, assembly explanation, or direct code generation—are indeed becoming increasingly powerful. The fact that Claude Explain can understand bit operation packing techniques proves this. But if you don't understand what that bit operation is doing yourself, you can't judge if the AI is right. If it confidently claims "this is doing a CRC check," and you believe it, you will go astray. -Using ``-O2`` is to observe whether the compiler will further optimize the hand-written bit lookup. 
Interested readers can add ``-S -o -`` to view the assembly output, and then use Compiler Explorer's Claude Explain feature to analyze it. +In actual cases, developers have had AI explain implementation details of `std::variant`, and the AI spoke confidently—"this uses small object optimization, embedding the discriminator into the alignment padding"—which sounds very reasonable, but later verifying against the source code line by line, it was found to have completely misread the offsets; that discriminator wasn't where it said it was at all. If you use this explanation to write code directly, you will most likely introduce bugs. -## Never Blindly Trust AI Explanations +Therefore, the conclusion is: AI is a very good learning partner, especially when you already have a certain foundation and can ask good questions. Claude Explain can help quickly build an intuitive understanding of a piece of assembly, but you still need to verify it yourself. Don't treat AI as an authority—it might sound much more confident than most people, but confidence does not equal correctness. -The preceding section was about understanding the bit manipulation trick; now comes the warning about AI assistance. +Returning to the bit lookup table example: if the AI tells you "the compiler generated a bit lookup table here to do character validation," now you can at least write one yourself to verify if this statement is reasonable, rather than just nodding and accepting. This ability to "verify yourself" is what is truly important. + +--- -The speaker shared a personal navigation mishap: in a neighborhood where he had lived for 15 years, and where he had even delivered newspapers door-to-door for six or seven of those years, the main road was blocked by a truck that had spilled its cargo. He decided to detour to the next village, turn left, and double back. 
The core lesson of this story is clear: **your domain knowledge of a problem may be more reliable than any "optimal solution" given by an intelligent system—provided you actually have that domain knowledge.** +# From Navigation Accidents to Toolchain Traps: Don't Blindly Believe Technical Solutions -Mapping this to the programming world, AI tools—whether code completion, assembly explanation, or direct code generation—are indeed becoming increasingly powerful, and the fact that Claude Explain can understand bit manipulation packing tricks demonstrates this. But if you don't understand what that bit manipulation is doing yourself, you can't judge whether the AI is correct. If it confidently claims "this is doing CRC checking," and the developer believes it, things will go off track. +The speaker shared an impressive satellite navigation accident: he followed the navigation down a "private road," and the car ended up stuck firmly in a farm track, unable to get out for four or five hours. During this time, a person walking a dog passed by and comforted him, saying "don't worry, delivery trucks get stuck here often." Finally, he managed to escape, and afterwards he went to OpenStreetMap and corrected that place, marking it as "impassable, dead end at far side." -In a real case, a developer asked an +This story has strong similarities to the daily experience of C++ developers. When configuring CMake cross-compilation toolchains, many developers have had similar experiences: a certain online tutorial (equivalent to "satellite navigation") confidently states that you just need to set `CMAKE_SYSTEM_NAME` to Linux and specify `CMAKE_C_COMPILER`. 
Every step seems to make sense, and the path is clear, but the compiled binary doesn't run on the target board at all—because it links the diff --git a/documents/en/vol10-open-lecture-notes/cppcon/2025/02-some-assembly-required/06-toolchain-and-project-design.md b/documents/en/vol10-open-lecture-notes/cppcon/2025/02-some-assembly-required/06-toolchain-and-project-design.md index bd32f6710..6f1bad32c 100644 --- a/documents/en/vol10-open-lecture-notes/cppcon/2025/02-some-assembly-required/06-toolchain-and-project-design.md +++ b/documents/en/vol10-open-lecture-notes/cppcon/2025/02-some-assembly-required/06-toolchain-and-project-design.md @@ -1,6 +1,6 @@ --- title: Compilers, Toolchains, and Project Design Baselines -description: 'CppCon 2025 Talk Notes — C++: Some Assembly Required by Matt Godbolt' +description: 'CppCon 2025 Talk Notes — C++: Some Assembly Required by Matt Godbolt' conference: cppcon conference_year: 2025 talk_title: 'C++: Some Assembly Required' @@ -20,281 +20,250 @@ chapter: 2 order: 6 translation: source: documents/vol10-open-lecture-notes/cppcon/2025/02-some-assembly-required/06-toolchain-and-project-design.md - source_hash: 785fd22acb2fbd9570aed5010e48a4ecea572db0cb90cbd1072483bbf7766425 - translated_at: '2026-05-26T11:15:52.905116+00:00' + source_hash: 48cea3f59be3f3407d4ebe4296f2a3d9d0c0984515c0903ff237dd833323c7b4 + translated_at: '2026-06-13T11:48:31.424972+00:00' engine: anthropic - token_count: 3237 + token_count: 3268 --- -# The C++ Assembly Project: Compilers, Toolchains, and Those "Non-Standard but Excellent" Libraries +# The C++ Assembly Project: Compilers, Toolchains, and "Non-Standard but Excellent" Libraries -Many developers' understanding of the C++ ecosystem stops at "the language itself plus the standard library" — write code, compile, run, done. But if we trace the entire engineering workflow, the C++ language itself is just one small piece. +Many programmers' understanding of the C++ ecosystem stops at "the language itself plus the standard library"—write code, compile, run, done. But if we walk through the entire engineering workflow, we realize that the C++ language itself is just a small piece of the whole project.
To actually assemble a set of components into something that runs, we need far more than just C++ syntax. Today, we want to talk about this "assembly" process and the infrastructure that supports it. +Many programmers' understanding of the C++ ecosystem stops at "the language itself plus the standard library"—write code, compile, run, done. But if we walk through the entire engineering workflow, we realize that the C++ language itself is just a small piece of the whole project. To actually assemble a set of components into something that runs, we need much more than just C++ syntax. Today, I want to discuss this "assembly" process and the infrastructure that supports it. -## First, a mindset correction: not all great things make it into the standard +## First, a Correction: Not All Good Things Enter the Standard -Many people harbor a deep-rooted misconception that if a library is good enough and important enough, it "should" be adopted into the standard library. For example, seeing `std::optional` land in C++17 and `std::format` land in C++20, they naturally assume this is the ultimate destination for all excellent libraries. But that's simply not how it works. +Many people have a deep-seated misconception that if a library is good enough and important enough, it "should" be included in the standard library. For example, seeing `std::optional` enter C++17 and `std::format` enter C++20, they take it for granted that this is the destination for all excellent libraries. But in reality, that's not how it works at all. -The standardization process has its own logic and thresholds. Some library patterns might not be suitable for the standard at all, or the maintainers may never have intended to submit them — they exist as independent, high-quality libraries, ready to be used directly. 
The most typical example is Abseil, Google's open-source C++ library collection packed with highly practical components like enhanced versions of `absl::StatusOr`, `absl::Span`, and `absl::string_view`. They haven't entered the standard, nor do they need to, but their quality is top-notch, and many production environments rely on them. +The standardization process has its own logic and thresholds. Some library patterns may simply be unsuitable for the standard, or the maintainers never intended to send them there—they exist as independent, high-quality libraries that you can just use directly. The most typical example is Abseil. This set of C++ libraries open-sourced by Google contains many very practical components, like enhanced versions of `optional`, `span`, and `string_view`. They haven't entered the standard, nor do they need to, but their quality is extremely high, and they are used in many production environments. -Another point worth noting: it's not only massive projects backed by large corporations that make it into the standard. Small consortia or even individuals can get code into the standard, provided their proposals are solid and well-argued. Of course, alliances formed by GPU vendors and large HPC institutions do have strong pushing power, which is why things like parallel computing and SIMD have advanced so quickly in the standard. But the key point is that the channel is open — it's not a game exclusively for giants. +Another point worth noting: It's not only massive projects backed by big companies that can enter the standard. Small alliances or even individuals, as long as their proposal quality is solid and the argument is sufficient, can get code into the standard. Of course, alliances formed by GPU vendors and large HPC institutions do have strong push on the standard, so things like parallel computing and SIMD have advanced particularly quickly. But the key is that the channel is open; it's not just for giants. 
-So the right mindset should be: stop staring at the standard library waiting for "official solutions," and instead actively seek out those mature, high-quality third-party libraries. Although the C++ ecosystem lacks a centralized distribution system like Rust's crates.io, making library discovery a bit more effortful, great libraries do exist. +So the correct mindset should be: Stop staring at the standard library waiting for "official solutions," and instead actively seek out those mature, high-quality third-party libraries. Although the C++ ecosystem isn't as centralized as Rust's crates.io and finding libraries is indeed a bit harder, the good stuff is out there. -## The real assembly begins only after you finish writing code +## The Real Assembly Starts After the Code is Written -Alright, let's assume we've selected our components and finished writing the code. What's next? Turning C++ code into an executable requires far more than just C++ itself. +Okay, let's assume we've selected our components and written the code. What's next? Turning C++ code into an executable file requires much more than just C++. -First, we need a compiler. We're actually incredibly lucky right now to have three major players at our disposal: GCC, Clang, and MSVC, plus EDG (mainly used for standard conformance testing and certain commercial scenarios). These compilers are all high quality, and some of them are open-source projects maintained by the community. You might take this for granted, but a look back at history shows just how far we've come. +First, we need a compiler. We are actually quite lucky now to have three major players: GCC, Clang, and MSVC, plus EDG (mainly used for standard compliance testing and certain commercial scenarios). These compilers are high quality, and some of them are open-source projects maintained by the community. You might take this for granted, but looking back at history shows how far we've come. 
-The earliest C++ compiler was essentially Cfront, written by Bjarne Stroustrup — a C++-to-C translator that took C++ code, converted it to C code, and then fed that intermediate output to a regular C compiler. C++ originally "parasitized" the C compilation infrastructure. +The earliest C++ compilers were essentially Cfront written by Bjarne Stroustrup—a C++ to C translator. It took C++ code, converted it into C code, and then used a normal C compiler to compile that intermediate product. C++ was initially "parasitic" on C's compilation infrastructure. -Today, things are completely different. Both GCC and Clang have mature C++ frontends, and their support for various standard versions keeps improving. My current primary environment is GCC 16.1.1 running on Arch Linux WSL, with Clang 17 used for cross-validation, and occasionally MSVC 19.38 on Windows to ensure cross-platform compatibility. I've stepped on quite a few landmines regarding toolchain versions, which I'll cover in a separate post. +Now, of course, it's completely different. GCC and Clang both have mature C++ frontends, and support for various standard versions is getting better and better. My current main environment is GCC 16.1.1 on Arch Linux WSL, with Clang 17 for cross-validation, and occasionally MSVC 19.38 on Windows to ensure cross-platform compatibility. I've stepped into quite a few pits with toolchain versions; I'll write a separate post about that later. -But the compiler is only the first step. After compiling individual translation units into object files, we need a linker to stitch them together. Many people have used C++ for years without ever giving the linker a second glance — because in most cases, a single `g++ main.cpp other.cpp` command gets the job done, and the linker works silently in the background, its presence barely felt. 
That is, until you hit a bizarre ODR (one definition rule) violation causing a link error — the same inline function expanded into different versions in two translation units, and the linker throws a completely incomprehensible symbol conflict. Only then do you realize just how complex and important the linker really is. +But the compiler is just the first step. After compiling individual translation units into object files, we need a linker to stitch them together. Many people have used C++ for years without giving the linker a second thought—because in most cases, a single `g++` command handles it, and the linker works silently in the background, unnoticed. It's not until you encounter a weird ODR (One Definition Rule) violation causing a linker error—where an inline function expands into different versions in two translation units, and the linker reports an incomprehensible symbol conflict—that you realize how complex and important the linker really is. -The core point here is this: when people complain that "C++ is hard to use," they're often not complaining about the C++ language itself, but about some环节 in this assembly process — maybe the compiler spewed a screenful of incomprehensible template errors, maybe the linker can't find a symbol, or maybe they don't know how to integrate a third-party library correctly. If we break these steps down, each one has corresponding tools and solutions; they're just scattered everywhere, waiting to be assembled by you. +The core point is: When complaining that "C++ is hard to use," often what you're actually complaining about isn't the C++ language itself, but some part of this assembly process. It might be the compiler spitting out a screen full of unintelligible template errors, or the linker not finding symbols, or not knowing how to integrate third-party libraries correctly. If we break down these steps, each has corresponding tools and solutions; they are just scattered around and need to be assembled yourself. 
-## A simple example to experience "assembly" +## A Simple Example to Experience "Assembly" -Here's a tiny example that doesn't involve any complex logic — it simply demonstrates what the compiler and the linker each do during the process of going from "multiple source files" to "one executable." +Here is a very small example. It doesn't involve any complex logic; it just demonstrates what the compiler and linker are doing respectively in the process of turning "multiple source files" into "one executable file." -First is the header file `math_utils.h`, which just declares a function: +First is the header file `math_utils.h`, just declaring a function: ```cpp // math_utils.h -// constexpr 函数隐式 inline([dcl.constexpr]/1),因此可以放在头文件中 -// 而不会违反 ODR——编译器也可能在编译期直接求值 -constexpr int square(int x) { - return x * x; -} - -// 这个函数有定义,放在头文件里,inline 防止 ODR 违规 -inline int add_one(int x) { - return x + 1; -} +#pragma once +int add(int a, int b); ``` -Then there's another header file `format_utils.h`, which depends on the `math_utils.h` above: +Then is another header file `utils.h`, which depends on the `add` above: ```cpp -// format_utils.h +// utils.h +#pragma once #include "math_utils.h" -#include - -// 把计算结果格式化成字符串 -// 这里故意不用 std::format(C++20),用 to_string 保持简单 -inline std::string describe(int x) { - return "value=" + std::to_string(add_one(square(x))); -} +void print_add(int a, int b); ``` Finally, `main.cpp`: ```cpp // main.cpp -#include "format_utils.h" -#include - +#include "utils.h" int main() { - int input = 5; - std::cout << describe(input) << std::endl; + print_add(1, 2); return 0; } ``` -This example is almost absurdly simple, but that makes it perfect for demonstrating the step-by-step execution of the compilation process. You can manually control each step with the following commands: +This example is so simple it's silly, but it's perfect for demonstrating the step-by-step execution of the compilation process. 
You can manually control every step with the following commands: ```bash -# 第一步:只预处理,看编译器看到了什么 +# Step 1: Preprocess (stop after preprocessing) g++ -E main.cpp -o main.ii -# 第二步:只编译不链接,生成目标文件 -g++ -c main.cpp -o main.o +# Step 2: Compile to assembly (stop after compilation, skip assembly) +g++ -S main.cpp -o main.s -# 第三步:链接(这个例子只有一个 .o,所以链接很简单) -g++ main.o -o main +# Step 3: Assemble to object file +g++ -c main.cpp -o main.o +g++ -c utils.cpp -o utils.o -# 运行 -./main -# 输出:value=26 +# Step 4: Link object files to executable +g++ main.o utils.o -o my_app ``` -If you use `-E` to inspect the preprocessed `main.ii` file, you'll find that the contents of `math_utils.h` and `format_utils.h` have both been expanded into it. This is why function definitions in header files need `inline` or `constexpr` — otherwise, if two different `.cpp` files both include the same header, the linker will see two copies of the function definition and immediately flag an ODR violation. +If you use `cat` to look at the preprocessed `main.ii` file, you'll see the contents of `utils.h` and `math_utils.h` have all been expanded into it. This is why function definitions in header files need `inline` or `constexpr`—otherwise, if two different `.cpp` files include the same header file, the linker will see two copies of the function definition and report an ODR violation directly. -There's a common misconception about `inline`: many people think it's merely a "hint suggesting the compiler inline the function." But in reality, the true purpose of `inline` in C++ is to allow the same function to be defined in multiple translation units without violating the ODR. Whether the compiler performs inline optimization is entirely up to it, and it has no necessary connection to whether you say `inline` or not. +There is a common misconception about `inline`: many people think it's just a hint to "suggest the compiler inline."
But actually, `inline`'s true role in C++ is to allow the same function to be defined in multiple translation units without violating the ODR. Inline optimization is whatever the compiler wants to do; it has no necessary relationship to whether you say `inline` or not. -## Compiler selection: current practice +## Compiler Selection: Current Practice -My daily development relies primarily on GCC, supplemented by Clang. The reason is simple: GCC has the best ecosystem on Linux, and I'm familiar with its error messages. Clang's error diagnostics are genuinely friendlier in certain scenarios (especially template-related ones), so when I encounter an error I can't decipher, I switch to Clang and compile again to look at the problem from a different angle. +Daily development is basically GCC-centric, with Clang as a backup. The reason is simple: GCC has the best ecosystem on Linux, and I'm familiar with its error messages; Clang's error hints are indeed friendlier than GCC in some scenarios (especially templates), so when I encounter an error I don't understand, I switch to Clang to compile again, looking at the problem from another angle. ```bash -# 同一份代码,用两个编译器各编一次,对比报错信息 -g++ -std=c++20 -Wall -Wextra main.cpp -o main_gcc -clang++ -std=c++20 -Wall -Wextra main.cpp -o main_clang +# Compile with GCC +g++ main.cpp -o main_gcc -Wall -Wextra + +# Compile with Clang +clang++ main.cpp -o main_clang -Wall -Wextra ``` -I strongly recommend building this habit. For the same compilation error, GCC might spit out a full screen of template instantiation backtraces, while Clang can sometimes pinpoint the issue more concisely. The reverse is also true — sometimes GCC explains things more clearly. Cross-validating with two compilers saves a lot of time. +I strongly recommend forming this habit. For the same compilation error, GCC might spit out a screen of template instantiation backtraces, while Clang can sometimes point out the problem in a more concise way. 
The reverse is also true; sometimes GCC is clearer. Cross-validating with two compilers can save a lot of time. -I use MSVC less often, but if a project needs to be cross-platform, occasionally compiling with MSVC on Windows is absolutely necessary. Different compilers occasionally have subtle differences in their interpretation of the standard, and discovering these early is far better than encountering problems after deployment. +I use MSVC less, but if the project needs to be cross-platform, compiling with MSVC on Windows occasionally is very necessary. Different compilers occasionally have subtle differences in interpreting the standard; discovering them earlier is better than having problems after going live. --- -# Editors and Build Systems: From "Good Enough to Write In" to the Pitfalls of Modules +# Editors and Build Systems: From "Just Works" to the Pitfalls of Modules -## Editors: please help me understand this code +## Editors: Please Help Me Understand This Code -When it comes to editor choices, many people have taken quite a long detour. When I first started learning C++, I used VS Code with a rudimentary C/C++ extension — code completion would often take forever to pop up, and error messages were always red squiggly lines that didn't speak human. At the time, I even thought, "I guess C++ development is just like this; editors can't help you much." Later, when I saw CLion's code completion, refactoring, and real-time static analysis, it hit me — it wasn't that C++ was incapable, it was that the tool was inadequate. +Regarding editor selection, many people have indeed taken a long detour. When I started learning C++, I used VS Code with a rudimentary C/C++ plugin. Code completion took forever to pop up, and error messages were always red squiggles that didn't speak human. I even thought "C++ development is just like this; editors can't help you much." 
Later, seeing CLion's code completion, refactoring, and real-time static analysis, I realized—it's not that C++ is bad, it's that the tools were bad. -But I don't want to start an "editor holy war" here. I just want to say one thing: **never mix spaces and tabs**. I once took over a project where spaces and tabs were interleaved. In the editor, the indentation looked perfectly normal, but once pushed to CI, the formatting was completely garbled, and the error locations didn't match the actual code. Ever since then, I always configure `.clang-format` in my projects, enforcing spaces uniformly and leaving no room for mixing. +But I don't want to start an "editor war" here. I just want to say one thing: **Never mix spaces and tabs**. I once took over a project where spaces and tabs were mixed. The indentation looked completely normal in the editor, but once pushed to CI, the formatting was all messed up, and error lines didn't match the actual code. Since then, I always configure `.editorconfig` in projects to unify spaces, leaving no room for mixing. -Speaking of the editor ecosystem, we're actually at a very interesting stage right now. Terminal-dwelling Vim/Neovim users can achieve an experience very close to that of an IDE through clangd + LSP, with code completion, go-to-definition, and hover documentation all readily available. But personally, CLion works out of the box, its CMake integration is native-level, and creating a new project with a configured CMakeLists.txt lets you hit run immediately — no need to spend two days configuring an editor. Time should be spent understanding C++, not configuring editors. +Speaking of the editor ecosystem, we are actually at a very interesting stage now. Terminal Vim/Neovim users can achieve an experience very close to an IDE via clangd + LSP, with code completion, go-to-definition, and hover docs all available. But personally, CLion is ready-to-use with native-level CMake integration. 
Create a new project, configure `CMakeLists.txt`, click run, and it goes—no need to spend two days configuring the editor. Time should be spent understanding C++, not configuring the editor. -Lately, however, I've been running into a scenario more and more frequently where no editor can help. I'll write a piece of fairly complex logic using several lambda expressions for callback registration. At the time, it feels crystal clear. Three days later, I come back and have absolutely no idea what that code is doing. I even pasted the code into CLion's built-in AI assistant and asked it to explain — after reading the explanation, I was still only half-understanding. What does this tell us? It tells us that tools can help you write code and find bugs, but they can't help you **think**. Code readability ultimately depends on the design of abstraction layers. I've fallen into this trap way too many times. +However, I've recently encountered a scenario more and more frequently where no editor can help. I write a piece of complex logic using several lambdas for callback registration. It feels very clear when writing it, but three days later, looking back, I have no idea what that code is doing. I even pasted the code to CLion's built-in AI assistant to explain it, and after reading the explanation, I still only half-understood. What does this show? It shows that tools can help you write code and find bugs, but they can't help you **think**. Code readability ultimately relies on the design of abstraction layers; I've stepped in this pit too many times. -## Build systems: thought CMake was the hardest, until I met Modules +## Build Systems: Thought CMake Was Hard, Until I Touched Modules -If the editor is the "experience of writing code," then the build system is the "experience of getting code to run." And in C++, how should I put it — this experience often makes you want to smash your keyboard. 
+If the editor is the "writing experience," then the build system is the "running experience," and in C++, well, this experience often makes you want to smash your keyboard. -I used to think CMake was torturous enough. Things like the `target_link_libraries` parameter passing style, whether to use `PUBLIC` `PRIVATE` or `INTERFACE`, and how to troubleshoot when `find_package` can't find a package — it took me over half a year to become reasonably proficient. But no matter how hard CMake is, it's at least something where "study it and you can get started." The documentation may read like hieroglyphics, but at least it exists. +I used to think CMake was torture enough. How arguments are passed, whether to use `target_link_libraries`, `target_include_directories`, or `target_compile_options`, how to troubleshoot when `find_package` can't find a package—it took me more than half a year to get proficient. But as hard as CMake is, it's at least something you can "learn and get started with," and although the documentation reads like hieroglyphics, at least there is documentation. -Then I tried C++20 Modules. When I first heard about Modules, I was thrilled — finally, no more suffering through the compilation speed issues of header inclusion. Then I actually tried it — first of all, CMake's support for Modules in early versions was extremely rough. You had to manually specify how `.cppm` files were compiled into module interface units and module implementation units, and the module file formats differed between compilers: GCC uses `.gcm`, Clang uses `.pcm`, and MSVC uses yet another format. Then there's the circular dependency problem. In the traditional header era, you could use forward declarations to break circular dependencies, but in the Modules world, this approach doesn't quite work the same way. I was stuck on this for three days, only to realize that my understanding of "module partitions" was fundamentally wrong. +Then I tried C++20 Modules.
When I first heard about Modules, I was excited, thinking finally I wouldn't have to suffer the slow compilation speed of header inclusion. Then I tried it—first of all, CMake's support for Modules in early versions was very rough. You had to manually specify how `.cpp` files compile into module interface units vs. module implementation units. Module file formats differed between compilers: GCC uses `.gcm`, Clang uses `.pcm`, and MSVC uses another set. Then you hit circular dependency issues. In the traditional header era, you could use forward declarations to break circular dependencies, but in the Modules world, this approach isn't quite the same. I was stuck on this pit for three days, finally realizing my understanding of "module partitions" was simply wrong. -Here's a minimal working example I cobbled together at the time. The code itself isn't complex, but getting it to work took an entire weekend: +Here is a minimal runnable example I cobbled together at the time. The example itself isn't complex, but getting it working took a whole weekend: ```cpp -// math_utils.cppm (模块接口单元) -module; -#include // module 声明之前的 #include 是全局模块片段,这里放传统头文件 -export module math_utils; // 声明模块名 - -export double compute_sqrt(double x) { - return std::sqrt(x); -} +// math.ixx (module interface) +export module math; -export namespace stats { - double mean(const double* data, size_t count) { - double sum = 0.0; - for (size_t i = 0; i < count; ++i) { - sum += data[i]; - } - return sum / count; - } +export int add(int a, int b) { + return a + b; } ``` ```cpp -// main.cpp (消费者) -import math_utils; // 不是 #include,是 import -#include +// import math module and use it +import std; +import math; int main() { - std::cout << "sqrt(16) = " << compute_sqrt(16.0) << "\n"; - double data[] = {1.0, 2.0, 3.0, 4.0, 5.0}; - std::cout << "mean = " << stats::mean(data, 5) << "\n"; + std::cout << "3 + 5 = " << add(3, 5) << std::endl; return 0; } ``` ```cmake -# CMakeLists.txt cmake_minimum_required(VERSION 3.28)
-project(module_test CXX) +project(MathModuleExample LANGUAGES CXX) -# 必须显式开启,而且不同编译器行为有差异 set(CMAKE_CXX_STANDARD 20) set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_SCAN_FOR_MODULES ON) + +add_executable(app + main.cpp +) -add_executable(module_test main.cpp math_utils.cppm) -target_compile_features(module_test PRIVATE cxx_std_20) +# CMake 3.28+ handles module dependencies automatically if configured correctly +target_sources(app PUBLIC + FILE_SET CXX_MODULES FILES math.ixx +) ``` -You see, the code itself is actually very intuitive — `export` marks what's publicly visible, `import` replaces `#include`, and conceptually it's much cleaner than headers. But to get these few lines running, you need CMake 3.28 or later, a compiler with sufficient C++20 modules support, and a CMakeLists.txt that's configured correctly. I initially tried with CMake 3.25 and got a direct error saying it couldn't find the module. I was stuck for two hours before realizing it was a version issue. +You see, the code itself is very intuitive. `export` marks what is visible, `import` replaces `#include`, and conceptually it's much cleaner than headers. But to get these few lines running, you need CMake 3.28 or above, sufficient compiler support for C++20 modules, and the configuration in `CMakeLists.txt` must be correct. I initially tried with CMake 3.25 and got an error saying it couldn't find the module. I was stuck for two hours before realizing it was a version issue. -There's another easily overlooked limitation: CMake 3.28's support for C++20 modules is restricted to the Ninja generator and Visual Studio 2022 and later. Using the traditional Makefile generator currently doesn't work. This is a fairly hidden pitfall — once you step on it, you remember it. +There's another easily overlooked limitation: CMake 3.28's support for C++20 modules is limited to the Ninja generator and Visual Studio 2022 and above. Using the traditional Makefile generator currently doesn't work.
This is a relatively hidden pit; you remember it once you step in it. -And this is only the simplest case — a single module, no partitions, no dependencies on other modules. Once the project scales up and modules start importing each other, deducing the build order becomes a nightmare. After talking with several people, I found that everyone has tripped over Modules build configuration. This isn't an isolated case. +And this is just the simplest case—single module, no partitions, no dependencies on other modules. Once the project scales up, modules import each other, and deriving the build order becomes a nightmare. After talking to quite a few people, I found everyone has tripped over Modules build configuration; this isn't an isolated case. --- -# Designing for Humans: The Baseline for Project Design +# Designing for Humans: The Bottom Line of Project Design -When hearing the talk's point about "designing for humans," many people's vague intuitions suddenly gained a clear framework. +When hearing the talk about "designing for humans," many people's vague intuitions suddenly found a clear framework. -I used to have a misconception that a C++ project's impressiveness was measured by how flashy its template metaprogramming was or how sophisticated its build system was. After being brainwashed by various "modern C++ best practices," I felt projects should be equipped with a full set of intricate CMake scripts. The result? I built a few such projects, felt pretty great at the time, but came back a month later to modify the code only to find it wouldn't even compile — because some dependency had bumped its version and changed its interface, and there was a hardcoded version number buried in that sophisticated script. I was stuck for ages, eventually deleted the entire build directory and started over, wasting another two hours. I was actually doing myself a disservice. 
+I used to have a misconception, thinking that whether a C++ project is awesome depends on how flashy its template metaprogramming is or how sophisticated its build system is. After being brainwashed by various "Modern C++ Best Practices," I thought a project should be equipped with a full set of sophisticated CMake scripts. The result? I built a few such projects, felt cool at the time, but came back a month later to modify code and found it wouldn't even compile—because a dependency upgraded and changed an interface, and there was a hardcoded version number in that sophisticated script. I was stuck for half a day, finally deleting the whole build directory and starting over, wasting another two hours. This is actually doing myself a disservice. -The talk made a crucial point: if your project is cumbersome to build, requires people to install four hundred global packages, and those packages are incompatible with their machines, you're keeping potential contributors out. Many people have had this experience — you want to submit a PR to a well-known C++ library to fix an obvious issue, but the README reads like hieroglyphics, the dependency list spans two pages, and it requires specific versions of Boost and LLVM. After struggling all night without getting it to run, you quietly close that PR page the next day and never go back. It's not that you don't want to contribute; it's that your patience has been exhausted. +The talk mentioned a key point: If your project is troublesome to build, requiring others to install four hundred global packages that conflict with their computer, you are blocking potential contributors. Many people have had this experience—wanting to submit a PR to a famous C++ library to fix an obvious problem, but the README reads like a heavenly book, the dependency list is two pages long, and it requires specific versions of Boost and LLVM. 
After messing around all night without getting it to run, they quietly close that PR page the next day and never go back. It's not that they don't want to contribute; it's that their patience has run out. -So when building a project, we should hold one hard baseline: a person who knows nothing about the project should be able to go from `git clone` to running their first hello world in under five minutes. I tested this idea with a small tool I've been writing recently, and the results were remarkably good. +So when building a project, we should stick to a bottom line: For a person who knows nothing about the project, the time from `git clone` to running the first "hello world" should not exceed five minutes. I verified this idea with a small tool I've been writing recently, and the result was surprisingly good. -First, the directory structure — deliberately kept very flat: +First, look at the directory structure, deliberately kept very flat: ```text my_tool/ -├── CMakeLists.txt ├── src/ -│ └── main.cpp +│ ├── main.cpp +│ └── utils.cpp ├── include/ -│ └── my_tool.hpp +│ └── utils.h +├── CMakeLists.txt └── README.md ``` -No submodules, no complex directory nesting. The CMakeLists.txt is also written as straightforwardly as possible: +No submodules, no complex directory nesting.
`CMakeLists.txt` is also written as straightforwardly as possible: ```cmake -cmake_minimum_required(VERSION 3.16) -project(my_tool LANGUAGES CXX) +cmake_minimum_required(VERSION 3.15) +project(MyTool LANGUAGES CXX) -set(CMAKE_CXX_STANDARD 20) +set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) -# 核心就这几行:找依赖、加可执行文件、链接 -find_package(fmt REQUIRED) - -add_executable(my_tool src/main.cpp) +add_executable(my_tool src/main.cpp src/utils.cpp) target_include_directories(my_tool PRIVATE include) -target_link_libraries(my_tool PRIVATE fmt::fmt) ``` -The README.md was also rewritten, dropping the usual "feature list + a bunch of badges" style in favor of directly telling people how to get it running: - -```markdown -# my_tool +`README.md` was also rewritten. No longer a "feature list + bunch of badges" style, it directly tells how to run it: -一个做 XXX 的小工具。 +````markdown +# MyTool -## 构建 +A simple tool to do X. -前提:你需要一个支持 C++20 的编译器,以及 fmt 库。 +## Build -Ubuntu/Debian: - sudo apt install libfmt-dev g++ +Requires CMake 3.15+ and a C++17 compiler. -macOS: - brew install fmt - -然后: - mkdir build && cd build - cmake .. -DCMAKE_BUILD_TYPE=Release - make -j$(nproc) - -构建产物在 build/my_tool。 +```bash +git clone https://github.com/user/my_tool.git +cd my_tool +mkdir build && cd build +cmake .. +cmake --build . +./my_tool +``` -## 踩坑记录 +## Pitfalls -- 如果你用的是 GCC 11 以下,可能遇到 XXX 问题,升级到 GCC 12 即可 -- fmt 版本需要 >= 9.0,太旧的话会报 XXX 错误 -``` +If you see `error: 'filesystem' not found`, try adding `-std=c++17` manually or upgrading GCC. +```` -Note the "pitfall notes" section at the end — I added this after stepping on those landmines myself. I used to think writing such things was "unprofessional," but now I think this is the most professional part. Because you're saving time for the next person who comes along, and saving time is the greatest kindness. +Note the "Pitfalls" section at the end—I added this after stepping in a pit myself. 
I used to think writing this kind of thing was "unprofessional," but now I think this is the most professional part. Because you are saving time for the next person, and saving time is the greatest kindness. -I tested this project with two colleagues — one primarily writes Python, the other primarily writes Java — and both got it running within three minutes. The Python colleague even said, "This is easier to set up than a lot of Python projects." Getting a C++ project complimented for "simple configuration" — that would have been unthinkable before. +I asked two colleagues about this project, one mainly writing Python and one mainly writing Java. Both got it running within three minutes. The Python colleague even said, "This is simpler than configuring the environment for many Python projects." For a C++ project to be praised for "simple configuration," that was unthinkable before. -The talk also made a particularly forward-looking point: if you make your project easy to drop into and out of, you're not just helping humans, you're also helping AI agents. I've definitely experienced this recently. When using Cursor to assist with coding, I've found that if a project has a clean structure, few dependencies, and a simple build, the AI can understand more project context and give more reliable suggestions. Conversely, if the project is full of nested custom compiler flags and implicit macro definitions, the AI often gives suggestions that "look right but don't actually run," because it fundamentally doesn't understand what's happening in that complex build environment. +The talk also mentioned a particularly forward-looking point: If you make your project easy to enter and exit, you are not only helping humans, but also helping AI agents. I've definitely felt this recently. 
When using Cursor to assist in coding, I found that if a project has a clear structure, few dependencies, and simple builds, the AI can understand more project context and give more reliable suggestions. Conversely, if the project has a bunch of nested custom compiler flags and implicit macro definitions, the AI often gives suggestions that "look right but don't actually run," because it doesn't understand what's really happening in that complex build environment. -Seeing template errors gives humans a headache — it gives AI a headache too. When it sees a two-hundred-line template instantiation error stack, its response is often generic and vague. But if the project itself is clean and highly modular, the error messages will be much shorter, and both AI and humans will locate problems much faster. So "designing for humans" and "designing for AI" are actually unified on this point: both are about reducing cognitive load. +Template errors give me a headache, and AI gets a headache too—when it sees a template instantiation error stack two hundred lines long, the response is often generic. But if the project itself is clean and highly modular, error messages are much shorter, and AI (as well as humans) can locate problems much faster. So "designing for humans" and "designing for AI" are actually unified on this point: both are about reducing cognitive load. -Looking back, the principle is simple. We write code that is ultimately read and used by humans. The compiler only cares whether the syntax is correct, but humans care about "can I quickly understand what this project does, and can I quickly make my changes and move on." Making complex things simple — that's the real skill. +Looking back, the principle is simple. We write code, ultimately for people to read and for people to use. The compiler only cares if the syntax is correct, but people care about "can I quickly understand what this project does, and can I quickly fix it and leave." 
Making complex things simple is the real skill. -It finally clicked — in the process of assembling a C++ program, those tools, those libraries, and those build systems are all just parts. But the person actually holding these parts and putting them together — that's what matters most. Ignore that, and even the most precision-engineered parts are just a pile of scrap metal. +Finally, I get it—in the process of assembling a C++ program, those tools, those libraries, and those build systems are all parts, but the person holding those parts and doing the assembling is the most important. If you ignore that, the most sophisticated parts are just a pile of scrap metal. + +--- + +## Further Reading + +- The core of the toolchain is compiler flags. To systematically organize common GCC/Clang compiler options and trade-offs, see [Volume 7 · Compiler Options](../../../../vol7-engineering/02-compiler-options.md). diff --git a/documents/en/vol10-open-lecture-notes/cppcon/2025/02-some-assembly-required/07-wg21-standardization-and-assembly-philosophy.md b/documents/en/vol10-open-lecture-notes/cppcon/2025/02-some-assembly-required/07-wg21-standardization-and-assembly-philosophy.md index cf61a5b35..2f2ecee77 100644 --- a/documents/en/vol10-open-lecture-notes/cppcon/2025/02-some-assembly-required/07-wg21-standardization-and-assembly-philosophy.md +++ b/documents/en/vol10-open-lecture-notes/cppcon/2025/02-some-assembly-required/07-wg21-standardization-and-assembly-philosophy.md @@ -20,90 +20,90 @@ chapter: 2 order: 7 translation: source: documents/vol10-open-lecture-notes/cppcon/2025/02-some-assembly-required/07-wg21-standardization-and-assembly-philosophy.md - source_hash: 20df7cbb4489115aa2f24094cb418ad71cf07458503e296ca30905ce3e831a90 - translated_at: '2026-05-26T11:23:17.243193+00:00' + source_hash: 5abe719bd8f2b6ac1b756f05bcc928fa5ff7a5f66af625a55e57052592880a8c + translated_at: '2026-06-13T11:49:20.532826+00:00' engine: anthropic - token_count: 10364 + token_count: 10527 --- 
-# The WG21 Organizational Chain and the C++ Standard +# The Organizational Chain of WG21 and the C++ Standard -In various technical articles and videos, we often see the abbreviation "WG21," but few people trace the complete organizational chain from top to bottom. Although there are many layers, the structure itself is not complicated. Let's walk through this chain first, so that later, when we look at proposals and standard documents, we at least know where these things come from and who is in charge. +In various technical articles and videos, we often see the abbreviation "WG21," but few people clearly explain this complete organizational chain from start to finish. In reality, while there are many layers, the structure itself isn't complex—let's first sort out this chain. Later, when we look at proposals and standard documents, we will at least know where these things come from and who is in charge. ## Starting with a Counterintuitive Fact -ISO stands for **International Organization for Standardization** (note the American spelling "Organization," and that the last word is "Standardization," not "Standards"). The abbreviation ISO does not come from the English name—the English abbreviation would be IOS, and in French it would be OIN (Organisation Internationale de Normalisation). The founders felt that neither IOS nor OIN was good enough, so they chose the Greek word *isos* (meaning "equal") as a universal abbreviation. This way, regardless of the language, it is called ISO. This piece of trivia has no direct bearing on C++ itself, but it explains why the abbreviation does not match the English full name. +ISO stands for **International Organization for Standardization** (note the American spelling "Organization," and the last word is "Standardization" rather than "Standards"). The abbreviation ISO does not come from the English name—the English abbreviation would be IOS, and in French, it's OIN (*Organisation Internationale de Normalisation*). 
The founders felt that IOS and OIN weren't good enough, so they chose the Greek word *isos* (equal) as a unified abbreviation. This way, it's called ISO in any language. This bit of trivia doesn't have much direct relationship to C++ itself, but it explains why the abbreviation doesn't match the English full name. -::: details Original Reference Material -The ISO official website "About us" page states: +::: details Original Reference +The "About us" page on the official ISO website states: > "ISO, the **International Organization for Standardization**, brings global experts together to agree on the best ways of doing things." > > "Because 'International Organization for Standardization' would have different acronyms in different languages ('IOS' in English, 'OIN' in French for Organisation internationale de normalisation), our founders decided to give it the short form 'ISO'. ISO is derived from the Greek word isos (meaning 'equal')." -Readers can visit iso.org/about-us.html to verify this themselves. +Readers can visit iso.org/about-us.html to verify this. ::: ## How Many Layers Separate ISO from C++? -ISO does not directly manage C++. It first formed a joint body with another organization, the IEC (International Electrotechnical Commission), called JTC1, which stands for Joint Technical Committee 1. It is responsible for information technology standards. +ISO doesn't directly manage C++. First, it formed a joint partnership with another organization, the IEC (International Electrotechnical Commission), called JTC1. The full name is Joint Technical Committee 1. It manages information technology standards. -Under JTC1, there is a subcommittee called SC22 (Subcommittee 22), whose full name is "Programming languages, their environments, and system software interfaces." Note this scope—it is not just programming languages, but also "environments" and "system software interfaces," which is why a whole bunch of things fall under SC22. 
+Then, under JTC1, there are subcommittees, such as SC22 (Subcommittee 22), whose full name is "Programming languages, their environments and system software interfaces." Note this scope—it's not just programming languages, but also "environments" and "system software interfaces," so a whole bunch of things hang under SC22. -Below SC22 are the various Working Groups (WG). Many WGs have already been grayed out—they completed their historical missions, and the corresponding language standards are finalized. But looking at the list of those still active: COBOL, Fortran, Ada, C, Prolog, Linux-related work, programming language vulnerability research, and the one we care about most, C++. +Below SC22 are the various Working Groups (WGs). Many WGs have "grayed out"—they have completed their historical missions, and the corresponding language standards are finalized. But those that are still active, looking at the list: COBOL, Fortran, Ada, C, Prolog, Linux-related items, programming language vulnerability research, and the one we care most about, C++. -C++ is WG21 in this context. Why number 21? This number was historically assigned and has no special meaning—it just happened to be the number when it was C++'s turn. +C++ is WG21 here. Why number 21? This number is historically assigned; there's no special meaning, just that when it was its turn, that was the number available. -## A Noteworthy Fact +## A Notable Fact -Judging solely by the number of participants in standardization, WG21 is the largest body within SC22 (according to the speaker's observation, if you were to draw a proportional chart by participation, other language working groups might just be a few dots, while C++ would fill the entire chart). Of course, this does not mean other languages are unimportant; Fortran and Ada remain irreplaceable in their respective domains (scientific computing, aerospace). 
However, the large number of participants directly explains why the speed and complexity of C++ standardization are what they are—many proposals, much discussion, and plenty of controversy. +Judging solely by the number of participants in standardization, WG21 (C++) is the largest in volume within the entire SC22 (according to the speaker's observation, if you were to draw a proportional chart based on participation numbers, other language working groups might just be a few dots, while C++ would fill the entire chart). Of course, this doesn't mean other languages aren't important; Fortran, Ada, and others remain indispensable in their respective fields (scientific computing, aerospace). However, the large number of participants directly explains why the speed and complexity of C++ standardization are what they are—many proposals, lots of discussion, and plenty of controversy. ## Summary of the Entire Chain -From top to bottom: ISO and IEC jointly established JTC1 (Joint Technical Committee 1, for information technology), JTC1 set up SC22 (Subcommittee 22, for programming languages and related things), and SC22 set up WG21 (Working Group 21, exclusively for C++). +From top to bottom: ISO and IEC jointly established JTC1 (Joint Technical Committee 1, managing information technology), JTC1 set up SC22 (Subcommittee 22, managing programming languages and related items), and SC22 set up WG21 (Working Group 21, specifically managing C++). -The full formal designation is ISO/IEC JTC1/SC22/WG21. +The complete formal designation is ISO/IEC JTC1/SC22/WG21. ## Why Clarifying This Chain Matters -Once we understand this chain, when we see the WG21 identifier on a proposal document, we know it has gone through a formal standardization process under the ISO framework—it was not decided by someone off the top of their head. "The C++ standard" transforms from a vague concept into an entity backed by a concrete organizational structure. 
Looking back, it is really just a few layers of nested committees—nothing mysterious, but when you do not know, it feels clouded in fog. +Once we clarify this chain, when we see the WG21 identifier on proposal documents, we know this is something that has gone through the formal standard-setting process under the ISO framework, not something someone decided on a whim. The "C++ Standard" transforms from a vague concept into an entity backed by a concrete organizational structure. Looking back, it's actually just a few layers of nested committees—nothing mysterious, but when you don't know it, it feels like being in the fog. --- # The Complete Journey of a Proposal from Idea to C++ Standard -Many people's understanding of "how the C++ standard is made" might stop at "a bunch of experts meet and make the call." In reality, the entire process is a very rigorous funnel mechanism with quite a few layers, but each step has clear boundaries of responsibility. +Many people's understanding of "how the C++ standard is made" might stop at the stage of "a group of big shots meeting and making the decision." In reality, the entire process is a very rigorous funnel mechanism. There are quite a few layers, but each step has clear boundaries of responsibility. -## First, Understand What Is Actually Under WG21 +## First, Let's Clarify What's Under WG21 -When we casually say "the C++ Standards Committee," we are referring to WG21. WG21 is not a flat, monolithic group; it has a bunch of sub-organizations under it—some handle administration, some handle core specifications, some handle the direction of evolution, and there are a bunch of SGs (Study Groups) whose abbreviations we often see in proposal documents but might not be entirely clear on their specific responsibilities. The status of these study groups is not static; some are active and open to new members, while others have completed their historical missions and been officially closed. 
However, we need to be careful of a cognitive trap—seeing "closed" and assuming this direction will never be brought up again. "Closed" just means the study group itself no longer needs to exist. Its conclusions might have been taken over by other groups, or they might be temporarily shelved. The most typical example is UB (undefined behavior). The related study group has been closed, but proposals about UB still exist in abundance across various groups—after all, it is a pain point that anyone writing C++ cannot avoid. +When we usually say "The C++ Standards Committee," we are referring to WG21. WG21 is not a flat, large group; it has a bunch of sub-organizations attached underneath. There are those for administration, those for core specifications, those for evolution directions, and a bunch of SGs (Study Groups) whose abbreviations we often see in proposal documents but might not be clear on their specific responsibilities. The status of these study groups is not static; some are active and open to new members, while others have completed their historical missions and are completely closed. However, be aware of a cognitive trap—seeing "closed" and assuming this direction will never be mentioned again. "Closed" just means the study group itself no longer needs to exist; the conclusions it produced may have been taken over by other groups, or may be temporarily shelved. The most typical example is UB (Undefined Behavior); although the relevant study group has closed, proposals regarding UB still exist in large numbers across various groups—after all, this is a pain that people writing C++ cannot bypass. ## How Far Does an Idea Have to Travel from Brain to Standard? -This part is the most interesting of the entire process. For an idea about how C++ should be changed, getting from your brain into the standard requires going through a complete funnel mechanism. +This part is the most interesting part of the whole process. 
An idea on how C++ should be changed has to go through a complete funnel mechanism to get from your brain into the standard. -The first step is to write the idea into a formal proposal document and send it to a mailing list called a reflector. "Reflector" sounds very profound, but it is really just a mailing list with a somewhat archaic name. After the proposal is sent out, it gets routed to the corresponding Study Group (SG). Within the SG, experts in that field will review it, provide feedback, and then the author goes back to revise it. They send it again, discuss it again, and iteratively polish it. This phase is essentially about validating the idea's viability in a small circle. +The first step is to write the idea into a formal proposal document and send it to a mailing list called a reflector. "Reflector" sounds profound, but it's actually just a mailing list with a slightly old-fashioned name. After the proposal is sent out, it is routed to the corresponding Study Group (SG). In the SG, experts in that field will review it, provide feedback, the author will go back and revise it, send it again, discuss it, and polish it back and forth. This stage is essentially about verifying, on a small scale, whether this idea is actually reliable. -When the discussion in the SG is mostly mature, the proposal needs to be "upgraded" to be viewed in a broader context of how it fits into the entire C++ ecosystem. At this point, it forks—if it is a library-level feature (like adding a utility in a header file), it goes to LEWG, the Library Evolution Working Group; if it is a language-level feature (like a new syntax rule), it goes to EWG, the Language Evolution Working Group. The difference between LEWG and LWG is this: LEWG handles "evolution," discussing whether the feature is worth doing and how to do it more reasonably; LWG is the "core" group that comes later, responsible for the specific standard wording. 
+When the discussion in the SG is basically mature, the proposal needs to be "upgraded" to enter a broader view of how it fits into the entire C++ ecosystem. At this point, it forks—if it's a library-level feature (like a new tool in a header file), it goes to LEWG (Library Evolution Working Group); if it's a language-level feature (like new syntax rules), it goes to EWG (Language Evolution Working Group). The difference between LEWG and LWG is: LEWG manages "evolution," discussing whether this feature is worth doing and how to do it more reasonably; while LWG is the "core" group that comes later, responsible for the specific standard wording. -In the evolution groups, there is another round of polishing. When everyone feels the feature's direction is right and the details are mostly in place, it flows from the evolution group into the core group. Library features go to LWG, and language features go to CWG. What the core groups do is very hardcore—they directly modify the C++ standard document, translating the proposal into normative text precise down to the punctuation mark. +In the evolution groups, it will undergo another round of polishing. When everyone feels the direction of the feature is right and the details are basically in place, it flows from the evolution group to the core group. Library features go to LWG, language features go to CWG. What the core groups do is very hardcore—they directly modify the C++ standard document, translating the proposal into normative text precise down to the punctuation marks. -Finally, assuming everyone at all stages is satisfied with the modification, the proposal enters a plenary vote. All members of WG21 vote together, and once it passes, this feature will appear in the next version of the C++ standard. From idea to landing, it can take several years of iteration. +Finally, assuming everyone in all stages is satisfied with this modification, the proposal enters the full plenary voting stage. 
All members of WG21 vote together. Once passed, this feature will appear in the next version of the C++ standard. From idea to landing, it may undergo several years of iteration. ## The Core of the Entire Process -Once we understand this process, those abbreviations like SGxx, EWG, and LWG on proposal documents are no longer so headache-inducing. When we open a proposal, we can consciously look at what stage it is currently in—if it is still in an SG, it means it is in early exploration and the design is highly variable; if it has already reached LWG/CWG, it basically means the general direction is set, and only wording-level refinements remain. +After understanding this process, those SGxx, EWG, and LWG abbreviations on proposal documents aren't so headache-inducing anymore. Opening a proposal, we can consciously look at what stage it is currently in—if it's still in SG, it means it's in early exploration, and design changes are large; if it's already in LWG/CWG, it basically means the general direction is set, and only wording-level polishing remains. -There is also an easily overlooked detail: the action of a proposal flowing from the evolution groups (EWG/LEWG) to the core groups (CWG/LWG) is called "forward" in committee terminology. If you read the meeting minutes, you will often see sentences like "LEWG decided to forward Pxxxx to LWG." Here, "forward" means the proposal has moved one step down the process. +There is another easily overlooked detail: the action of a proposal flowing from the evolution group (EWG/LEWG) to the core group (CWG/LWG) is called "forward" in committee terminology. If you read meeting minutes, you will often see sentences like "LEWG decided to forward Pxxxx to LWG." Here, "forward" is saying the proposal has moved one step down the process. 
-The entire process is essentially a layered peer-review mechanism—first validating feasibility in a small circle, then looking at the ecosystem impact in a larger circle, and finally having the most rigorous people finalize the wording. Each step has clear boundaries of responsibility. It is slow, but it is indeed steady. +The entire process is essentially a layered peer review mechanism—first verifying feasibility in a small circle, then looking at the ecosystem impact in a larger circle, and finally having the most rigorous people finalize the wording. Every step has clear boundaries of responsibility. Although slow, it is indeed steady. --- -# Just How Slow Is C++ Standardization? A Horizontal Comparison with Other Languages +# How Slow Is C++ Standardization Really—A Horizontal Comparison with Other Languages -When it comes to the C++ standardization timeline, many people's intuition is that C++23 should have come out in 2023, and C++26 will be in 2026. But in reality, the technical work for C++23 was completed in early 2023, and ISO's official publication was delayed until **October 2024** (standard number ISO/IEC 14882:2024). The C++26 draft still has a bunch of things under discussion, and the final publication will very likely be delayed further. The time span from initiation to publication for each version is much longer than most people imagine—this is another side effect of the sheer scale of the C++ standardization effort. +Talking about the timeline of C++ standardization, many people's intuition is that C++23 should have come out in 2023, and C++26 will be in 2026. But actually, the technical work for C++23 was completed in early 2023, while ISO official publication dragged on until **October 2024** (Standard number ISO/IEC 14882:2024). The draft for C++26 still has a pile of things under discussion, and the final release will likely be delayed further. 
The time span from initiation to publication for each version is much longer than most people imagine—this is also a side effect of the massive scale of the C++ standardization project. -::: details Original Reference Material +::: details Original Reference ISO official standard page (iso.org/standard/83626.html): > Status: Published @@ -115,111 +115,111 @@ isocpp.org/std/the-Standard is a community-driven, community-operated reference website. Every single page and every example code on it is actively maintained by real people. It is not an official document sponsored by some big company, but rather a group of volunteers working on it. Under normal circumstances, it can be modified and supplemented by community members, which is also why it maintains high quality—it is not just one person writing, but countless people maintaining it together. Every time you look up a standard library component, take a moment to glance at the notes and discussions at the bottom of the page. You can often find very valuable information there, such as known issues with a particular function on a specific compiler. +cppreference is a community-driven, community-operated reference website. Every page and every example code on it is actually maintained by someone. It is not official documentation sponsored by some big company, but a group of volunteers working on it. Normally, it can be modified and supplemented by community members, which is also why it can maintain high quality—it's not one person writing, it's countless people maintaining it together. Every time you look up a standard library component, casually look at the comments and discussions at the bottom of the page, and you can often find some very valuable information, such as known issues with a function on a specific compiler. ## Code Sharing Platforms -Besides real-time chat communities, code sharing platforms like Compiler Explorer are incredibly important for technical exchange. 
Put your code in, generate a link, and drop it anywhere—Discord, Slack, forums, or even send it directly to a colleague. Compared to pasting a huge block of code text, a Compiler Explorer link lets others click, read, modify, and run it directly. The efficiency is completely different. +Besides real-time chat communities, code sharing platforms like Compiler Explorer are extremely important in technical exchange. Put code in, generate a link, and drop it anywhere—Discord, Slack, forums, or even send it directly to a colleague. Compared to pasting a large block of code text, a Compiler Explorer link lets others click to see directly, modify directly, and run directly. The efficiency is completely different. -When debugging a problem, first put the minimal reproduction code on Compiler Explorer, confirm it can be reproduced on multiple compilers, and then go ask the community—the benefit of doing this is that when others help you troubleshoot, they do not need to set up an environment; they can just click the link and see exactly what you see. +When debugging problems, first put the minimal reproduction code onto Compiler Explorer, confirm it can be reproduced on multiple compilers, and then go to the community to ask—the benefit of this is that when others help you troubleshoot, they don't need to set up an environment; they can directly click the link to see what you see. ## The Community Is the Core of the C++ Ecosystem -The reason C++ is fascinating is not just because the language itself is powerful, but even more so because of the people behind it. The people who silently submit patches to open-source projects, the people who spend their own time maintaining cppreference, the people who pay out of pocket to organize in-person meetups, the people who are still helping beginners debug code at 3 AM on Discord—it is these people who make up the C++ ecosystem. 
By immersing yourself in the community, you see not just answers to problems, but also how others think about problems, their approaches to solving them, and even their attitudes toward technology. +The reason C++ is fascinating is not just because the language itself is powerful, but because of the people behind it. Those who silently submit patches in open source projects, those who spend their own time maintaining cppreference, those who organize offline gatherings at their own expense, those who help novices debug code at 3 AM on Discord—it is these people who make up the C++ ecosystem. Soaking in the community, you see not only the answers to problems, but also how others think about problems, their ideas for solving them, and even their attitude towards technology. --- # Participating in the C++ Community—Contributions Come in More Than One Form -Regarding "participating in the open-source community," many people have a narrow understanding—they feel it is something only qualified people can do, something only the big shots whose names are listed in the committee or authors of well-known libraries are worthy of doing. But in reality, the ways to participate are far more diverse than imagined. +Regarding "participating in the open source community," many people have a narrow understanding—thinking it's something only qualified people can do, something only big shots with their names on the committee or authors of famous libraries can talk about. But in reality, the ways to participate are far more diverse than imagined. ## "Contribution" Is Broader Than We Think -Contributing to the C++ community does not mean you have to write a widely used library, or submit a proposal to the standards committee that gets adopted. 
Many of the participation methods mentioned in the talk are things you can do right now: if your city does not have a C++ meetup, just start one yourself—you do not need to be an expert, you just need to be someone willing to bring people together to chat about C++; attend a conference, even if just to listen and meet a few other people who are also using C++, which in itself is already participating in the community; write an article about a pitfall you hit and publish it, so the people coming after you can avoid the same detour—this is also a contribution. +Contributing to the C++ community doesn't necessarily mean writing a widely used library or submitting a proposal to the standard committee that gets adopted. The participation methods mentioned in the talk are things you can do right now: if your city doesn't have a C++ meetup, just start one yourself—you don't need to be an expert, you just need to be someone willing to get people together to chat about C++; attending a conference, even just to listen and meet a few other people using C++, is itself already participating in the community; writing an article about a pitfall you stepped in so that people behind you have fewer detours is also a contribution. -## About Taking the Stage +## About Getting on Stage -There is a very real description in the talk—standing on the speaking stage, looking back at the countless faces staring at you, thinking, "Why am I doing this to myself again." Doing technical sharing does not require you to present perfectly; you only need to talk about something you truly understand, about a pitfall you have hit—that alone is valuable enough. If you have the opportunity to share, even if you are nervous, it is worth trying once. +There is a very real description in the talk—standing on the speaking stage, looking back at countless faces staring at you, thinking "Why am I doing this again?" 
Doing technical sharing doesn't require perfection; you only need to speak about what you have truly understood and the pits you have stepped in. This is valuable enough. If you have the opportunity to share, even if you are nervous, it's worth trying once. ## About Participating in the C++ Committee -The C++ committee is recruiting. The committee's work needs people at all levels to participate—not just experts in language design, but also feedback from actual users, people to test proposals, write use cases, and report issues. You do not need to be Bjarne Stroustrup to get in; you just need passion and a willingness to invest time. +The C++ committee is recruiting. The committee's work requires participation from people at all levels—not just experts in language design, but also feedback from actual users, people to test proposals, write use cases, and report problems. You don't need to be Bjarne Stroustrup to get in; you just need passion and willingness to invest time. -## One Final Aside +## A Final Small Interlude -There is a very real detail in the Q&A session: the speaker referred to Barry Revzin as the person responsible for Ranges, only to be corrected on the spot—Barry Revzin has recently done a lot of work on the application side of C++26 Reflection (he gave a "Practical Reflection With C++26" talk at CppCon), while the primary author of Ranges is Eric Niebler (the speaker misspoke it as Eric Kneedler). However, strictly speaking, the main drivers of the Reflection proposal are Daveed Vandevoorde and Herb Sutter, among others, and Revzin is more on the application and teaching side. This kind of "mixing up people's names and their areas of responsibility" is very common. The C++ standards committee involves so many people and sub-working groups that even regular participants cannot necessarily keep them all straight. The speaker's self-deprecating "I'm so terrible at this" actually makes the community feel very down-to-earth. 
+There is a very real detail in the Q&A session: the speaker referred to Barry Revzin as the person in charge of Ranges, only to be corrected on the spot—Barry Revzin has recently done a lot of work on the application layer of C++26 Reflection (he gave a talk "Practical Reflection With C++26" at CppCon), while the main author of Ranges is Eric Niebler (the speaker misspoke it as Eric Kneedler). However, strictly speaking, the main drivers of the Reflection proposal are Daveed Vandevoorde and Herb Sutter, etc., while Revzin is more on the application and teaching side. This kind of "mixing up people and their responsible areas" is common; the C++ standard committee involves too many people and sub-working groups, and even frequent participants may not be able to figure it all out clearly. The speaker mocked himself, saying "I am truly terrible," and this sense of realism actually makes people feel that this community is very down-to-earth. -## The Threshold for Community Participation +## The Threshold for Participating in the Community -The C++ community is not some closed circle; it is made up of every person currently using C++. The simplest contribution might just be sharing what you learned today with a colleague next to you, or answering a beginner's question in the community. You do not need to wait until you are "good enough" to participate—because by then you might have forgotten the confusions of the beginner stage, and it is precisely those confusions that make for the most valuable sharing content. +The C++ community is not some closed circle; it is composed of every person currently using C++. The simplest contribution might just be sharing what you learned today with a colleague next to you, or answering a novice's question in the community. 
You don't have to wait until you are "strong enough" to participate—because by then you may have forgotten the confusion of the novice stage, and it is precisely those confusions that are the most valuable sharing content. --- # The "Never Execute" Instruction in ARM32 Condition Codes—Orthogonal Design and Its Demise -This Q&A segment touches on an interesting architectural design question. In the ARM32 instruction set, every instruction has a four-bit condition code field at the front. You can write `ADDNE` to mean "add if not equal," or `MOVEQ` to mean "move if equal," without needing a separate branch instruction, resulting in very high code density. Among the condition codes, there is `AL` (Always), corresponding to 0b1110; but there is another condition code where all four bits are 1, that is, 0b1111, called `NV`, meaning "Never." A "never execute" instruction—writing it in would just be wasting space, right? +This Q&A segment involves an interesting architectural design question. In the ARM32 instruction set, every instruction has a four-bit condition code field at the front. You can write `ADDNE` to mean "add if not equal," or `MOVEQ` to mean "move if equal," without writing separate branch instructions, resulting in very high code density. Among the condition codes, there is `AL` (Always), corresponding to `0b1110`; but there is another condition code where all four bits are 1, i.e., `0b1111`, called `NV`, meaning "Never." A "never execute" instruction—writing it in is just taking up space, right? ::: warning Important Correction -The NV condition code only exists in **ARMv4 and earlier versions**. Starting from ARMv5, NV was officially deprecated, and the `0b1111` encoding was reassigned for unconditional instruction extensions. On ARMv7-A, using the condition code `0b1111` results in **UNPREDICTABLE** behavior; it no longer guarantees "never execute."
The verification experiment later in this article needs to target the ARMv4 architecture to get the expected results. The official ARM documentation states: +The NV condition code only exists in **ARMv4 and earlier versions**. Starting from ARMv5, NV was officially deprecated, and the `0b1111` encoding was reassigned for unconditional instruction extensions. On ARMv7-A, using the condition code `NV` results in **UNPREDICTABLE** behavior; it no longer guarantees "never execute." The verification experiments later in this article need to target the ARMv4 architecture to get the expected results. The official ARM documentation states: > "Every conditional instruction contains a 4-bit condition code field, the cond field, in bits 31 to 28. This field contains one of the values **0b0000 – 0b1110**." > @@ -227,118 +227,99 @@ The NV condition code only exists in **ARMv4 and earlier versions**. Starting fr Actual verification results (arm-none-linux-gnueabihf-gcc 15.2 + qemu-arm-static): -```bash -# ARMv4:NV 正常工作 -$ arm-none-linux-gnueabihf-gcc -static -march=armv4 test.c && qemu-arm-static ./a.out -AL (always): result = 42 -NV (never): result = 0 # ← 符合预期,NV 跳过了 MOV - -# ARMv7:直接触发 SIGILL(非法指令异常) -$ arm-none-linux-gnueabihf-gcc -static -march=armv7-a test.c && qemu-arm-static ./a.out -qemu: uncaught target signal 4 (Illegal instruction) - core dumped +```text +$ ./a.out +Before: 0 +After: 0 ``` -Verification code is in the repository: `code/volumn_codes/vol10/cppcon/2025/02-some-assembly-required/05-01-arm32-nv-condition.c`. +Verification code is in the repository: [05-01-arm32-nv-condition.c](https://github.com/Awesome-Embedded-Learning-Studio/Tutorial_AwesomeModernCPP/blob/main/code/volumn_codes/vol10/cppcon/2025/02-some-assembly-required/05-01-arm32-nv-condition.c). ::: ## Orthogonality—The Design Philosophy of ARM32 -The key lies in ARM32's design philosophy: **extreme orthogonality**. 
Simply put, orthogonality means "choices in each dimension are independent and can be freely combined." In ARM32, the condition code dimension was designed very thoroughly—every condition has its logical opposite. Equal (EQ) has Not Equal (NE), Greater Than or Equal (GE) has Less Than (LT), Unsigned Higher (HI) has Unsigned Lower or Same (LS)... and so on. +The key lies in the design philosophy of ARM32: **extreme orthogonality**. Simply put, orthogonality means "the choice of each dimension is independent and can be freely combined." In ARM32, the dimension of condition codes is designed very thoroughly—every condition has its logical opposite. Equal (EQ) is the opposite of Not Equal (NE), Greater or Equal (GE) is the opposite of Less Than (LT), Unsigned Higher (HI) is the opposite of Unsigned Lower or Same (LS)... and so on. -So what is the logical opposite of "Always" (AL)? Naturally, it is "Never" (NV). +So what is the logical opposite of "Always Execute" (AL)? Naturally, it is "Never Execute" (NV). -Because four bits can represent 16 states, the condition code designers filled all 16 states, each with a corresponding semantic meaning. This was not "deliberately leaving a useless one," but the inevitable result of pushing orthogonality to its extreme—it is impossible to keep only 15 and leave one empty, because that would not be orthogonal. The cost is this: in the entire ARM32 instruction encoding space, a full one-sixteenth of the encodings correspond to "do nothing" instructions. This is a design trade-off—trading a little space waste for conceptual perfection in the instruction set's symmetry. +Since four bits can represent 16 states, the designers of the condition codes filled all 16 states, and each has a corresponding meaning. This isn't "deliberately leaving a useless one," but the inevitable result of pushing orthogonality to the extreme—it's impossible to keep only 15 and leave one empty, that wouldn't be orthogonal. 
The price is: in the entire instruction encoding space of ARM32, a full sixteenth of the encodings correspond to instructions that "do nothing." This is a design trade-off—using a little wasted space in exchange for conceptual perfect symmetry of the instruction set. -This design was indeed the case in the original ARM (ARMv1 through ARMv4). But subsequent versions of ARM proved that "extreme orthogonality" itself has a cost. +This design was indeed the case in the original ARM (ARMv1 to ARMv4). But subsequent versions of ARM prove that "orthogonal to the extreme" also has a price. ## Hands-on Verification: Writing a "Never Execute" Instruction (ARMv4) We can verify this ourselves. Because the NV condition code is only valid in ARMv4 and earlier, we need to explicitly specify the architecture version. -::: details Why Can't We Use ARMv7? -The valid condition code range for ARMv7-A is only `0b0000`–`0b1110`. The encoding `0b1111` was reassigned in ARMv5+—it is either interpreted as a completely different instruction (using the condition code bits to extend the opcode space), or it produces UNPREDICTABLE behavior. Using `.word 0xf3a0002a` on ARMv7 **does not guarantee** the result will be "never execute." The verification code has been placed in the repository (`code/volumn_codes/vol10/cppcon/2025/02-some-assembly-required/05-01-arm32-nv-condition.c`), and readers can compare and test it on ARMv4 and ARMv7 targets themselves. +::: details Why can't we use ARMv7? +The valid condition code range for ARMv7-A is only `0b0000`–`0b1110`. The encoding `0b1111` has been reassigned in ARMv5+—it is either interpreted as a completely different instruction (using condition code bits to extend opcode space) or produces UNPREDICTABLE behavior. Using `NV` on ARMv7 **does not guarantee** the result is "never execute." 
The verification code is in the repository ([05-01-arm32-nv-condition.c](https://github.com/Awesome-Embedded-Learning-Studio/Tutorial_AwesomeModernCPP/blob/main/code/volumn_codes/vol10/cppcon/2025/02-some-assembly-required/05-01-arm32-nv-condition.c)), and readers can compare tests on ARMv4 and ARMv7 targets themselves. ::: -The environment is Arch Linux WSL, using the `arm-none-linux-gnueabihf-gcc` cross-compilation toolchain (Arm GNU Toolchain 15.2). Note that when compiling, you need to use `-march=armv4` to ensure the semantics of the NV condition code: +The environment is Arch Linux WSL, using the cross-compilation toolchain `arm-none-linux-gnueabihf-gcc` (Arm GNU Toolchain 15.2). Note that when compiling, you need to use `-march=armv4` to ensure the semantics of the NV condition code: -First, write a simplest C file: +First, write a simple C file: -```c -// test_nv.c -void foo(void) { - __asm__ volatile("mov r0, #42"); +```c +// 05-01-arm32-nv-condition.c +#include <stdio.h> + +int main(void) { + int result = 0; + printf("Before: %d\n", result); + + // Inline assembly: MOV R0, #5 (Always) + // We will manually modify the machine code later to change AL to NV + asm volatile( + "mov r0, #5 \n\t" + "str r0, %0" + : "=m"(result) + : + : "r0" + ); + + printf("After: %d\n", result); + return 0; } ``` -Compile it to assembly to see what a normal `MOV` looks like (note that here we use `-march=armv4`): +Compile it to assembly to see what a normal `MOV` looks like (note we use `-march=armv4` here): ```bash -$ arm-none-linux-gnueabihf-gcc -S -O0 -march=armv4 test_nv.c -o test_nv.s -$ cat test_nv.s - .arch armv4 - .file "test_nv.c" - .text - .align 2 - .global foo - .arch armv4 - .type foo, %function -foo: - push {r7} - sub r7, sp, #0 - mov r0, #42 - nop - pop {r7} - bx lr - .size foo, .-foo - .ident "GCC: (Ubuntu 12.3.0-1ubuntu1~22.04) 12.3.0" +arm-none-linux-gnueabihf-gcc -S -march=armv4 05-01-arm32-nv-condition.c -o 05-01.s ``` -Now let's manually construct a
"never execute" `MOV`. In the ARM32 `MOV` instruction encoding format, the high four bits are the condition code. We can look at the machine code of a normal `MOV R0, #42` using `objdump`: +Now, let's manually construct a "never execute" `MOV`. In the ARM32 `MOV` instruction encoding format, the high four bits are the condition code. The machine code for a normal `MOV R0, #5` can be checked with `objdump`: ```bash -$ arm-none-linux-gnueabihf-gcc -c -march=armv4 test_nv.c -o test_nv.o -$ arm-none-linux-gnueabihf-objdump -d test_nv.o - -test_nv.o: file format elf32-littlearm - -Disassembly of section .text: - -00000000 : - 0: e52db004 push {r7} - 4: e24db000 sub r7, sp, #0 - 8: e3a0002a mov r0, #42 ; 注意这里:0xe3a0002a - c: e320f000 nop - 10: e49db004 pop {r7} - 14: e12fff1e bx lr +cat 05-01.s | grep -A 5 "mov r0, #5" +# Output example: mov r0, #5 @ machine code: 0xe3a00005 ``` -See the `0xe3a0002a`? The high four bits are `0xe`, which is binary `1110`, corresponding to the condition code `AL` (Always). Now change the high four bits from `1110` to `1111`, that is, from `0xe3a0002a` to `0xf3a0002a`. On ARMv4, this is a "never execute" `MOV R0, #42`—it gets decoded, the CPU recognizes it as a MOV instruction, but because the condition code is NV, it will never actually execute. +See the `e3`? The high four bits are `e`, which is binary `1110`, corresponding to the condition code `AL` (Always). Now, change the high four bits from `e` to `f`, i.e., from `1110` to `1111`. On ARMv4, this is a "never execute" `MOV`—it is decoded, the CPU recognizes it as a MOV instruction, but because the condition code is NV, it never actually executes. -::: warning Reminder -This instruction only behaves as "never execute" on ARMv4 and earlier. If you execute `0xf3a0002a` on ARMv5+ (including ARMv7-A), the behavior is UNPREDICTABLE. +::: warning Reminder again +This instruction only behaves as "never execute" on ARMv4 and earlier. 
If executing `0xf3a00005` on ARMv5+ (including ARMv7-A), the behavior is UNPREDICTABLE. ::: -Use `.word` to directly inject the machine code and verify: +Use `.inst` to directly stuff the machine code in for verification: -```c -// test_nv2.c +```c +// 05-01-arm32-nv-condition.c (modified) #include -void foo(void) { - int result = 0; - // 正常的 MOV R0, #42,条件码 AL (0xe) - __asm__ volatile("mov r0, #42" : "=r"(result)); - printf("AL (always): result = %d\n", result); - - result = 0; - // 手动塞入条件码 NV (0xf) 的同一条指令 - // 0xf3a0002a = MOVNV R0, #42 (ARMv4 only!) - __asm__ volatile(".word 0xf3a0002a" : "=r"(result)); - printf("NV (never): result = %d\n", result); -} - int main(void) { - foo(); + int result = 0; + printf("Before: %d\n", result); + + // 0xf3a00005 = MOVNV R0, #5 (ARMv4 only!) + asm volatile( + ".inst 0xf3a00005 \n\t" + "str r0, %0" + : "=m"(result) + : + : "r0" + ); + + printf("After: %d\n", result); return 0; } ``` @@ -346,70 +327,60 @@ int main(void) { Compile and run (note `-march=armv4`): ```bash -$ arm-none-linux-gnueabihf-gcc -march=armv4 test_nv2.c -o test_nv2 -static -$ qemu-arm-static ./test_nv2 -AL (always): result = 42 -NV (never): result = 0 +arm-none-linux-gnueabihf-gcc 05-01-arm32-nv-condition.c -march=armv4 -static -o a.out +qemu-arm-static ./a.out ``` -`result` is still 0—that `MOV R0, #42` was fully decoded, but the CPU took one look at the condition code being `NV`, skipped it directly, and did nothing. `result` kept its previous value of 0. +`result` is still 0—that `MOV` was fully decoded, but the CPU looked at the condition code, saw it was `NV`, skipped it directly, and did nothing. `result` kept its previous value of 0. -There is an easy pitfall here: if you did not add the output constraint for `=r`(result), the compiler might optimize away the `result` entirely. No matter how you run it, it would be 0, and you could easily mistake it for having written the machine code incorrectly.
+There is an easy pitfall here: if you didn't add the output constraint `=m"(result)`, the compiler might optimize away `result` entirely, and no matter how you run it, it's 0, easily leading you to think you wrote the machine code wrong. ## By the Way: The TEQ Instruction -The Q&A also mentioned an instruction called `TEQP`. `TEQ` itself stands for "Test Equivalence"; it performs an XOR operation and sets the flags, used to compare whether two values are equal (without changing the register values, only changing the flags). The `P`-suffixed `TEQP` is an instruction in older ARM (pre-ARMv4) used to directly manipulate the Processor Status Register (PSR)—in modern ARM, it has been replaced by `MSR`/`MRS` instructions. +The Q&A also mentioned an instruction called `TEQ`. `TEQ` stands for "Test Equivalence," performing an XOR operation and setting flags, used to compare whether two values are equal (without changing register values, only changing flags). `TEQP` with the `P` suffix is an instruction in older ARM (pre-ARMv4) used to directly operate on the Processor Status Register (PSR)—in modern ARM it has been replaced by `MSR`/`MRS` instructions. ## Summary -That one-sixteenth of "no-op" instruction encodings in ARM32 (ARMv4 and earlier) is not a bug, not a legacy issue, but an inevitable byproduct of pushing orthogonal design to the extreme. The designers chose conceptual perfect symmetry, and the cost was wasting some encoding space. +The "no-op" instruction encoding, one-sixteenth of the space in ARM32 (ARMv4 and earlier), is not a bug, not a legacy issue, but an inevitable byproduct of extreme orthogonal design. The designers chose conceptual perfect symmetry, and the price was wasting some encoding space. -But ARM's own subsequent evolution tells the whole story: ARMv5 deprecated the NV condition code and reclaimed the `0b1111` encoding space; ARM64 (AArch64) completely removed the condition code field. 
"Extreme orthogonality" is conceptually beautiful, but ARM's practice proves that in actual evolution, encoding space and instruction set simplicity ultimately triumphed over conceptual perfect symmetry. After understanding this design history, the experience of reading an assembly manual will be completely different. +But ARM's own subsequent evolution explains everything: ARMv5 deprecated the NV condition code and reclaimed the `0b1111` encoding space; ARM64 (AArch64) completely removed the condition code field. "Orthogonal to the extreme" is conceptually beautiful, but ARM's practice proves that in actual evolution, encoding space and instruction set simplicity ultimately triumph over conceptual perfect symmetry. After understanding this design history, the experience of reading assembly manuals will be completely different. --- -# Learning Assembly—Should You Look at x86 or RISC-V? +# Should I Learn x86 or RISC-V Assembly? -When tinkering on Compiler Explorer, we often wrestle with a question: x86 assembly looks like gibberish—`mov rax, qword ptr [rdi + 8]`, and the register names are long and irregular; switching to RISC-V looks much more understandable, with registers simply being `x0` through `x31`, and the instruction format is much more regular. But how big is the gap between looking at RISC-V assembly and the x86 code actually running in your work? Will you have wasted your time looking at it? +When tinkering on Compiler Explorer, we often struggle with one question: x86 assembly looks like gibberish—`mov eax, dword ptr [rbx + 8]`, register names are long and irregular; switching to RISC-V looks much more understandable, registers are just `x0` to `x31`, and the instruction format is much more regular. But how big is the gap between reading RISC-V assembly and the actual x86 code running at work? Will reading it be a waste of time? 
-## Conclusion: Which Architecture to Look At Depends on the Optimization Level +## Conclusion: It Depends on the Optimization Level -There is no one-size-fits-all answer to this; the key is the optimization level you choose in Compiler Explorer. If you are using `-O0` (no optimization), it does not make much difference whether you look at x86 or RISC-V. What the compiler does under `-O0` is very "generic"—it faithfully translates C++ statements into machine instructions one by one, pushing to the stack when it should, storing to memory when it should, and this routine is the same regardless of architecture. At this level, the knowledge gained about "what the compiler turned the code into" is indeed interchangeable across architectures. +There is no one-size-fits-all answer to this; the key lies in the optimization level selected in Compiler Explorer. If you are using `-O0` (no optimization), there isn't much difference between looking at x86 or RISC-V. What the compiler does under `-O0` is very "generic"—it honestly translates C++ statements into machine instructions one by one, pushing to the stack when needed, storing to memory when needed. Regardless of the architecture, this is the routine. At this level, what you learn—"what the compiler turned the code into"—is indeed interchangeable knowledge across architectures. -Let's verify this with a simple function: +Let's verify with a simple function: ```cpp -int add_and_double(int a, int b) { - int sum = a + b; - return sum * 2; +int add_mul(int a, int b, int c) { + int x = a + b; + return x * c; } ``` -Under `-O0`, the x86 and RISC-V outputs use different instructions, but the "flavor" is exactly the same—they both first store the parameters to the stack, then load them back from the stack to do the addition, store the result back to the stack, and finally load it out again to do the multiplication. 
The compiler is very honest at no optimization and will not do anything clever; this understanding is architecture-independent. +Under `-O0`, although the instructions differ between x86 and RISC-V, the "flavor" is exactly the same—both first store parameters to the stack, then load them back from the stack to do addition, store the result back to the stack, and finally load it out to do multiplication. The compiler is very honest without optimization; this understanding has nothing to do with the architecture. -## Once You Hit -O2 and Above, Things Are Different +## When You Hit -O2 and Above, Things Change -When the optimization level is cranked up to `-O2` or even `-O3`, the differences between architectures start to manifest systematically. The assembly you see is no longer purely "the compiler's generic optimization strategies"; it is mixed with a large amount of "specialized optimizations for this architecture's specific instruction set." +When the optimization level is raised to `-O2` or even `-O3`, the differences between architectures start to appear systematically. The assembly you see is no longer purely "the compiler's generic optimization strategy"; it's mixed with a large amount of "specialized optimization for this architecture's specific instruction set." A typical example—counting the number of 1s in an integer, popcount: ```cpp -int count_ones(unsigned int x) { +int count_ones(unsigned int x) { int count = 0; while (x) { - count += x & 1u; + count += x & 1u; x >>= 1; } return count; } ``` -Drop this code into x86's Compiler Explorer under `-O2`, and the compiler directly replaces it with a single `popcnt` instruction. The entire loop is gone, and the function body is just one instruction. But switch to RISC-V—the loop is still there. The base RISC-V instruction set does not have a `popcnt` instruction (although some extensions do), so the compiler cannot make this replacement and can only honestly optimize using a loop or a lookup table.
The exact same C++ code, the exact same `-O2`, and the two architectures produce completely different assembly. - -If you learn assembly on RISC-V, you might conclude "the compiler cannot automatically recognize the popcount pattern"; if you learn on x86, you will reach the exact opposite conclusion. Who is right? Both are, and neither are—because this is not a difference in compiler capability, but a difference in the target architecture's instruction set. - -## Practical Strategy - -To summarize the strategy: if your goal in learning assembly is to understand "the compiler's high-level optimization decisions"—how inlining is done, how constant propagation is done, how dead code elimination is done—then it does not matter which architecture you look at, because these are indeed cross-architecture universal concepts. When the compiler decides "whether to inline this function," it considers high-level things like function size, call frequency, and side effects, which have little to do with what CPU is running underneath. - -But if your goal is to understand "what the compiler's final generated instructions actually look like," +Under `-O3`, if you throw this code into x86 on Compiler Explorer, the compiler directly replaces it with a single `popcnt` diff --git a/documents/en/vol2-modern-features/ch02-constexpr/02-constexpr-ctor.md b/documents/en/vol2-modern-features/ch02-constexpr/02-constexpr-ctor.md index 670005182..46b03b166 100644 --- a/documents/en/vol2-modern-features/ch02-constexpr/02-constexpr-ctor.md +++ b/documents/en/vol2-modern-features/ch02-constexpr/02-constexpr-ctor.md @@ -1,5 +1,5 @@ --- -title: constexpr Constructors and Literal Types +title: constexpr constructors and literal types description: Enable custom types to participate in compile-time computation, and understand the design constraints and evolution of literal types. 
chapter: 2 @@ -16,6 +16,7 @@ cpp_standard: - 11 - 14 - 17 +- 20 reading_time_minutes: 15 prerequisites: - 'Chapter 2: constexpr 基础' @@ -24,424 +25,292 @@ related: - 编译期计算实战 translation: source: documents/vol2-modern-features/ch02-constexpr/02-constexpr-ctor.md - source_hash: 97cb765acbf13256e61fbd2ecc91e7f1416e0063e069b767944b3b9ecaf0be84 - translated_at: '2026-05-26T11:24:09.530139+00:00' + source_hash: a8de6bf5dd8148d2a32f15ee4dea8aedc612ab95be646c3092c8c3bcca3c5a3c + translated_at: '2026-06-13T11:49:48.019532+00:00' engine: anthropic token_count: 3119 --- -# constexpr Constructors and Literal Types +# `constexpr` Constructors and Literal Types ## Introduction -In the previous chapter, we discussed `constexpr` variables and `constexpr` functions, but all the examples were limited to scalar types—primitives like integers, floating-point numbers, and pointers. You might ask: can we use custom classes at compile time too? For example, constructing a complex number object at compile time, or calculating a date at compile time and using it directly at runtime? +In the previous chapter, we discussed `constexpr` variables and `constexpr` functions, but all the examples were limited to scalar types—primitives like integers, floating-point numbers, and pointers. You might ask: Can I use custom classes at compile time too? For example, constructing a complex number object at compile time, or calculating a date in advance and using it directly at runtime? -The answer is yes, but with one prerequisite: your type must be a "literal type." This concept sounds a bit academic, but it is essentially a checklist of constraints that allows the compiler to understand and manipulate a type at compile time. In this chapter, we will clarify what literal types are, how to add `constexpr` constructors to custom types, and how these restrictions were gradually relaxed after C++14. +The answer is yes, but with a prerequisite: your type must be a "literal type." 
This concept sounds a bit academic, but it is essentially a checklist of constraints that allows the compiler to understand and manipulate types during compilation. In this chapter, we will clarify what a literal type is, how to add `constexpr` constructors to custom types, and how these restrictions were gradually relaxed in C++14 and later. -## Step 1 — What is a Literal Type +## Step 1 — What is a Literal Type? -The name "literal type" can indeed be confusing. It is not the same thing as a "literal" (like `42` or `"hello"`). A literal type refers to a type that satisfies specific constraints—the compiler can fully construct, manipulate, and destroy objects of this type at compile time. +The name "literal type" can be confusing. It is not the same as a "literal" (like `42` or `3.14`). A literal type refers to a type that satisfies specific constraints—the compiler can fully construct, manipulate, and destroy objects of this type during compilation. -Specifically, for a type to be a literal type, it must meet the following conditions: scalar types (arithmetic types, pointers, references, and enumerations) are naturally literal types and require no extra effort; for class types, it needs a `constexpr` constructor (at least one, which can be a copy or move constructor), all non-static data members must themselves be literal types or arrays of literal types, and its destructor must either be trivial or, after C++20, `constexpr`. +Specifically, a type is a literal type if it meets the following conditions: scalar types (arithmetic types, pointers, references, enumerations) are naturally literal types and require no extra effort; for class types, it needs to have a `constexpr` constructor (at least one, which can be a copy or move constructor), all non-static data members must themselves be literal types (or arrays thereof), and its destructor must either be trivial or, since C++20, `constexpr`. 
-In plainer terms: the compiler needs to fully understand the memory layout and initial values of this type at compile time, without requiring runtime dynamic allocation, virtual function table lookups, or complex destruction logic. +In plain terms: the compiler needs to fully understand the memory layout and initial value of this type at compile time, without requiring runtime dynamic allocation, virtual function table lookups, or complex destruction logic. ```cpp -// 这是一个字面类型 struct Point { - float x; - float y; - - constexpr Point(float x_, float y_) : x(x_), y(y_) {} - // 隐式的析构函数是平凡的,满足条件 + float x, y; + // Implicitly has a constexpr trivial constructor + // and a constexpr trivial destructor. }; -constexpr Point kOrigin{0.0f, 0.0f}; -static_assert(kOrigin.x == 0.0f); -static_assert(kOrigin.y == 0.0f); +constexpr Point p{1.0f, 2.0f}; // OK ``` -The following, however, is not a literal type: +The following is **not** a literal type: ```cpp -struct NotLiteral { - std::string name; // std::string 有非平凡的析构函数(C++20 之前) - // 即使在 C++20 中,std::string 的析构虽然可以是 constexpr, - // 但它内部涉及动态内存分配,在编译期求值时仍然受限 +struct Buffer { + int* data; + size_t size; + + Buffer(size_t s) : size(s), data(new int[s]) {} + ~Buffer() { delete[] data; } + + // Non-trivial destructor prevents this from being a literal type in C++11/14/17 + // (unless we make the destructor constexpr in C++20) }; ``` -The issue with `std::string` is that it manages dynamic memory. Before C++20, `constexpr` functions were not allowed to use `new`/`delete`, so any type requiring dynamic allocation could not be used at compile time. C++20 relaxed this restriction—allowing `new`/`delete` in `constexpr` functions—but with a hard constraint: all memory allocated at compile time must be freed before the compile-time evaluation ends (it cannot leak into runtime). 
This means you can perform complex string operations at compile time, but you cannot return a `std::string` pointing to compile-time allocated memory into runtime (unless that memory has already been freed or transferred to persistent storage). +The problem with `Buffer` is that it manages dynamic memory. Before C++20, `new`/`delete` were not allowed in `constexpr` functions, so any type requiring dynamic allocation could not be used at compile time. C++20 relaxed this restriction—allowing `new`/`delete` in `constexpr` functions—but with a hard constraint: all memory allocated at compile time must be released before the end of the compile-time evaluation (it cannot leak into runtime). This means you can perform complex string manipulations at compile time, but you cannot return a `std::string` pointing to compile-time allocated memory to runtime (unless that memory has been freed or transferred to persistent storage). -In practice, GCC 15.2.1 and Clang 13+ fully support `constexpr` operations on `std::string`, including construction, concatenation, and substring extraction. You can build strings, validate formats, and generate lookup tables at compile time, as long as all dynamic memory is properly managed during compilation. +In fact, GCC 15.2.1 and Clang 13+ fully support `std::string` and `std::vector` operations in `constexpr` contexts, including construction, concatenation, and substring operations. You can build strings, validate formats, and generate lookup tables at compile time, as long as all dynamic memory is correctly managed during compilation. -## Step 2 — Adding constexpr Constructors to Custom Types +## Step 2 — Adding `constexpr` Constructors to Custom Types ### The Simplest Case: POD-like Types -If your class is simply an aggregate of data, without virtual functions or dynamic allocation, adding a `constexpr` constructor is very straightforward. 
+If your class is just an aggregate of data, without virtual functions or dynamic allocation, adding a `constexpr` constructor is very straightforward. ```cpp -struct Color { - std::uint8_t r, g, b, a; +struct BCDValue { + uint8_t value; - constexpr Color(std::uint8_t r_, std::uint8_t g_, - std::uint8_t b_, std::uint8_t a_ = 255) - : r(r_), g(g_), b(b_), a(a_) {} + constexpr BCDValue(uint8_t v) : value(v) {} }; - -constexpr Color kRed{255, 0, 0}; -constexpr Color kGreen{0, 255, 0}; -constexpr Color kTransparentBlack{0, 0, 0, 0}; - -static_assert(kRed.r == 255); -static_assert(kTransparentBlack.a == 0); ``` This is now a literal type. The constructor uses an initializer list to assign parameters to members, which is very direct. ### Constructors with Logic -Constructors can also contain logic—provided that this logic falls within what `constexpr` allows. After C++14, you can write loops, conditional statements, and local variables inside constructors. +Constructors can also contain logic—provided that logic falls within the rules allowed by `constexpr`. Since C++14, you can write loops, conditional statements, and local variables inside constructors. 
```cpp -struct BcdDecimal { - unsigned char bcd; - - constexpr explicit BcdDecimal(int decimal) : bcd(0) - { - // 将十进制整数转换为 BCD 编码 - int remainder = decimal; - int shift = 0; - while (remainder > 0) { - bcd |= (remainder % 10) << shift; - remainder /= 10; - shift += 4; - } - } - - constexpr int to_decimal() const - { - int result = 0; - int multiplier = 1; - unsigned char temp = bcd; - while (temp > 0) { - result += (temp & 0x0F) * multiplier; - temp >>= 4; - multiplier *= 10; - } - return result; +struct BCDValue { + uint8_t value; + + // Converts decimal (0-99) to BCD at compile time + constexpr BCDValue(int dec) + : value(static_cast<uint8_t>((dec / 10) << 4 | (dec % 10))) { + // Reject out-of-range input: reaching a throw during constant evaluation is a compile error + if (dec < 0 || dec > 99) throw "Decimal value out of range"; } }; -constexpr BcdDecimal kDec42{42}; -static_assert(kDec42.bcd == 0x42, "BCD of 42 should be 0x42"); -static_assert(kDec42.to_decimal() == 42, "Round-trip conversion should work"); +constexpr BCDValue seconds{45}; // Compile-time conversion: 45 -> 0x45 ``` -This code implements a decimal-to-BCD encoding conversion inside the constructor. The entire calculation completes at compile time, and the `bcd` member of `kDec42` is directly written as `0x42`. This pattern is particularly useful in embedded development—you can convert human-readable decimal values into hardware-required BCD encoding at compile time, and use the pre-calculated values directly at runtime without any conversion instructions. +This code implements decimal to BCD encoding conversion within the constructor. The entire calculation happens at compile time, and the `value` member of `seconds` is directly written as `0x45`. This pattern is particularly useful in embedded development—you can convert human-readable decimal values to hardware-required BCD encoding at compile time, and use the pre-calculated value directly at runtime without any conversion instructions. 
-Let's verify this: under GCC 15.2.1 (`-std=c++20 -O2`), the assembly code for accessing `kDec42.bcd` is just a single `mov` instruction loading a constant from the .rodata section, whereas computing BCD at runtime requires multiple division, shift, and loop instructions. The compile-time version truly achieves zero runtime overhead. +Let's verify this: under GCC 15.2.1 (`-O2`), accessing `seconds` results in assembly that is just a `mov` instruction loading a constant from the `.rodata` section, whereas calculating BCD at runtime requires multiple division, shift, and loop instructions. The compile-time version indeed achieves zero runtime overhead. -## Step 3 — constexpr Member Functions +## Step 3 — `constexpr` Member Functions -Not only can constructors be `constexpr`, but ordinary member functions can be as well. Furthermore, starting from C++14, `constexpr` member functions can modify an object's member variables (as long as the calling context permits). +Not only can constructors be `constexpr`, but ordinary member functions can be too. Furthermore, starting with C++14, `constexpr` member functions can modify an object's member variables (as long as the calling context allows). ### A Compile-Time Complex Number Class -Let's write a complex number class that can be used at compile time. This example is quite practical, as complex number operations are ubiquitous in signal processing. +Let's write a complex number class that can be used at compile time. This example is quite practical since complex arithmetic is ubiquitous in signal processing. 
```cpp struct Complex { - float real; - float imag; - - constexpr Complex(float r = 0.0f, float i = 0.0f) : real(r), imag(i) {} + double real, imag; - constexpr Complex operator+(const Complex& other) const - { - return Complex{real + other.real, imag + other.imag}; - } + constexpr Complex(double r = 0, double i = 0) : real(r), imag(i) {} - constexpr Complex operator-(const Complex& other) const - { - return Complex{real - other.real, imag - other.imag}; + constexpr Complex operator+(const Complex& other) const { + return {real + other.real, imag + other.imag}; } - constexpr Complex operator*(const Complex& other) const - { - return Complex{ + constexpr Complex operator*(const Complex& other) const { + return { real * other.real - imag * other.imag, real * other.imag + imag * other.real }; } - - constexpr float magnitude_squared() const - { - return real * real + imag * imag; - } - - constexpr bool operator==(const Complex& other) const - { - return real == other.real && imag == other.imag; - } }; -// 编译期复数运算 -constexpr Complex kI{0.0f, 1.0f}; // 虚数单位 i -constexpr Complex kI_Squared = kI * kI; // i^2 = -1 -static_assert(kI_Squared == Complex{-1.0f, 0.0f}, "i^2 should equal -1"); - -// 编译期生成复数序列(例如 FFT 的旋转因子) -template -constexpr Complex compute_twiddle_factor(std::size_t k) -{ - constexpr double kPi = 3.14159265358979323846; - double angle = -2.0 * kPi * static_cast(k) / static_cast(N); - // 用泰勒展开近似 cos 和 sin - double cos_val = 1.0 - angle * angle / 2.0 + angle*angle*angle*angle / 24.0; - double sin_val = angle - angle*angle*angle / 6.0 + angle*angle*angle*angle*angle / 120.0; - return Complex{static_cast(cos_val), static_cast(sin_val)}; -} - -constexpr Complex kTwiddle = compute_twiddle_factor<8>(1); -static_assert(kTwiddle.magnitude_squared() > 0.99f, "Twiddle factor should be on unit circle"); +// Compile-time complex arithmetic +constexpr Complex c1{1.0, 2.0}; +constexpr Complex c2{3.0, 4.0}; +constexpr Complex c3 = c1 + c2; // Evaluated at compile time + +// 
Generating FFT twiddle factors at compile time +constexpr Complex twiddle_factors[4] = { + Complex{1.0, 0.0}, + Complex{0.0, 1.0}, + Complex{-1.0, 0.0}, + Complex{0.0, -1.0} +}; ``` -This `Complex` class is entirely a literal type. Its constructor is `constexpr`, and so are all its operators and member functions. You can perform complex number calculations at compile time, generate FFT twiddle factor tables—all these calculation results will be optimized by the compiler into constants, directly embedded into the code or placed in the .rodata read-only data section (depending on the optimization level and usage). +This `Complex` class is entirely a literal type. Its constructor is `constexpr`, and so are all operators and member functions. You can perform complex arithmetic at compile time, generate FFT twiddle factor tables—all these results are optimized by the compiler into constants, directly embedded in the code or placed in the `.rodata` read-only data section (depending on optimization level and usage). -For example, under GCC 15.2.1 (`-std=c++20 -O2`), `kI_Squared` will be placed in the .rodata section as a constant, and accessing it is just a single memory load instruction. The `kTwiddleFactors` array will be fully compiled into the binary, and runtime access incurs no calculation overhead. If these values are inlined at their point of use, even the load instruction might be optimized away, becoming an immediate value. +For example, under GCC 15.2.1 (`-O3`), `c3` is placed in the `.rodata` section as a constant, and accessing it is just a single memory load instruction. The `twiddle_factors` array is fully compiled into the binary, and accessing it at runtime incurs no calculation overhead. If these values are inlined at the point of use, even the load instruction might be optimized away, becoming immediate values. ### Compile-Time Date Calculation -Another practical scenario is dates. 
Many protocols and time-related logic require validating the legitimacy of a date. We can move this validation to compile time. +Another practical scenario is dates. Many protocols and time-related logic require validating the legality of a date. We can move this validation to compile time. ```cpp struct Date { - int year; - int month; - int day; - - constexpr Date(int y, int m, int d) : year(y), month(m), day(d) - { - // 编译期验证日期合法性 - // 如果日期非法,触发编译错误(通过让表达式非恒常) - } - - constexpr bool is_leap_year() const - { - return (year % 4 == 0 && year % 100 != 0) || (year % 400 == 0); - } - - constexpr int days_in_month() const - { - constexpr int kDays[] = { - 0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 - }; - if (month == 2 && is_leap_year()) { - return 29; + unsigned short year; + unsigned char month; + unsigned char day; + + constexpr Date(unsigned short y, unsigned char m, unsigned char d) + : year(y), month(m), day(d) { + // Compile-time validation + if (m < 1 || m > 12) { + throw "Invalid month"; // Exception in constexpr context is a compile error } - return kDays[month]; - } - - constexpr bool is_valid() const - { - if (month < 1 || month > 12) return false; - if (day < 1 || day > days_in_month()) return false; - if (year < 0) return false; - return true; + // ... (leap year and day validation logic omitted) } }; -constexpr Date kEpoch{1970, 1, 1}; -static_assert(kEpoch.is_valid()); -static_assert(!kEpoch.is_leap_year()); - -constexpr Date kY2K{2000, 1, 1}; -static_assert(kY2K.is_leap_year(), "2000 is a leap year (divisible by 400)"); - -constexpr Date kLeapDay{2024, 2, 29}; -static_assert(kLeapDay.is_valid(), "2024-02-29 is valid (2024 is a leap year)"); - -// constexpr Date kInvalid{2023, 2, 29}; // 编译时不会直接报错 -// 需要用 static_assert 显式检查: -// static_assert(Date{2023, 2, 29}.is_valid()); // 编译错误! 
+// constexpr Date d{2023, 13, 1}; // Compile error: Invalid month ``` -There is a key point here: the `constexpr` constructor itself will not throw an error just because of "logically unreasonable" values. You need to proactively trigger a compile-time error in the constructor (for example, using `throw`, since in a `constexpr` context an exception is a compilation error), or use `static_assert` combined with `is_valid()` to perform the check. +Here is a key point: the `constexpr` constructor itself does not report an error just because the value is "logically unreasonable." You need to actively trigger a compile-time error in the constructor (e.g., with a `throw`: reaching a `throw` during constant evaluation is a compile error), or provide a `constexpr` validity check (such as an `is_valid()` member function) and verify it with `static_assert`. ### Compile-Time String Length -Having a member function return a compile-time usable value is also an important application of `constexpr`. For instance, a simple compile-time string wrapper class. +Making member functions return compile-time usable values is also an important application of `constexpr`. For example, a simple compile-time string wrapper class. ```cpp -#include - -struct ConstString { - const char* data; - std::size_t length; +struct ConstexprString { + const char* str; + std::size_t len; template - constexpr ConstString(const char (&str)[N]) : data(str), length(N - 1) - { - // N - 1 是因为字符串字面量的末尾有 '\0' - } - - constexpr char operator[](std::size_t i) const - { - return i < length ? 
data[i] : '\0'; - } + constexpr ConstexprString(const char (&s)[N]) : str(s), len(N - 1) {} - constexpr bool starts_with(char c) const - { - return length > 0 && data[0] == c; - } - - constexpr bool equals(const ConstString& other) const - { - if (length != other.length) return false; - for (std::size_t i = 0; i < length; ++i) { - if (data[i] != other.data[i]) return false; - } - return true; - } + constexpr std::size_t length() const { return len; } + constexpr char operator[](std::size_t i) const { return str[i]; } }; -constexpr ConstString kHello{"Hello"}; -static_assert(kHello.length == 5); -static_assert(kHello[0] == 'H'); -static_assert(kHello.starts_with('H')); -static_assert(kHello.equals(ConstString{"Hello"})); +constexpr ConstexprString msg = "Hello"; +static_assert(msg.length() == 5); ``` -This `ConstString` is essentially a simplified version of the `conststr` class from the official cppreference example. It does not own the string data; it merely holds a pointer and a length, but this is sufficient to perform many string operations at compile time. +This `ConstexprString` is essentially a simplified version of the `conststr` class from the official cppreference example (the same non-owning idea as `std::string_view`). It doesn't own the string data; it just holds a pointer and a length, but that is sufficient to perform many string operations at compile time. -## Step 4 — Restrictions Relaxed in C++14 +## Step 4 — Relaxations in C++14 As mentioned earlier, C++14 significantly relaxed the restrictions on `constexpr` constructors and member functions. Specifically for class types, the impact of these changes is: -In C++11, the function body of a `constexpr` constructor had to be empty—all initialization work could only be done through the member initializer list, and loops, conditional statements, or local variables were not allowed. 
This meant that if your construction logic was even slightly complex (such as needing to iterate over an array or set different values based on conditions), you had to find ways to work around the limitations using the ternary operator and recursive functions. +In C++11, the function body of a `constexpr` constructor had to be empty—all initialization work could only be done through member initializer lists; loops, conditional statements, or local variables were not allowed. This meant that if your construction logic was slightly complex (e.g., needing to iterate over an array or set different values based on conditions), you had to find ways to use ternary operators and recursive functions to bypass the limitations. -After C++14, you can write any statement permitted by `constexpr` inside the constructor. Local variables, `for` loops, and `if-else` are all fine. This made many previously impossible compile-time classes a reality. +After C++14, you can write any statement allowed by `constexpr` inside constructors. Local variables, `for` loops, `if` statements are all fine. This made many previously impossible compile-time classes a reality. 
```cpp -// C++11 风格:构造函数体必须为空 -struct OldStyle { - int values[4]; - - // 只能用初始化列表 - constexpr OldStyle(int a, int b, int c, int d) - : values{a, b, c, d} {} -}; +// C++14 allows local variables and logic in constexpr constructors +struct LookupTable { + int data[256]; -// C++14 风格:构造函数体可以有逻辑 -struct NewStyle { - int values[4]; - int sum; - - constexpr NewStyle(int base) : values{}, sum(0) - { - for (int i = 0; i < 4; ++i) { - values[i] = base + i; - sum += values[i]; + constexpr LookupTable() : data{} { + for (int i = 0; i < 256; ++i) { + data[i] = i * i; // Calculate squares at compile time } } }; -constexpr NewStyle kObj{10}; -static_assert(kObj.values[0] == 10); -static_assert(kObj.values[3] == 13); -static_assert(kObj.sum == 46); // 10+11+12+13=46 +constexpr LookupTable squares; // Fully constructed at compile time ``` -## Step 5 — constexpr Destructors (C++20 Preview) +## Step 5 — `constexpr` Destructors (C++20) -Before C++20, literal types required the destructor to be trivial. This meant you could not perform any cleanup work in the destructor. This restriction was lifted in C++20—you can now write `constexpr` destructors. +Before C++20, literal types required the destructor to be trivial. This meant you couldn't do any cleanup work in the destructor. This restriction was removed in C++20—you can write `constexpr` destructors. ```cpp -// C++20 才支持 -struct Resource { +struct ManagedBuffer { int* data; std::size_t size; - constexpr Resource(std::size_t n) : data{}, size(n) - { - // C++20 允许在 constexpr 上下文中使用 new - // 但分配的内存必须在常量求值结束前释放 - } + constexpr ManagedBuffer(std::size_t s) : data(new int[s]), size(s) {} - // C++20: constexpr 析构函数 - constexpr ~Resource() - { - // 清理逻辑 + constexpr ~ManagedBuffer() { + delete[] data; // Cleanup at compile time } }; + +// Usage in a constexpr context +constexpr auto create_buffer() { + ManagedBuffer buf{10}; // Allocates memory + // ... use buf ... 
+ return; // buf is destroyed, memory freed +} ``` -This feature is fully supported by mainstream compilers in C++20. GCC 10+, Clang 10+, and MSVC 19.28+ all support `constexpr` destructors. For most embedded scenarios, the main significance of `constexpr` destructors is that they allow standard containers like `std::vector` and `std::string` to participate more fully in compile-time computation—you can construct containers at compile time, manipulate elements, and then destroy them at compile time. +This feature is fully supported by mainstream compilers in C++20. GCC 10+, Clang 10+, and MSVC 19.28+ all support `constexpr` destructors. For most embedded scenarios, the main significance of `constexpr` destructors is that standard containers like `std::vector` and `std::string` can participate more fully in compile-time computation—you can construct containers, manipulate elements, and destroy them at compile time. -It is worth mentioning in passing the further relaxation of `constexpr` in C++23: `constexpr` functions no longer require their return type and parameter types to be literal types (P2448R2), and non-literal-type local variables, `goto` statements, and labels are also allowed. This means that starting from C++23, there are very few restrictions on defining `constexpr` functions. Of course, to actually call (evaluate) these functions at compile time, they are still subject to the rules of constant expression evaluation—you simply have more freedom when writing the function body. +It is worth mentioning that C++23 further relaxed `constexpr`: `constexpr` functions no longer require return types and parameter types to be literal types (P2448R2), and non-literal type local variables, `goto` statements, and labels are also allowed. This means starting from C++23, there are very few restrictions on defining `constexpr` functions. 
Of course, to actually call (evaluate) these functions at compile time, they are still subject to constant expression evaluation rules—you just have more freedom in writing the function body. ## Practical Application: Compile-Time Configuration in Embedded Systems -In embedded development, peripheral configuration is usually a set of fixed parameters—baud rate, data bits, stop bits, parity, and so on. We can use literal types to package these configurations as compile-time constants. +In embedded development, peripheral configuration is usually a bunch of fixed parameters—baud rate, data bits, stop bits, parity, etc. We can use literal types to package these configurations into compile-time constants. ```cpp -enum class Parity { kNone, kEven, kOdd }; -enum class StopBits { kOne, kTwo }; - -struct UartConfig { - std::uint32_t baud_rate; - std::uint8_t data_bits; - StopBits stop_bits; - Parity parity; - - constexpr UartConfig(std::uint32_t baud, std::uint8_t data, - StopBits stop, Parity par) - : baud_rate(baud), data_bits(data), stop_bits(stop), parity(par) {} - - constexpr bool is_valid() const - { - if (baud_rate == 0) return false; - if (data_bits < 5 || data_bits > 9) return false; - return true; +struct UARTConfig { + uint32_t baud_rate; + uint8_t data_bits; + uint8_t stop_bits; + uint8_t parity; + + constexpr UARTConfig(uint32_t br, uint8_t db, uint8_t sb, uint8_t p) + : baud_rate(br), data_bits(db), stop_bits(sb), parity(p) { + // Compile-time validation + if (br == 0) throw "Baud rate cannot be zero"; + if (db < 5 || db > 9) throw "Invalid data bits"; } - constexpr std::uint32_t compute_brr(std::uint32_t clock_freq) const - { - // 简化的波特率寄存器值计算(STM32 风格) - return clock_freq / baud_rate; + // Calculate hardware register value at compile time + constexpr uint32_t get_control_reg() const { + return (1 << 0) | (data_bits << 12) | (parity << 9); } }; -// 常用配置的编译期常量 -constexpr UartConfig kDebugUart{115200, 8, StopBits::kOne, Parity::kNone}; -constexpr UartConfig 
kGpsUart{9600, 8, StopBits::kOne, Parity::kNone}; +// Compile-time configuration +constexpr UARTConfig uart_cfg{115200, 8, 1, 0}; -static_assert(kDebugUart.is_valid()); -static_assert(kDebugUart.compute_brr(72000000) == 625); // 72MHz / 115200 +// Runtime usage (just write the pre-calculated register value) +void init_uart() { + UART->CTRL = uart_cfg.get_control_reg(); + // ... +} ``` -`kDebugUart` and `kGpsUart` complete all validation and calculation at compile time. If someone changes the baud rate to 0 or the data bits to 3, `static_assert` will blow up at compile time. The baud rate register value is also pre-calculated, so at runtime we simply write it directly to the register. +`uart_cfg` and `get_control_reg` complete all validation and calculation at compile time. If someone changes the baud rate to 0 or data bits to 3, the `throw` statement will cause a compile-time explosion. The baud rate register value is also pre-calculated, so at runtime, we just write it directly to the register. ## Common Pitfalls ### Blocking by Non-Trivial Destructors -If your class has a non-trivial destructor (for example, it manually manages resources), it cannot be a literal type before C++20. Even if your constructor is `constexpr`, a destructor that is not `constexpr` (or trivial) will prevent compile-time usage. A common workaround is to declare the destructor as `= default`, letting the compiler generate a trivial destructor—provided your class truly does not need custom destruction logic. +If your class has a non-trivial destructor (e.g., it manually manages resources), it cannot be a literal type before C++20. Even if your constructor is `constexpr`, if the destructor is not `constexpr` (or trivial), it will block compile-time usage. A common workaround is to declare the destructor as `= default`, letting the compiler generate a trivial destructor—provided your class indeed doesn't need custom destruction logic. 
### `mutable` Members -`mutable` data members can lead to some unexpected behavior. The `mutable` members of a `constexpr` object are treated as modifiable during compile-time evaluation, but this can cause compile-time evaluation to fail in certain contexts (because `mutable` breaks the semantic assumption that "the object is fully determined at compile time"). +`mutable` data members can lead to unexpected behavior. `mutable` members of a `constexpr` object are treated as modifiable during compile-time evaluation, but this can cause compile-time evaluation to fail in certain contexts (because `mutable` breaks the semantic assumption that "the object is fully determined at compile time"). ### Virtual Functions and Virtual Base Classes -Classes with virtual functions or virtual base classes can never be literal types (this remains true up to the current standard). If you need to use a type hierarchy at compile time, consider using CRTP (Curiously Recurring Template Pattern) to replace virtual functions. +Classes with virtual functions or virtual base classes can never be literal types (at least up to the current standard). If you need to use a type hierarchy at compile time, consider using CRTP (Curiously Recurring Template Pattern) to replace virtual functions. ## Summary -In this chapter, we covered the definition and constraints of literal types, how to write `constexpr` constructors, the use of `constexpr` member functions, and the gradual relaxation of these restrictions in C++14/C++20/C++23. The core takeaway is: as long as the memory layout and lifetime of your type can be fully determined at compile time, the compiler can construct and manipulate it at compile time. Types like compile-time complex numbers, dates, strings, and configuration structures can all become literal types, thereby participating in more complex compile-time computations. 
+In this chapter, we covered the definition and constraints of literal types, how to write `constexpr` constructors, the use of `constexpr` member functions, and the gradual relaxation of these restrictions in C++14/20/23. The key takeaway is: as long as your type's memory layout and lifetime can be fully determined at compile time, the compiler can construct and manipulate it then. Compile-time complex numbers, dates, strings, and configuration structures can all become literal types, thereby participating in more complex compile-time computations. -In the next chapter, we will introduce the `consteval` and `constinit` keywords added in C++20, and see how they precisely control the behavior of compile-time evaluation. +In the next chapter, we will introduce the `consteval` and `constinit` keywords added in C++20, and see how they precisely control compile-time evaluation behavior. ## Reference Resources diff --git a/documents/en/vol2-modern-features/ch11-user-defined-literals/01-udl-basics.md b/documents/en/vol2-modern-features/ch11-user-defined-literals/01-udl-basics.md index 5b2d65a13..ea52bc82d 100644 --- a/documents/en/vol2-modern-features/ch11-user-defined-literals/01-udl-basics.md +++ b/documents/en/vol2-modern-features/ch11-user-defined-literals/01-udl-basics.md @@ -1,6 +1,6 @@ --- -title: User-Defined Literal Basics -description: Raw/cooked forms of `operator""` and standard library literals +title: User-Defined Literal Fundamentals +description: operator"" Raw/Cooked Forms and Standard Library Literals chapter: 11 order: 1 tags: @@ -21,264 +21,178 @@ related: - UDL 实战 translation: source: documents/vol2-modern-features/ch11-user-defined-literals/01-udl-basics.md - source_hash: bf1662276299ecf1114d8fe0dac06e8ab11838f4108d20922adf6d969692ab9b - translated_at: '2026-05-26T11:36:34.727568+00:00' + source_hash: 79f96fecadfca38c1c66530fe58a6e4434c6ce14d6dc59e0fd46dcda20c1dd9e + translated_at: '2026-06-13T11:50:08.723988+00:00' engine: anthropic - token_count: 
2330 + token_count: 2455 --- -# User-Defined Literal Basics +# Basics of User-Defined Literals -When writing embedded code, we often run into frustrating scenarios: does the 1000 in `delay(1000)` represent milliseconds or microseconds? Is `9600` or `115200` the correct baud rate? Does `1024` refer to bytes or words? These "magic numbers" are not only hard to read but also error-prone. Even worse, conversions between different units rely entirely on manual calculation, leaving plenty of room for mistakes. +When writing embedded code, I often encounter frustrating scenarios: Is the `1000` in `delay(1000)` in milliseconds or microseconds? Is `Serial.begin(9600)` actually 9600 or 115200? Is `buffer[512]` in bytes or words? These "magic numbers" are not only hard to understand but also error-prone. Even worse, conversions between different units rely entirely on manual calculation by the programmer, where a single slip-up can cause problems. -**User-defined literals (UDL)**, introduced in C++11, exist to solve this problem. They allow us to define custom literal suffixes, such as `1000_ms`, `9600_baud`, and `1024_bytes`, making code more intuitive and safer. All conversions happen at compile time, resulting in zero runtime overhead. +**User-defined literals (UDL)**, introduced in C++11, are designed to solve this problem. They allow us to define our own literal suffixes, such as `100_ms`, `3.3_V`, or `16_kB`, making code more intuitive and safer. Furthermore, all conversions can be completed at compile time, resulting in zero runtime overhead. ------ -## The Four Forms of operator"" +## Four Forms of `operator""` -We define user-defined literals through the `operator""` suffix operator. Depending on the parameter type, there are several main definition forms, corresponding to integer literals, floating-point literals, string literals, and character literals: +User-defined literals are defined via the `operator""` suffix operator. 
Based on different parameter types, there are several main definition forms, covering integer literals, floating-point literals, string literals, and the raw (unparsed) numeric form: ```cpp -// 整数字面量(cooked 形式) -ReturnType operator""_suffix(unsigned long long value); - -// 浮点数字面量(cooked 形式) -ReturnType operator""_suffix(long double value); - -// 字符串字面量(raw 形式) -ReturnType operator""_suffix(const char* str, size_t length); - -// 字符字面量(cooked 形式) -ReturnType operator""_suffix(char c); +// Cooked integer: operator"" _suffix(unsigned long long int) +// Cooked floating-point: operator"" _suffix(long double) +// String literal: operator"" _suffix(const char*, std::size_t) +// Raw numeric: operator"" _suffix(const char*) ``` -There are two pairs of concepts to distinguish here: **cooked** and **raw**. Cooked literals are those that the compiler has already parsed and converted. For integer and floating-point types, the compiler parses them into numeric types before passing them to `operator""`. Raw literals receive the original character sequence without any parsing by the compiler. String literals only support the raw form, while integer literals support both the cooked (`unsigned long long`) and raw (`const char*`) forms. +Here, we need to distinguish two concepts: **cooked** and **raw**. Cooked literals refer to literals that have already been parsed and converted by the compiler—for integer and floating-point types, the compiler parses them into numeric types before passing them to `operator""`. Raw literals receive the original character sequence of the number, and the compiler performs no parsing. String literals are always passed as a pointer plus a length, while integer literals support both cooked (`unsigned long long int`) and raw (`const char*` or a `template <char...>` character pack) forms. 
-Let's start with a simple example: +Let's start with a simplest example: ```cpp -#include - -struct Milliseconds { - std::uint64_t value; - constexpr explicit Milliseconds(std::uint64_t v) : value(v) {} +struct Duration { + unsigned long long int microseconds; }; -constexpr Milliseconds operator""_ms(unsigned long long v) { - return Milliseconds{v}; +constexpr Duration operator"" _us(unsigned long long int us) { + return Duration{us}; } -void delay(Milliseconds ms); +void delay(Duration d); -void example() { - delay(500_ms); // 清晰:500 毫秒 - // delay(500); // 编译错误!必须明确单位 -} +// Usage +delay(1000_us); // 1000 microseconds ``` -`1000_ms` is parsed by the compiler, which calls `operator""_ms`, returning a `Milliseconds` object. The function signature `explicit Milliseconds(uint32_t ms)` only accepts parameters with units—bare integers won't compile, and the compiler will directly report an error. This is where type safety comes from. +`1000_us` is parsed by the compiler, which calls `operator""_us`, returning a `Duration` object. The function signature `void delay(Duration d)` only accepts parameters with units—you cannot pass a bare integer, and the compiler will report an error directly. This is the source of type safety. 
### Integer and Floating-Point Overloads -We can define separate overloads for integer and floating-point types, allowing the same suffix to behave differently in different contexts: +You can define overloads for integer and floating-point types separately, allowing the same suffix to behave differently in different contexts: ```cpp -struct Frequency { - std::uint32_t hz; - constexpr explicit Frequency(std::uint32_t v) : hz(v) {} -}; - -// 整数版本:100_Hz -constexpr Frequency operator""_Hz(unsigned long long value) { - return Frequency{static_cast(value)}; +void operator"" _temp(long double kelvin) { + // Handle floating-point temperature } -// 浮点版本:1.5_kHz -constexpr Frequency operator""_kHz(long double value) { - return Frequency{static_cast(value * 1000.0)}; -} - -void example() { - auto f1 = 100_Hz; // 整型版本,f1.hz = 100 - auto f2 = 1.5_kHz; // 浮点版本,f2.hz = 1500 +void operator"" _temp(unsigned long long int kelvin) { + // Handle integer temperature } ``` ### String Literals -String literal operators receive a pointer to the string and its length, which can be used for compile-time string processing: +String literal operators receive a pointer to a string and its length, which can be used for compile-time string processing: ```cpp -#include - -/// FNV-1a 哈希(编译期) -constexpr std::uint32_t hash_string( - const char* str, std::uint32_t value = 2166136261u) { - return *str - ? 
hash_string(str + 1, - (value ^ static_cast(*str)) * 16777619u) - : value; -} - -constexpr std::uint32_t operator""_hash( - const char* str, std::size_t len) { - return hash_string(str); +constexpr std::size_t operator"" _hash(const char* str, std::size_t len) { + return std::hash{}(std::string_view{str, len}); } -void example() { - constexpr auto id1 = "temperature"_hash; - constexpr auto id2 = "humidity"_hash; - static_assert(id1 != id2); -} +// Usage +constexpr auto id = "sensor_start"_hash; // Compile-time hash ``` -In embedded development, we can use this to implement efficient event IDs, message type identifiers, and more—strings are converted to integers at compile time, achieving zero runtime overhead. +In embedded systems, this can be used to implement efficient event IDs and message type identifiers—strings are converted to integers at compile time, with zero runtime overhead. ### Raw Integer Literals -Integer literals also have a raw form that accepts a `const char*`, allowing us to handle formats not natively supported by the compiler: +Integer literals also have a raw form, accepting a character sequence template parameter, allowing you to handle formats not natively supported by the compiler: ```cpp -#include - -struct Binary { - std::uint64_t value; -}; - -constexpr Binary operator""_bin(const char* str, std::size_t length) { - std::uint64_t value = 0; - for (std::size_t i = 0; i < length; ++i) { - value = value * 2; - if (str[i] == '1') value += 1; - } - return Binary{value}; +template +constexpr unsigned long long int operator"" _bin() { + // Parse Chars... as binary + return parse_binary(); } -void example() { - auto b1 = 1010_bin; // 10 - auto b2 = 11111111_bin; // 255 -} +// Usage +auto value = 1010_bin; // Custom binary literal ``` -This raw form was especially useful before C++14, since binary literals (`0b...`) were only introduced in C++14. 
Although the standard now supports them, the raw form can still be used to implement custom base conversions. +This raw form was very useful before C++14—because C++14 introduced the `0b` binary literal. Although the standard now supports it, the raw form can still be used to implement custom base conversions. ------ ## Standard Library Literals -C++14 introduced a batch of commonly used literal suffixes into the standard library. To use them, we need to bring the corresponding namespaces into scope with `using namespace`. These suffixes do not have an underscore prefix—because they reside within the `std::` namespace, they are reserved for the standard library. +C++14 introduced a batch of commonly used literal suffixes into the standard library. To use them, you need to introduce the corresponding namespaces via `using namespace`. These suffixes do not have an underscore prefix—because they are within the `std::literals` namespace, they are reserved for the standard library. ### chrono Literals (C++14) ```cpp -#include - -using namespace std::chrono_literals; - -void example() { - auto t1 = 1s; // std::chrono::seconds{1} - auto t2 = 500ms; // std::chrono::milliseconds{500} - auto t3 = 2us; // std::chrono::microseconds{2} - auto t4 = 100ns; // std::chrono::nanoseconds{100} - auto t5 = 1min; // std::chrono::minutes{1} - auto t6 = 1h; // std::chrono::hours{1} +using namespace std::literals::chrono_literals; - auto total = 1s + 500ms; // 1500ms -} +auto timeout = 100ms; +auto interval = 5s; ``` ### string Literals (C++14) ```cpp -#include - -using namespace std::string_literals; +using namespace std::literals::string_literals; -void example() { - auto s1 = "hello"s; // std::string - auto s2 = L"wide"s; // std::wstring - auto s3 = u"utf16"s; // std::u16string - auto s4 = U"utf32"s; // std::u32string -} +auto s = "hello"s; // std::string ``` ### complex Literals (C++14) ```cpp -#include - -using namespace std::complex_literals; +using namespace 
std::literals::complex_literals; -void example() { - auto c1 = 3.0 + 4.0i; // std::complex{3.0, 4.0} - auto c2 = 1.0i; // 虚数单位 -} +auto c = 3.0i; // Imaginary number ``` ### string_view Literals (C++17) ```cpp -#include - -using namespace std::string_view_literals; +using namespace std::literals::string_view_literals; -void example() { - auto sv = "hello"sv; // std::string_view -} +auto sv = "data"sv; // std::string_view ``` ------ ## Naming Rules -Regarding the naming of UDL suffixes, the C++ standard has clear rules: +Regarding the naming of UDL suffixes, the C++ standard has clear regulations: -**Suffixes not starting with an underscore are reserved for the standard library**. Therefore, suffixes without underscores like `s`, `ms`, and `i` can only be defined by the standard library. User-defined suffixes **must start with an underscore (`_`)**, such as `_ms`, `_us`, `_hz`. +**Suffixes not starting with an underscore are reserved for the standard library**. Therefore, suffixes like `ms`, `s`, `il`, which do not require an underscore, can only be defined by the standard library. User-defined suffixes **must start with an underscore**, such as `_ms`, `_Hz`, `_kB`. -Additionally, identifiers starting with `__` (double underscore) or containing `__` are reserved for the implementation (compiler) and must not be used. +Additionally, identifiers starting with `__` (double underscore) or containing `__` are reserved for the implementation (compiler) and cannot be used. -The recommended naming style is an underscore followed by a short but clear suffix: `_ms`, `_us`, `_hz`, `_baud`, `_bytes`, `_kb`, `_mv`, `_percent`. When defining them in header files, always place them inside a namespace to avoid polluting the global namespace: +The recommended naming style is to use an underscore `_` followed by a short but clear suffix: `_ms`, `_us`, `_Hz`, `_ohm`, `_V`, `_A`, `_mA`, `_kB`. 
When defining in a header file, be sure to place them within a namespace to avoid polluting the global namespace: ```cpp -namespace mylib::literals { - constexpr Milliseconds operator""_ms(unsigned long long v) { - return Milliseconds{v}; - } +namespace my_literals { + constexpr Duration operator"" _ms(unsigned long long int); } - -// 使用时 -using namespace mylib::literals; -auto t = 500_ms; +using namespace my_literals; ``` ------ -## Compile-Time vs. Runtime +## Compile-Time vs Runtime -UDLs combined with `constexpr` enable pure compile-time unit conversion, which is one of their most powerful features. Always mark literal operators as `constexpr` so that `1000_ms` is optimized into a constant by the compiler, with zero runtime overhead: +UDL combined with `constexpr` can achieve pure compile-time unit conversion, which is one of its most powerful features. Always mark literal operators as `constexpr`, so that `1000_ms` is optimized by the compiler into a constant with no runtime overhead: ```cpp -constexpr Milliseconds operator""_ms(unsigned long long v) { - return Milliseconds{v}; +constexpr Duration operator"" _ms(unsigned long long int val) { + return Duration{val * 1000}; // Compile-time multiplication } -constexpr auto startup_delay = 100_ms; -// startup_delay 在编译期就已经构造好了 -// 生成的代码等价于直接写 Milliseconds{100} +// Usage +constexpr auto d = 5_ms; // No runtime calculation ``` -If we don't mark it `constexpr`, the literal operator becomes a normal function call. Although the overhead is minimal after inlining, we lose the ability to perform compile-time calculations and can no longer use it in `static_assert` or as template arguments. +If you don't mark it `constexpr`, the literal operator becomes a normal function call—although the overhead is small after inlining, you lose the ability for compile-time computation and cannot use it for `constexpr` variables or template parameters. 
C++20 introduced `consteval`, which forces the literal operator to execute only at compile time: ```cpp -consteval Milliseconds operator""_ms(unsigned long long v) { - return Milliseconds{v}; +consteval Duration operator"" _ms(unsigned long long int val) { + return Duration{val * 1000}; } - -constexpr auto t1 = 100_ms; // OK,编译期执行 -// 注意:consteval 要求字面量必须是编译期常量 -// 例如:std::stoi("123")_ms 会编译失败,因为 stoi 不是 constexpr ``` ------ @@ -287,133 +201,80 @@ constexpr auto t1 = 100_ms; // OK,编译期执行 ### Suffix Naming Conflicts -If we define a `_ms` suffix in a header file, and another library defines a similarly named `_ms` with a different implementation, we will encounter ambiguity at link time. The solution is to use a unique prefix for suffixes, or always use fully qualified namespace specifiers. +If you define a `_ms` suffix in a header file, and another library also defines a `_ms` with a different implementation, ambiguity will arise during linking. The solution is to use a unique prefix for your suffixes or always use full namespace qualification. ### Floating-Point Precision -Floating-point UDLs can have precision issues. `0.1f` might not equal `0.1` in floating-point arithmetic. The solution is to use integers for representation—for example, storing millivolts instead of volts: +Floating-point UDLs may have precision issues. Decimal values such as `0.1` have no exact binary representation, so `0.1 + 0.2` may not exactly equal `0.3` in floating-point arithmetic. 
The solution is to use integers for representation—for example, storing millivolts instead of volts: ```cpp -struct Voltage { - std::int64_t millivolts; // 用整数存储 -}; - -constexpr Voltage operator""_V(long double value) { - return Voltage{ - static_cast(value * 1000.0 + 0.5)}; +constexpr int operator"" _mV(long double val) { + return static_cast(val * 1000); } - -constexpr auto v1 = 0.1_V + 0.2_V; -constexpr auto v2 = 0.3_V; -static_assert(v1.millivolts == v2.millivolts); // OK ``` ### Operator Precedence ```cpp -auto x = 100_km / 2 * 3; // (100_km / 2) * 3 = 150_km -auto y = 100_km / (2 * 3); // 100_km / 6 ≈ 16.67_km +auto result = 5_ms + 100_us; // OK +auto result = 5_ms * 2; // OK ``` -Literal operators have the same precedence as normal operators, associating from left to right. We need to be careful to add parentheses when writing complex expressions. +Literal operators have the same precedence as normal operators and associate left-to-right. Pay attention to parentheses when writing complex expressions. ### Integer Overflow -Unit conversions involving large numbers might overflow. If a UDL involves multiplication (such as multiplying by 1,000,000 in `_us`), we need to consider the upper limit of `uint64_t` (approximately 1.8 * 10^19) and document the range limits. Note that integer overflow is **undefined behavior (UB)** in C++, and the compiler might not emit a warning. +Unit conversion of large numbers might overflow. If your UDL involves multiplication (like multiplying by 1,000,000 in `_s`), consider the upper limit of `unsigned long long int` (approx 1.8 * 10^19) and note the range limitations in your documentation. Note that integer overflow is **undefined behavior** in C++, and the compiler may not issue a warning. 
------ ## General Examples -Finally, let's look at a few commonly used literal definitions that we can directly drop into our projects: +Finally, let's look at several commonly used literal definitions that you can directly apply to your project: ```cpp -#include - -namespace mylib::literals { - -// ===== 时间单位 ===== -struct Milliseconds { std::uint64_t value; }; -struct Microseconds { std::uint64_t value; }; -struct Seconds { std::uint64_t value; }; - -constexpr Milliseconds operator""_ms(unsigned long long v) { - return Milliseconds{v}; -} -constexpr Microseconds operator""_us(unsigned long long v) { - return Microseconds{v}; -} -constexpr Seconds operator""_s(unsigned long long v) { - return Seconds{v}; -} - -// ===== 频率单位 ===== -struct Hertz { std::uint32_t value; }; - -constexpr Hertz operator""_Hz(unsigned long long v) { - return Hertz{static_cast(v)}; -} -constexpr Hertz operator""_kHz(long double v) { - return Hertz{static_cast(v * 1000.0)}; -} -constexpr Hertz operator""_MHz(long double v) { - return Hertz{static_cast(v * 1000000.0)}; -} - -// ===== 内存单位 ===== -struct Bytes { std::uint64_t value; }; - -constexpr Bytes operator""_B(unsigned long long v) { - return Bytes{v}; -} -constexpr Bytes operator""_KiB(unsigned long long v) { - return Bytes{v * 1024}; -} -constexpr Bytes operator""_MiB(unsigned long long v) { - return Bytes{v * 1024 * 1024}; -} - -// ===== 温度单位 ===== -struct Celsius { double value; }; -struct Fahrenheit { double value; }; - -constexpr Celsius operator""_degC(long double v) { - return Celsius{static_cast(v)}; -} -constexpr Fahrenheit operator""_degF(long double v) { - return Fahrenheit{static_cast(v)}; -} -constexpr Celsius operator""_degK(long double v) { - return Celsius{static_cast(v - 273.15)}; +namespace app { + namespace literals { + // Time + constexpr uint64_t operator"" _hz(unsigned long long int hz) { return hz; } + constexpr uint64_t operator"" _khz(unsigned long long int khz) { return khz * 1000; } + constexpr uint64_t 
operator"" _mhz(unsigned long long int mhz) { return mhz * 1000000; } + + // Voltage + constexpr uint32_t operator"" _mv(long double v) { return static_cast(v * 1000); } + + // Memory + constexpr size_t operator"" _kb(unsigned long long int kb) { return kb * 1024; } + constexpr size_t operator"" _mb(unsigned long long int mb) { return mb * 1024 * 1024; } + } } +using namespace app::literals; -// ===== 角度单位 ===== -struct Degrees { double value; }; +// Usage +I2C_Init(400_khz); +ADC_SetRef(3300_mv); // 3.3V in mV +uint8_t buffer[64_kb]; +``` -constexpr Degrees operator""_deg(long double v) { - return Degrees{static_cast(v)}; -} -constexpr Degrees operator""_rad(long double v) { - return Degrees{static_cast(v * 180.0 / 3.14159265358979323846)}; -} +When using them: -} // namespace mylib::literals +```cpp +Timer_SetPrescaler(72_mhz); +UART_Init(115200_hz); ``` -Usage: +Every number is followed by its unit, so the code almost needs no comments (it's truly satisfying to look at!). -```cpp -using namespace mylib::literals; +## Summary -auto delay_time = 100_ms; -auto sys_clock = 72_MHz; -auto buffer_size = 4_KiB; -auto room_temp = 25.0_degC; -auto angle = 3.14159_rad; -``` +User-defined literals essentially use compile-time capabilities to dress "bare numbers" in units—`1000_hz`, `3.3_v`, `64_kb` are understood at a glance, and all conversions are completed at compile time with zero runtime overhead. Remember these key points: -Every number carries its unit right beside it, making the code almost self-documenting (it really is satisfying to read!) +- `operator""` has four cooked forms (`unsigned long long int` / `long double` / `char` / `const char*`) plus one raw form (character sequence template). Daily use of cooked is sufficient; only use raw when you need to parse custom numeric syntax (binary, thousand separators). +- Suffixes **must start with an underscore** (`_ms`). 
Suffixes without underscores (`ms`) are reserved for the standard library; using them yourself will eventually lead to trouble. +- Use the existing ones in the standard library first (`std::literals`'s `ms`, `s`, `sv`), and define your own only if they are not enough. +- Literals are compile-time constants, so you can safely put them into `constexpr`, template parameters, and array sizes. +The cost is almost zero, and the benefit is eliminating the question "what unit is this number?" from code reviews. How to organize a full set of literal libraries in a real project will be expanded in the UDL in Practice article. ## Reference Resources diff --git a/documents/en/vol4-advanced/03-empty-base-optimization.md b/documents/en/vol4-advanced/03-empty-base-optimization.md new file mode 100644 index 000000000..04d4da62d --- /dev/null +++ b/documents/en/vol4-advanced/03-empty-base-optimization.md @@ -0,0 +1,127 @@ +--- +chapter: 11 +cpp_standard: +- 11 +- 14 +- 17 +- 20 +description: Introduces empty base optimization (EBO) and C++20 [[no_unique_address]] +difficulty: intermediate +order: 6 +platform: host +prerequisites: +- 'Chapter 2: 零开销抽象' +reading_time_minutes: 6 +tags: +- host +- cpp-modern +- intermediate +- 零开销抽象 +title: EBO (Empty Base Optimization) +translation: + source: documents/vol4-advanced/03-empty-base-optimization.md + source_hash: 3489c25ee12064211c70c3b43127eeb31d5a3080a8648c62ff6c3f9258fe0ee1 + translated_at: '2026-06-13T11:50:22.052740+00:00' + engine: anthropic + token_count: 840 +--- +# Empty Base Optimization (EBO): A C++ Slimming Technique + +There is a low-profile yet efficient memory optimization that silently saves bytes for you in places you rarely notice—**Empty Base Optimization (EBO)**. When writing libraries, we often use empty classes as "policies, tags, or stateless behavior objects." EBO allows these stateless base classes to be squeezed out of the object layout, saving space and improving locality. 
+ +------ + +## TL;DR + +- **EBO allows the compiler to omit the storage for empty base class subobjects (i.e., they take up no extra bytes), thereby reducing the size of the derived class.** +- **Empty member variables cannot be compressed by EBO by default, but C++20 introduced `[[no_unique_address]]` to achieve similar compression effects for members.** +- **Do not rely on object address uniqueness to identify empty subobjects—their addresses might be identical (a permitted side effect of this optimization), and assumptions about addresses can lead to bugs.** +- In practice: library implementations often use "inheriting from empty policy classes" or "compressed pair" tricks. C++20 makes things cleaner, but understanding traditional EBO is still very useful. + +------ + +## Concepts: Starting with a Real-World Analogy + +Imagine a container object with two members: one is a warehouse that actually holds things (like `std::vector` or a pointer), and the other is an empty "tag"—representing behavior only, with no data. Intuitively, you might allocate space for each member, but the language standard allows the compiler to place the "empty tag" base class subobject in a location that takes up no extra space (for example, reusing the first byte of the derived object). This makes the derived object smaller and more cache-friendly—this is the core of EBO. + +The standard imposes the requirement that "the most derived object must have non-zero size" on the most derived object itself, but **base class subobjects are not subject to this restriction**. The compiler can treat the size of an empty base class subobject as zero (i.e., occupying no extra bytes). This is the legal basis for EBO. 
+ +------ + +## Simple Example + +```cpp +struct Empty {}; + +struct DataMember { + Empty e; + int x; +}; + +struct BaseInherit : Empty { + int x; +}; + +static_assert(sizeof(DataMember) > sizeof(int)); +static_assert(sizeof(BaseInherit) == sizeof(int)); // EBO usually applies here +``` + +In the example above, `e` in `DataMember` is a data member. According to language rules, it must occupy non-zero bytes (to ensure semantics like array indexing work). However, `BaseInherit` inherits from `Empty` as a base class. The compiler can "compress" it into `BaseInherit`'s layout, so `sizeof(BaseInherit)` typically equals `sizeof(int)` (details may vary by compiler/ABI). + +------ + +## Why Do We Often See the "Inherit from Empty Class" Pattern in the STL/Libraries? + +In the standard library, types like allocators, comparators, and deleters are often stateless empty classes. If used as members, they waste space. If used as base classes (usually **private inheritance**), EBO is enabled, saving object size. Many implementations wrap scenarios like "pointer + empty deleter" into "compressed pair" or similar utilities to achieve minimal object size. Microsoft's STL blog and other implementations demonstrate the prevalence of this approach. + +------ + +## C++20: `[[no_unique_address]]` Makes "Empty Member Optimization" Formal and Safe + +Traditional EBO can only be achieved through inheritance (members cannot be compressed). The `[[no_unique_address]]` attribute introduced in C++20 allows **members** to share addresses with other subobjects (i.e., allowing zero-size semantics), achieving EBO-like effects with member syntax. This makes the code more intuitive and semantically clearer. For example: + +```cpp +struct Modern { + [[no_unique_address]] Empty e; + int x; +}; +// sizeof(Modern) is likely equal to sizeof(int) +``` + +This looks better than private inheritance and avoids potential interface exposure brought by inheritance. 
cppreference and some implementation articles summarize the semantics and constraints of `[[no_unique_address]]`. It is highly recommended to prioritize this when C++20 is available. + +------ + +## Common Misconceptions and Pitfalls (Must Read) + +- **"Empty class subobjects definitely don't have an address"—Wrong.** The standard allows base class subobjects to share the starting address with the most derived object. This means the address of a base class subobject might be the same as another subobject (or the object as a whole). Do not write code that relies on the uniqueness of subobject addresses. +- **Why can't `std::unique_ptr` directly utilize EBO?** Because `std::unique_ptr` uses the deleter and pointer as **members**, not empty base classes. Traditional EBO cannot apply to members (unless using `[[no_unique_address]]` or changing the implementation to a compressed-pair style). This is why internal implementation tricks like "compressed pair" exist. +- **Multiple empty base classes can sometimes interfere with each other**: If you inherit from multiple empty types, the compiler will try to apply EBO for them. However, in certain cases (such as duplicate base types, or identical types caused by ABI or nested templates), the optimization may be restricted. A common practice is to make each empty base class type "unique" to the compiler (e.g., via template parameterization) to ensure compression takes effect. Some people call this "making base class types distinct." + +------ + +## Practical Advice + +1. **Don't optimize prematurely by default**: It's fine to write policy classes as empty classes using members or inheritance; prioritize readability. +2. **If minimal memory is required or you are implementing libraries (like smart pointers, containers), prioritize `[[no_unique_address]]` (C++20) or controlled private inheritance EBO tricks.** C++20 makes the code more intuitive. +3. 
**Don't rely on object or subobject address uniqueness**: When writing debugging, serialization, or comparison logic, avoid using addresses to distinguish empty subobjects. Addresses might be identical, and the standard permits this reuse. + +------ + +## Online Demo + +Run the EBO example online to compare the `sizeof` changes when an empty class is used as a member versus a base class: + + + +## Summary + +EBO is a "visible yet subtle" micro-optimization in C++ that stops empty policy classes from wasting bytes. Historically, we implemented EBO using private inheritance. Modern C++ (C++20) uses `[[no_unique_address]]` to compress empty members as well, making code more intuitive and safe. In actual engineering, prioritize writing clear, maintainable code: when object size is sensitive, use tricks like EBO, `[[no_unique_address]]`, or compressed-pair to manually optimize, and verify the behavior on the target compiler. diff --git a/documents/en/vol4-advanced/05-spaceship-operator.md b/documents/en/vol4-advanced/05-spaceship-operator.md index a1e635f80..d9fe648a5 100644 --- a/documents/en/vol4-advanced/05-spaceship-operator.md +++ b/documents/en/vol4-advanced/05-spaceship-operator.md @@ -1,7 +1,7 @@ --- -title: Three-Way Comparison Operator (C++20 Spaceship Operator) -description: 'A detailed guide to the C++20 spaceship operator: simplifying comparison - logic for custom types' +title: Three-way comparison operator (C++20 Spaceship Operator) +description: 'Detailed Explanation of the C++20 Three-Way Comparison Operator: Simplifying + Comparison Logic for Custom Types' chapter: 11 order: 5 tags: @@ -18,16 +18,16 @@ cpp_standard: platform: host translation: source: documents/vol4-advanced/05-spaceship-operator.md - source_hash: 309aec88ffa5b2e75764586a11abacfb46df85a90774f10b48f3e9ce25333e28 - translated_at: '2026-05-26T11:39:30.086533+00:00' + source_hash: 968dba94e12efb78827b9f24621a35362acdf36dd79af278d44399a7a87cc4f6 + translated_at: 
'2026-06-13T11:50:40.195404+00:00' engine: anthropic - token_count: 7027 + token_count: 7328 --- -# Modern C++ for Embedded Systems — Three-Way Comparison Operator +# Modern Embedded C++ Development — Three-Way Comparison Operator ## Introduction -Have you ever struggled with comparison operators when writing embedded code? +Have you ever found yourself frustrated with comparison operators while writing embedded code? ```cpp class SensorReading { @@ -69,21 +69,21 @@ public: }; ``` -This is a disaster! To implement a fully sortable type, we need to write six comparison operators, and they have complex interdependencies. Worse, if we modify member variables, we must update all these operators in sync. +This is a disaster! To implement a fully sortable type, you need to write six comparison operators, and they have complex interdependencies. Worse yet, if you modify member variables, you must synchronize all these operators. -The **three-way comparison operator** introduced in C++20, commonly known as the **spaceship operator** (Spaceship Operator `<=>`), was created to solve this problem. +The **Three-way Comparison Operator** introduced in C++20, commonly known as the **Spaceship Operator** (`<=>`), was designed to solve this problem. -> In a nutshell: **The three-way comparison operator automatically generates all six comparison operators from a single definition, drastically simplifying comparison logic for custom types.** +> TL;DR: **The three-way comparison operator automatically generates all six comparison operators with a single definition, significantly simplifying comparison logic for custom types.** In embedded development, this feature is particularly useful: 1. Sorting sensor data by time or priority -2. Comparing firmware version numbers (complex versions with letter suffixes) +2. Firmware version comparison (complex versions with alphanumeric suffixes) 3. Lexicographical comparison of configuration parameters -4. Sorting tasks in a priority queue +4. 
Task sorting in priority queues ------ -**Warning**: As of 2024, GCC 10+, Clang 10+, and MSVC 2019+ fully support the three-way comparison operator. If we are using an older compiler, we may need to upgrade or use an alternative approach. +**Warning**: As of 2024, GCC 10+, Clang 10+, and MSVC 2019+ fully support the three-way comparison operator. If your compiler is older, you may need to upgrade or use an alternative solution. ------ @@ -108,7 +108,7 @@ struct Point { }; ``` -### Return Type +### Return Value Type The return value of the three-way comparison operator is not `bool`, but rather a "comparison category" representing the result: @@ -128,7 +128,7 @@ else { /* a > b */ } ### Testing Comparison Results -The returned comparison category can be compared against 0, or we can use named methods: +The returned comparison category can be compared against 0, or you can use named methods: ```cpp #include @@ -151,15 +151,15 @@ int main() { ``` ------ -**Best Practice**: Use `<`, `==`, and `>` directly to evaluate comparison results, rather than calling named methods. This keeps the code more concise and works with all comparison categories. +**Best Practice**: Use `<`, `==`, and `>` directly to judge comparison results instead of calling named methods. This keeps code concise and works for all comparison categories. 
------ -## Auto-Generating Comparison Functions +## Automatic Generation of Comparison Functions -### Using =default to Auto-Generate +### Automatic Generation using =default -The simplest approach is to use `= default` to let the compiler automatically generate all comparison operators: +The simplest usage is to use `= default` to let the compiler automatically generate all comparison operators: ```cpp #include @@ -178,7 +178,7 @@ struct SensorReading { }; ``` -Now we can use all comparison operators: +Now you can use all comparison operators: ```cpp SensorReading s1{1, 100, 1000}; @@ -204,7 +204,7 @@ std::sort(sensors.begin(), sensors.end()); ### Comparison Order -The default-generated `<=>` performs lexicographical comparison in the **order of member declaration**: +The default generated `<=>` performs lexicographical comparison according to the **member declaration order**: ```cpp struct Version { @@ -227,22 +227,22 @@ Version v3{1, 3, 0}; ``` ------ -**Note**: The order of member variables matters! If we want to compare in a specific order, we need to adjust the declaration order of the member variables. +**Note**: The order of member variables matters! If you wish to compare in a specific order, you need to adjust the declaration order of the member variables. ------ -## Comparison Categories in Detail +## Deep Dive into Comparison Categories -C++20 defines three comparison categories to represent different strengths of comparison relationships. +C++20 defines three comparison categories to represent comparison relationships of different strengths. ### strong_ordering: Strong Ordering -`strong_ordering` represents the strongest comparison relationship, with the following properties: +`strong_ordering` represents the strongest comparison relationship with the following properties: 1. **Equivalence implies equality**: `a == b` if and only if all members of `a` and `b` are equal -2. 
**Substitutability**: When `a == b`, `f(a) == f(b)` holds for any function `f` +2. **Substitutability**: If `a == b`, then `f(a) == f(b)` holds for any function `f` -Use cases: integers, strings, simple value types +Use cases: Integers, strings, simple value types ```cpp #include @@ -268,20 +268,20 @@ static_assert((c <=> a) == std::strong_ordering::greater); `std::strong_ordering` has three possible values: | Value | Meaning | -|-------|---------| +|-----|------| | `std::strong_ordering::less` | Less than | | `std::strong_ordering::equal` | Equal | | `std::strong_ordering::greater` | Greater than | -| `std::strong_ordering::equivalent` | Equivalent (for strong ordering, identical to equal) | +| `std::strong_ordering::equivalent` | Equivalent (for strong ordering, equivalent to equal) | ### partial_ordering: Partial Ordering -`partial_ordering` represents cases where values may be "incomparable": +`partial_ordering` represents cases where values might be "incomparable": -1. Some values may not be comparable to each other (e.g., `NaN`) +1. Some values cannot be compared (e.g., `NaN`) 2. Equivalence does not imply equality -Use cases: floating-point numbers (due to `NaN`), ranges with permissible values +Use cases: Floating-point numbers (existence of `NaN`), ranges with allowed values ```cpp #include @@ -311,7 +311,7 @@ static_assert((a <=> b) == std::partial_ordering::less); `std::partial_ordering` has four possible values: | Value | Meaning | -|-------|---------| +|-----|------| | `std::partial_ordering::less` | Less than | | `std::partial_ordering::equivalent` | Equivalent | | `std::partial_ordering::greater` | Greater than | @@ -319,12 +319,12 @@ static_assert((a <=> b) == std::partial_ordering::less); ### weak_ordering: Weak Ordering -`weak_ordering` falls between strong ordering and partial ordering: +`weak_ordering` falls between strong and partial ordering: 1. 
Equivalence does not imply equality (there may be indistinguishable alternative representations) -2. But all values are comparable (no `unordered` exists) +2. But all values are comparable (no `unordered`) -Use cases: case-insensitive strings, comparisons that ignore certain fields +Use cases: Case-insensitive strings, comparisons ignoring certain fields ```cpp #include @@ -373,12 +373,12 @@ static_assert(!(s1 == s2)); // 不相等! `std::weak_ordering` has three possible values: | Value | Meaning | -|-------|---------| +|-----|------| | `std::weak_ordering::less` | Less than | | `std::weak_ordering::equivalent` | Equivalent | | `std::weak_ordering::greater` | Greater than | -### Choosing Among the Three Comparison Categories +### Choosing Between Comparison Categories ```cpp #include @@ -440,11 +440,11 @@ graph TD ``` ------ -**Important**: When using `= default`, the compiler automatically selects the most appropriate comparison category based on the member types. If all members support `strong_ordering`, the generated result is `strong_ordering`. +**Important**: When using `= default`, the compiler automatically selects the most appropriate comparison category based on member types. If all members support `strong_ordering`, the generated result is `strong_ordering`. 
------ -## Practical Embedded Scenarios +## Embedded Scenarios in Practice ### Scenario 1: Sensor Data Priority Sorting @@ -513,9 +513,9 @@ void message_queue_example() { } ``` -### Scenario 2: Firmware Version Number Comparison +### Scenario 2: Firmware Version Comparison -Firmware version numbers can have complex formats, such as letter suffixes: +Firmware version numbers may have complex formats, such as alphanumeric suffixes: ```cpp #include @@ -595,7 +595,7 @@ void version_comparison() { ### Scenario 3: Configuration Parameter Comparison (Allowing Partial Equality) -In configuration systems, we might want to compare only certain key fields: +In configuration systems, we might only want to compare specific key fields: ```cpp #include @@ -661,7 +661,7 @@ void config_example() { ### Scenario 4: Sensor Data with NaN -Some sensors may return invalid data (similar to the NaN concept): +Some sensors might return invalid data (similar to the NaN concept): ```cpp #include @@ -727,7 +727,7 @@ void sensor_with_invalid_values() { ### Scenario 5: Multi-Level Sensor Alerts -Alert systems need to sort across multiple dimensions: +Alert systems need sorting across multiple dimensions: ```cpp #include @@ -806,11 +806,11 @@ void alarm_system() { ------ -## Custom Three-Way Comparison Implementations +## Custom Three-Way Comparison Implementation -### Manually Implementing Multi-Field Comparisons +### Manual Multi-Field Comparison -When the default lexicographical order doesn't meet our needs, we need to implement it manually: +When the default lexicographical order doesn't meet requirements, manual implementation is needed: ```cpp #include @@ -875,7 +875,7 @@ struct Task { }; ``` -For C++20, we can implement a simple helper ourselves: +For C++20, you can implement simple helpers yourself: ```cpp // C++20比较合成助手 @@ -923,41 +923,65 @@ struct Task { ``` ------ -**Note**: C++23 provides more powerful comparison synthesis tools, such as `std::compare_three_way` and 
`std::compare_*_result`. Please refer to the latest standard library documentation when using them. +**Note**: C++23 offers more powerful comparison synthesis tools, such as `std::compare_three_way` and `std::compare_*_result`. Please refer to the latest standard library documentation when using them. ------ ## Common Pitfalls -### Pitfall 1: Forgetting to Explicitly Define == +### Pitfall 1: Default == Does Not Reverse Generate <=> (Generation is One-Way) -In C++20, `<=>` does not automatically generate the `==` operator; we must define it explicitly: +A widespread but outdated claim is: "Writing only `<=>` without `==` causes a compilation error." This was briefly true in early C++20 drafts, but was later fixed by **P1185 (Consistent defaulted comparisons, adopted as a C++20 Defect Report)**—the generation relationship between `<=>` and `==` is **one-way**: + +- default `<=>` → The compiler conveniently generates `==`, `!=`, `<`, `>`, `<=`, and `>=`. So writing only `<=>` is fully sufficient; `==` comes "free". +- Conversely, default `==` → Only generates `==` and `!=`, it will not reverse-generate `<=>` or any relational operators. + +The real pitfall is the latter: You think "I only care about equality, defaulting a `==` is enough," but then someone writes a `a < b` expression, and the compilation blows up—because `==` doesn't come with relational operators. ```cpp -// ❌ 错误:只有<=>,没有== -struct Bad { +#include +#include + +// ✅ 只 default <=>:== 和 < 都自动有了(旧说法里那个「编译错误」其实是错的) +struct HasSpaceship { int value; - auto operator<=>(const Bad&) const = default; - // 缺少 bool operator==(const Bad&) const = default; + auto operator<=>(const HasSpaceship&) const = default; }; -Bad b1{1}, b2{1}; -// bool eq = (b1 == b2); // 编译错误! 
- -// ✅ 正确:同时定义<=>和== -struct Good { +// ⚠️ 只 default ==:判等没问题,但拿不到 < / <=> +struct HasEquality { int value; - auto operator<=>(const Good&) const = default; - bool operator==(const Good&) const = default; + bool operator==(const HasEquality&) const = default; }; -Good g1{1}, g2{1}; -bool eq = (g1 == g2); // OK +int main() { + HasSpaceship a{1}, b{2}; + std::cout << (a == b) << (a < b) << '\n'; // OK:<=> 把 == 和 < 都生成出来了 + + HasEquality c{1}, d{2}; + std::cout << (c == d) << '\n'; // OK:显式 default 了 == + // std::cout << (c < d) << '\n'; // 编译错误:default == 不反向生成 <=> +} ``` +Tested (Arch Linux WSL, `-std=c++20`; g++ 16.1.1 and clang++ 22.1.6 behavior consistent): + +```text +$ g++ -std=c++20 gotcha.cpp -o gotcha && ./gotcha +01 +0 +$ g++ -std=c++20 -DTRY_LT gotcha.cpp +gotcha.cpp: In function 'int main()': +gotcha.cpp:23:21: error: no match for 'operator<' (operand types are 'HasEquality' and 'HasEquality') + 23 | std::cout << (c < d) << '\n'; + | ~ ^ ~ +``` + +A one-sentence mnemonic: `<=>` is "upstream", `==` is "downstream"—upstream sends all operators downstream, while downstream only minds its own business. As long as you want any kind of magnitude comparison, you need `<=>`; only defaulting `==` will never get you `<=>`. See cppreference section "[Default comparisons](https://en.cppreference.com/mwiki/index.php?title=cpp/language/default_comparisons)". 
+ ### Pitfall 2: Inconsistent Comparison Categories -When implementing manually, the returned comparison categories must be consistent: +When implementing manually, ensure the returned comparison categories are consistent: ```cpp // ❌ 错误:混合不同的比较类别 @@ -1003,9 +1027,9 @@ struct BetterCompare { }; ``` -### Pitfall 3: Comparisons in Inheritance Hierarchies +### Pitfall 3: Comparison in Inheritance Hierarchies -Using `= default` in an inheritance hierarchy requires care: +Using `= default` in inheritance hierarchies requires caution: ```cpp struct Base { @@ -1035,7 +1059,7 @@ DerivedWithNew d2{1, 2}; ### Pitfall 4: The Floating-Point NaN Problem -`NaN` (Not a Number) in floating-point numbers causes the comparison result to be `unordered`: +Floating-point `NaN` (Not a Number) causes comparison results to be `unordered`: ```cpp #include @@ -1068,7 +1092,7 @@ struct SafeFloat { ### Pitfall 5: Compiler Support Issues -The three-way comparison operator requires a relatively recent compiler: +The three-way comparison operator requires a relatively new compiler: ```cpp // 检查编译器支持 @@ -1089,7 +1113,7 @@ The three-way comparison operator requires a relatively recent compiler: #endif ``` -For projects that need to support older compilers, we can use macros for conditional compilation: +For projects needing to support older compilers, you can use macros for conditional compilation: ```cpp #if __cpp_spaceship // 或者 __cplusplus >= 202002L @@ -1133,7 +1157,7 @@ For projects that need to support older compilers, we can use macros for conditi ------ -## Related C++20 Updates +## C++20 Related Updates ### Rewriting Common Comparison Operators @@ -1197,7 +1221,7 @@ void algorithm_example() { ### Key Types for Associative Containers -The default-generated `<=>` enables a type to be used as a key in associative containers: +The default generated `<=>` allows types to be used as keys in associative containers: ```cpp #include @@ -1224,38 +1248,38 @@ keys.insert({"Network", "IP"}); ``` 
------ -**Note**: Before C++20, associative containers used `std::less` (which requires `operator<`). C++20 introduced `std::compare_three_way`, which can use `<=>` for comparisons. However, for compatibility, most implementations still use `operator<`. +**Note**: Prior to C++20, associative containers used `std::less` (requiring `operator<`). C++20 introduced `std::compare_three_way`, which can use `<=>` for comparison. However, for compatibility, most implementations still use `operator<`. ------ ## Run Online -Experience the C++20 three-way comparison operator's default generation, custom version number comparison, and partial_ordering online: +Experience C++20's three-way comparison operator default generation, custom version comparison, and partial_ordering online: -Looking back, we can see that the three-way comparison operator is an important feature introduced in C++20, drastically simplifying comparison logic for custom types: +Let's look back one more time: The three-way comparison operator is an important feature introduced in C++20 that significantly simplifies comparison logic for custom types: **Core Concepts**: | Concept | Description | -|---------|-------------| -| `<=>` operator | Three-way comparison operator; a single definition automatically generates all six comparison operators | -| Comparison categories | `strong_ordering`, `weak_ordering`, `partial_ordering` | -| `= default` | Lets the compiler automatically generate comparison logic | -| Comparison order | Defaults to lexicographical comparison in the order of member declaration | +|-----|------| +| `<=>` operator | Three-way comparison operator, defines once to auto-generate all six comparison operators | +| Comparison Categories | `strong_ordering`, `weak_ordering`, `partial_ordering` | +| `= default` | Let the compiler automatically generate comparison logic | +| Comparison Order | Default lexicographical comparison by member declaration order | -**Choosing a Comparison Category**: 
+**Comparison Category Selection**: | Category | Characteristics | Use Cases | -|----------|-----------------|-----------| +|-----|------|---------| | `strong_ordering` | Equivalence implies equality | Integers, enums, simple value types | -| `weak_ordering` | Equivalent but not equal | Case-insensitive strings, comparisons ignoring certain fields | +| `weak_ordering` | Equivalence does not imply equality | Case-insensitive strings, comparisons ignoring fields | | `partial_ordering` | Possibly incomparable | Floating-point numbers (NaN) | -The three-way comparison operator makes C++ comparison logic more concise and safe. Combined with features we've covered earlier like auto, structured bindings, and attributes, modern C++ has evolved into a powerful and expressive systems programming language. In embedded development, using these features judiciously makes code clearer and easier to maintain. +The three-way comparison operator makes C++ comparison logic more concise and safe. Combined with previously learned features like auto, structured binding, and attributes, modern C++ has evolved into a powerful and expressive system programming language. In embedded development, rational use of these features can make code clearer and easier to maintain. 
diff --git a/documents/en/vol4-advanced/index.md b/documents/en/vol4-advanced/index.md index 5871d23e7..51f071639 100644 --- a/documents/en/vol4-advanced/index.md +++ b/documents/en/vol4-advanced/index.md @@ -1,6 +1,6 @@ --- -title: 'Part IV: Advanced Topics' -description: Advanced C++20-26 Features +title: 'Volume 4: Advanced Topics' +description: C++20-26 Advanced Features platform: host tags: - cpp-modern @@ -8,20 +8,20 @@ tags: - intermediate translation: source: documents/vol4-advanced/index.md - source_hash: 8022266e678a2c17a0160a8e06e057d3ad256b1acb4ad829063ec0ee0b400fc0 - translated_at: '2026-05-26T11:38:54.974755+00:00' + source_hash: 876294454f59cf37104575dc6b93a3e3e8d5e98d545cd7c3a6b1aa1cc7fec775 + translated_at: '2026-06-13T11:50:46.746327+00:00' engine: anthropic - token_count: 229 + token_count: 247 --- # Volume 4: Advanced Topics -> Status: Partial content available (pending rewrite) +> Status: Partial content available (to be rewritten) ## Overview This volume covers advanced C++20/23/26 features. -## Existing Articles (Pending Rewrite to Generic Content) +## Existing Articles (to be rewritten to generic content) ### Template Programming (Categorized by C++ Standard) @@ -42,6 +42,7 @@ This volume covers advanced C++20/23/26 features. ### Other - Spaceship Operator + Three-Way Comparison Operator + Empty Base Optimization (EBO) C++ Modules (MSVC) diff --git a/documents/en/vol5-concurrency/ch03-atomic-memory-model/05-atomic-patterns.md b/documents/en/vol5-concurrency/ch03-atomic-memory-model/05-atomic-patterns.md index 3b36fe190..e85061c97 100644 --- a/documents/en/vol5-concurrency/ch03-atomic-memory-model/05-atomic-patterns.md +++ b/documents/en/vol5-concurrency/ch03-atomic-memory-model/05-atomic-patterns.md @@ -1,7 +1,7 @@ --- -title: Atomic Operation Patterns -description: Correct implementation of classic atomic patterns such as SeqLock, double-checked - locking, reference counting, and publish-subscribe. 
+title: Atomic Operation Patterns +description: Correct implementation of classic atomic patterns such as SeqLock, Double-Checked + Locking, reference counting, and publish-subscribe. chapter: 3 order: 5 tags: @@ -25,693 +25,396 @@ related: - 无锁编程基础 translation: source: documents/vol5-concurrency/ch03-atomic-memory-model/05-atomic-patterns.md - source_hash: a24002aa66dbcff3a2a2245c22295dbf3dad35b01877a7a1b01489a21a372f16 - translated_at: '2026-05-20T04:39:53.283588+00:00' + source_hash: 6a16ee5ae8b32d406353bc2afbd7dc091077f2bcf1b3ac8dbdc6599198b87cc4 + translated_at: '2026-06-13T11:51:22.489438+00:00' engine: anthropic - token_count: 5359 + token_count: 5394 --- # Atomic Operation Patterns -By this point, we have thoroughly broken down the `std::atomic` operation set, the six memory orders, fences and barriers, `std::atomic_ref`, and `std::atomic_wait`. But taken in isolation, these tools only answer the question of "how"—how to perform an atomic addition, how to issue a release store, or how to wait for a value to change. Real-world engineering practice demands patterns: given a specific concurrency problem, which atomic operations should we choose, and what memory order combinations should we use, to solve the problem both correctly and efficiently? -In this chapter, we focus on several of the most classic atomic operation patterns. These patterns were not invented in a vacuum—they come from battle-tested solutions in real systems like the Linux kernel, database engines, and high-performance networking frameworks.
We will break down the "why" behind each pattern: why it is designed this way, why the memory order cannot be weaker, and why a seemingly harmless change can introduce a bug. +> 📖 **Application Scenario**: The atomic patterns in this chapter have a high-frequency application in embedded systems—sharing variables between an ISR and the main loop without locks. If you are writing MCU firmware, reading this alongside [Volume 8: Interrupt-Safe Programming](../../vol8-domains/embedded/05-interrupt-safe-coding.md) will provide even greater clarity. -The patterns we cover include: SeqLock, Double-Checked Locking, reference counting, publish-subscribe flags, lock-free max/min tracking, stop flags, and spinlocks. Each pattern comes with complete code and a step-by-step semantic analysis. +By this point, we have fully decomposed the `std::atomic` operation set, the six memory orders, fences and barriers, `std::atomic_ref`, and `std::atomic_wait`. However, taking these tools in isolation only answers the question of "how"—how to perform an atomic addition, how to issue a release store, or how to wait for a value to change. Real-world engineering practice requires patterns: when facing a specific concurrency problem, which atomic operations should we choose, and how should we combine their memory orders to solve the problem correctly and efficiently? -## SeqLock: Sequence Locking Without Blocking Readers +In this chapter, we focus on several classic atomic operation patterns. These patterns were not invented in a vacuum—they come from proven solutions repeatedly verified in real-world systems like the Linux kernel, database engines, and high-performance network frameworks. We will break down the "why" for each pattern: why it is designed this way, why the memory order cannot be weaker, and why a seemingly harmless change might introduce a bug. -The patterns we cover include: SeqLock, Double-Checked Locking, reference counting, publish-subscribe flags, lock-free max/min tracking, stop flags, and spinlocks. Each pattern comes with complete code and a step-by-step semantic analysis. +The patterns we cover include: SeqLock, Double-Checked Locking, reference counting, publish-subscribe flags, lock-free min/max tracking, stop flags, and spinlocks. Each pattern is accompanied by complete code and step-by-step semantic analysis.
+ +## SeqLock: Sequence Locking Where Readers Are Never Blocked ### Pattern Motivation -A classic solution to the reader-writer problem is the reader-writer lock, but its cost is high—even when there are only read operations, it requires the full flow of `lock()` / `unlock()`, involving atomic operations or even system calls. In many scenarios, reads vastly outnumber writes (such as sensor data collection and reading, or fetching system time), and we want read operations to be as lightweight as possible—ideally, completely lock-free. +A classic solution to the reader-writer problem is the reader-writer lock, but its cost is high—even if there are only read operations, it requires the full overhead of a lock/unlock flow, involving atomic operations or even system calls. In many scenarios, the read frequency is far higher than the write frequency (e.g., sensor data collection and reading, or system time retrieval). We want read operations to be as lightweight as possible—ideally, completely lock-free. -SeqLock is designed for exactly this. Its core idea is to use a spinlock to protect the writer (only one writer at a time), but it does not block readers at all—readers check a sequence number to determine whether the data they read is consistent. If the sequence number changes during a read (indicating a writer modified the data), the reader simply retries. +SeqLock is designed for exactly this. Its core idea is: use a spinlock to protect writers (only one writer at a time), but do not block readers at all—readers determine if the data they read is consistent by checking a sequence number. If the sequence number changes during the read (indicating a writer modified the data), the reader simply retries. 
### Implementation ```cpp #include -#include -#include -class SeqLock { -public: - SeqLock() : sequence_(0) {} - - /// 写入者:获取写入权限 - void lock_write() - { - unsigned seq = sequence_.load(std::memory_order_relaxed); - // 如果序列号是奇数,说明已经有写入者在工作 - if ((seq & 1u) != 0) { - // 多写入者场景需要自旋等待或用额外的 mutex - // 这里假设只有一个写入者 - return; - } - // 序列号加 1,变成奇数——标记"正在写入" - sequence_.store(seq + 1, std::memory_order_release); - } +struct SeqLock { + std::atomic seq_{0}; // Sequence number + // ... shared data ... - /// 写入者:释放写入权限 - void unlock_write() - { - unsigned seq = sequence_.load(std::memory_order_relaxed); - // 序列号再加 1,变回偶数——标记"写入完成" - sequence_.store(seq + 1, std::memory_order_release); - } + void write(const Data& new_data) { + // 1. Increment sequence to odd (write start) + seq_.fetch_add(1, std::memory_order_acquire); - /// 读取者:在稳定状态下读取数据 - /// 返回读取开始时的序列号;调用者需要在读取后验证序列号是否变化 - unsigned read_begin() const - { - unsigned seq; - for (;;) { - seq = sequence_.load(std::memory_order_acquire); - if ((seq & 1u) == 0) { - // 偶数:没有写入者正在工作 - break; - } - // 奇数:有写入者正在工作,自旋等待 - // 实际实现中可以用 pause/yield 减少功耗 - } - return seq; - } + // 2. Modify shared data + data_ = new_data; - /// 读取者:验证读取期间是否有写入发生 - /// 如果返回 true,说明读取是有效的 - bool read_validate(unsigned seq_before) const - { - unsigned seq_after = sequence_.load(std::memory_order_acquire); - return (seq_after == seq_before) && ((seq_after & 1u) == 0); + // 3. Increment sequence to even (write complete) + seq_.fetch_add(1, std::memory_order_release); } -private: - std::atomic sequence_; + Data read() { + Data copy; + uint32_t seq0, seq1; + do { + seq0 = seq_.load(std::memory_order_acquire); + // Copy data + copy = data_; + seq1 = seq_.load(std::memory_order_acquire); + } while (seq0 != seq1 || (seq0 & 1)); // Retry if changed or odd + return copy; + } }; ``` Let's break down the core mechanism of this design. -The parity of the sequence number is key. 
An even number means "no writer is currently active, and the data is in a consistent state"; an odd number means "a writer is modifying data, and the current state may be inconsistent." The writer changes the sequence number from even to odd at the start, and back to even when finished—each successful write increments the sequence number by two. +The parity of the sequence number is key. An even number means "no writer is currently active, data is in a consistent state"; an odd number means "a writer is modifying data, state may be inconsistent." The writer changes the sequence from even to odd at the start, and back to even upon completion—each successful write increments the sequence by 2. -The reader's strategy is "pre-read check + post-read verification": first read the sequence number and confirm it is even (no writer active), then read the actual data, and finally read the sequence number again. If the sequence numbers before and after are the same and both are even, it means no writer intervened during the read, and the data is consistent. If they differ (or if it became odd), it means a write occurred during the read, and the data may be inconsistent—the reader simply discards the result and retries. +The reader's strategy is "check-before-read + verify-after-read": first read the sequence number to confirm it is even (no writer), then read the actual data, and finally read the sequence number again. If the sequence numbers are identical and even, it means no writer intervened during the read, and the data is consistent. If they differ (or became odd), it means a write occurred during the read, and the data may be inconsistent—the reader discards this result and retries. 
-The `release` in ``memory_order_release`` and the `acquire` in ``memory_order_acquire`` / ``read_validate()`` establish a happens-before relationship: all of the writer's modifications to the actual data complete before ``sequence_`` changes back to even (release guarantees prior writes are not reordered after the store); the reader sees the data only after ``sequence_`` becomes even (acquire guarantees subsequent reads are not reordered before the load). This ensures that the data read by the reader is strictly the version fully written by the writer. +The closing `fetch_add` with `memory_order_release` in `write` and the `load` with `memory_order_acquire` in `read` establish a happens-before relationship: all modifications by the writer to the actual data complete before the sequence number changes back to even (release ensures previous writes are not reordered after the store); the reader sees the data only after the sequence number becomes even (acquire ensures subsequent reads are not reordered before the load). This ensures the reader sees a version of the data that is fully written by the writer.
### Usage Example ```cpp -struct SensorData { - double temperature; - double humidity; - double pressure; -}; - -SensorData g_sensor_data; -SeqLock g_seq_lock; - -// 写入者线程(通常是传感器采集线程) -void writer_thread() -{ - for (int i = 0; i < 100; ++i) { - g_seq_lock.lock_write(); - - g_sensor_data.temperature = 20.0 + i * 0.1; - g_sensor_data.humidity = 50.0 + i * 0.2; - g_sensor_data.pressure = 1013.25 + i * 0.01; +// Reader +auto snapshot = lock.read(); // Returns a copy +process(snapshot); - g_seq_lock.unlock_write(); - } -} - -// 读取者线程(可以有多个) -void reader_thread(int id) -{ - for (int i = 0; i < 100; ++i) { - SensorData local; - unsigned seq; - - do { - seq = g_seq_lock.read_begin(); - local = g_sensor_data; // 拷贝数据 - } while (!g_seq_lock.read_validate(seq)); - - // 现在可以安全地使用 local——它是一个一致的快照 - std::cout << "Reader " << id << ": temp=" << local.temperature - << " humidity=" << local.humidity - << " pressure=" << local.pressure << "\n"; - } -} +// Writer +lock.write(new_data); ``` -Note that the reader copies the data into a ``local`` variable before verifying. This is a crucial detail—if we use the data directly without copying, it is already "dirty" when verification fails, and we can neither use it nor retry. SeqLock readers must be prepared to discard read results at any time, so the read data must either be read-only (use and discard) or copied out before use. +Note that the reader copies the data to a local variable before verifying. This is a critical detail—if we use the data directly without copying, and the verification fails, the data is already "dirty" and cannot be used or retried. SeqLock readers must be prepared to discard results at any time, so read data must either be read-only (use and discard) or copied before use. ### Applicability Boundaries of SeqLock -There are a few limitations of SeqLock we need to clearly understand. First, it assumes at most one writer—if multiple writers are needed, a mutex must be wrapped around the outside. 
Second, the data type being read must be trivially copyable—if the data contains pointers or complex objects, encountering a partially modified state during the copy could lead to undefined behavior. Third, if writes are very frequent, readers may retry repeatedly, and performance could actually be worse than a reader-writer lock—SeqLock is suited for "few writes, many reads" scenarios. The ``seqlock_t`` in the Linux kernel is a classic implementation of this pattern, used for scenarios like time fetching (``do_gettimeofday``). +There are a few limitations of SeqLock to be aware of. First, it assumes at most one writer—if you need multiple writers, you must wrap it in an outer mutex. Second, the data type read must be trivially copyable—if the data contains pointers or complex objects, encountering a partially modified state during copying could lead to undefined behavior. Third, if writes are very frequent, readers may retry repeatedly, potentially performing worse than a reader-writer lock—SeqLock is suitable for "write-rarely, read-frequently" scenarios. The `seqlock_t` in the Linux kernel is a classic implementation of this pattern, used for time retrieval (`gettimeofday`) and similar scenarios. ## Double-Checked Locking: Finally Correct Since C++11 ### Pattern Motivation and Historical Baggage -The Double-Checked Locking Pattern (DCLP) is arguably one of the most discussed patterns in multithreaded programming—not because it is the best pattern, but because it was simply impossible to implement correctly before C++11. In their 2004 paper "C++ and the Perils of Double-Checked Locking," Scott Meyers and Andrei Alexandrescu analyzed in detail why it fails under the old standard. There are two core reasons: compilers can reorder memory operations (writing an object's fields might be reordered after publishing the pointer), and the CPU itself might also reorder (relatively restricted on x86, but very aggressive on ARM/PowerPC). 
+The Double-Checked Locking Pattern (DCLP) is perhaps one of the most discussed patterns in multithreaded programming—not because it is the best pattern, but because it could not be implemented correctly prior to C++11. In their 2004 paper "C++ and the Perils of Double-Checked Locking," Scott Meyers and Andrei Alexandrescu analyzed in detail why it failed under the old standard. The core reasons were twofold: compilers could reorder memory operations (writes to an object's fields could be reordered after publishing the pointer), and the CPU itself could also reorder (relatively constrained on x86, but very aggressive on ARM/PowerPC). -The formal memory model and ``std::atomic`` introduced in C++11 finally gave DCLP a portable, correct implementation. +The formal memory model and `std::atomic` introduced in C++11 finally provided a portable, correct implementation for DCLP. ### Correct DCLP Implementation ```cpp -#include -#include -#include - class Singleton { + static std::atomic inst_; + static std::mutex mtx_; + + Singleton() = default; public: - static Singleton& instance() - { - Singleton* ptr = instance_.load(std::memory_order_acquire); - if (ptr == nullptr) { - std::lock_guard lock(mutex_); - ptr = instance_.load(std::memory_order_relaxed); - if (ptr == nullptr) { - ptr = new Singleton(); - instance_.store(ptr, std::memory_order_release); + static Singleton* get_instance() { + Singleton* ptr = inst_.load(std::memory_order_acquire); + if (ptr == nullptr) { // 1st check + std::lock_guard lock(mtx_); + if ((ptr = inst_.load(std::memory_order_relaxed)) == nullptr) { // 2nd check (reload under the lock) + ptr = new Singleton; + // Publish with release + inst_.store(ptr, std::memory_order_release); } } - return *ptr; - } - - void do_something() - { - std::cout << "Singleton::do_something()\n"; + return ptr; } - -private: - Singleton() = default; - Singleton(const Singleton&) = delete; - Singleton& operator=(const Singleton&) = delete; - - static std::atomic instance_; - static std::mutex mutex_; }; - -std::atomic
Singleton::instance_{nullptr}; -std::mutex Singleton::mutex_; ``` Let's break down the role of each check in this implementation. -The first check, ``instance_.load(acquire)``, happens outside the lock—if the instance is already created (the vast majority of calls take this path), it returns the pointer directly without needing to lock. ``memory_order_acquire`` guarantees that subsequent accesses to the ``Singleton`` object's members through this pointer will definitely see the values initialized in the constructor. This is why this load cannot use ``relaxed``—``relaxed`` does not establish a happens-before relationship, and we might see an object that has been allocated but not yet fully constructed. +The first check `ptr == nullptr` is performed outside the lock—if the instance is already created (the vast majority of calls take this path), it returns the pointer directly without locking. `memory_order_acquire` ensures that subsequent access to the `Singleton` object's members via this pointer will definitely see the values initialized in the constructor. This is why this load cannot use `memory_order_relaxed`—`relaxed` does not establish a happens-before relationship, and we might see an object for which memory has been allocated but construction has not finished. -The second check, ``instance_.load(relaxed)``, happens inside the lock—at this point we hold the mutex, so no other thread can be creating the instance simultaneously, making ``relaxed`` sufficient. If you feel uneasy about ``relaxed``, switching to ``acquire`` would not cause correctness issues; it just theoretically adds an unnecessary barrier. +The second check `ptr == nullptr` is performed inside the lock—at this point we hold the mutex, so no other thread can be creating the instance simultaneously, so `relaxed` is sufficient. If you feel `relaxed` looks unsafe, switching to `acquire` would not be a correctness issue, just theoretically adding an unnecessary barrier. 
-The ``release`` semantics in ``instance_.store(ptr, release)`` are key: it guarantees that ``new Singleton()`` (including all initialization operations in the constructor) completes before the store. Combined with the ``acquire`` load in the first check, this establishes a complete release-acquire synchronization pair: all writes from the constructor happen-before the store, the store happens-before another thread's acquire load, and the acquire load happens-before that thread's access to the Singleton's members. The chain is complete, with no gaps. +The `memory_order_release` semantics in `inst_.store` are key: it guarantees that the initialization of `*ptr` (including all initialization operations in the constructor) completes before the store. Combined with the `acquire` load in the first check, a complete release-acquire synchronization pair is established: all writes from the constructor happen-before the store, the store happens-before the acquire load of another thread, and the acquire load happens-before that thread's access to the Singleton members. The chain is complete with no gaps. -### Why Not Just Use Meyers' Singleton +### Why Not Just Use Meyers' Singleton -C++11 guarantees that the initialization of ``static`` local variables inside a function is thread-safe. So the simplest singleton pattern is actually: +C++11 guarantees that the initialization of `static` local variables within a function is thread-safe. Therefore, the simplest Singleton pattern is actually: ```cpp class Singleton { public: - static Singleton& instance() - { + static Singleton& get_instance() { static Singleton inst; return inst; } -private: - Singleton() = default; }; ``` -This code is completely correct, and compilers typically implement it internally using ``std::call_once`` or equivalent atomic operations.
So, is DCLP still useful? -First, the idea behind DCLP is not limited to singletons—any "check-lock-recheck-initialize" pattern can use this approach. Examples include lazily initializing a large object, allocating thread-local storage on demand, or lazily loading a configuration file. Second, in some extreme performance scenarios, the first check of DCLP generates lighter code than the ``static`` local variable—the latter usually requires checking a hidden ``std::once_flag``, and the implementation of this flag might be heavier than a single ``atomic load``. +First, the idea of DCLP is not limited to Singletons—any "check-lock-check-initialize" pattern can use this logic. Examples include lazy initialization of a large object, on-demand allocation of thread-local storage, or lazy loading of configuration files. Second, in some extreme performance scenarios, the first check of DCLP generates lighter code than the `static` local variable—the latter usually requires checking a hidden guard flag, and the implementation of that flag might be heavier than a single atomic load. ## Reference Counting: The Atomic Foundation of shared_ptr -### Atomic Requirements of Reference Counting +### Atomic Requirements for Reference Counting -Reference counting is another ubiquitous atomic pattern. The control block inside ``std::shared_ptr`` contains a reference count and a weak reference count, both of which are atomic variables. Let's look at a simplified reference-counted pointer to understand what atomic operations it needs: +Reference counting is another ubiquitous atomic pattern. The control block of `std::shared_ptr` contains a reference count and a weak reference count, both of which are atomic variables. 
Let's look at a simplified reference-counted pointer to understand which atomic operations it needs: ```cpp -#include -#include - template -class IntrusivePtr { -public: - IntrusivePtr() : ptr_(nullptr) {} - - explicit IntrusivePtr(T* ptr) : ptr_(ptr) - { - if (ptr_) { - ptr_->add_ref(); - } - } - - IntrusivePtr(const IntrusivePtr& other) : ptr_(other.ptr_) - { - if (ptr_) { - ptr_->add_ref(); - } - } - - IntrusivePtr(IntrusivePtr&& other) noexcept : ptr_(other.ptr_) - { - other.ptr_ = nullptr; - } - - IntrusivePtr& operator=(const IntrusivePtr& other) - { - if (this != &other) { - release(); - ptr_ = other.ptr_; - if (ptr_) { - ptr_->add_ref(); - } - } - return *this; - } - - IntrusivePtr& operator=(IntrusivePtr&& other) noexcept - { - if (this != &other) { - release(); - ptr_ = other.ptr_; - other.ptr_ = nullptr; - } - return *this; - } +class RefCountedPtr { + struct ControlBlock { + std::atomic ref_count{1}; + T* ptr; + // ... weak_count, etc. + }; + ControlBlock* ctrl; - ~IntrusivePtr() - { - release(); + void add_ref() { + // Just atomic increment, no synchronization needed + ctrl->ref_count.fetch_add(1, std::memory_order_relaxed); } - T& operator*() const { return *ptr_; } - T* operator->() const { return ptr_; } - T* get() const { return ptr_; } - -private: - void release() - { - if (ptr_ && ptr_->release_ref()) { - delete ptr_; + void release() { + // fetch_sub returns the old value + if (ctrl->ref_count.fetch_sub(1, std::memory_order_acq_rel) == 1) { + // Acquire ensures we see all writes to the object + delete ctrl->ptr; + delete ctrl; } - ptr_ = nullptr; - } - - T* ptr_; -}; - -/// 基类:提供侵入式引用计数 -class RefCounted { -public: - RefCounted() : ref_count_(1) {} - virtual ~RefCounted() = default; - - void add_ref() - { - ref_count_.fetch_add(1, std::memory_order_relaxed); } - - /// 返回 true 表示引用计数归零,应该销毁对象 - bool release_ref() - { - // acquire 保证在引用计数归零后,能看到所有之前 add_ref 的线程 - // 对对象的全部修改——确保析构时对象状态一致 - return ref_count_.fetch_sub(1, std::memory_order_acq_rel)
== 1; - } - -private: - std::atomic ref_count_; }; ``` -There are two key points regarding the atomic operations in reference counting. ``add_ref()`` uses ``memory_order_relaxed``—incrementing the reference count does not need to synchronize with other operations; we only care about the atomicity of the count itself. Even if thread A's ``add_ref`` and thread B's ``release_ref`` race, ``fetch_add`` and ``fetch_sub`` are themselves atomic and will not cause count errors. +There are two key points regarding atomic operations in reference counting. `add_ref` uses `memory_order_relaxed`—incrementing the reference count does not need to synchronize with other operations; we only care about the atomicity of the count itself. Even if thread A's `add_ref` races with thread B's `release`, the `fetch_add` and `fetch_sub` themselves are atomic and will not cause counting errors. -Using ``memory_order_acq_rel`` for ``release_ref()`` is a more nuanced choice. ``acquire`` semantics guarantee that when the reference count reaches zero, the current thread can see all modifications made to the object by other threads prior to that point (because every object access after a ``add_ref`` implicitly carries a "holding a reference" relationship). ``release`` semantics guarantee that before destructing the object, all accesses to the object by the current thread have completed. Together, these two directions ensure the safety of destruction—the destructor sees a fully consistent object state, with no other threads still accessing the object. +`release` using `memory_order_acq_rel` is a more nuanced choice. The `acquire` semantics guarantee that when the reference count reaches zero, the current thread sees all modifications to the object by other threads prior to that point (because every object access after a `add_ref` implies a "holding a reference" relationship). The `release` semantics guarantee that all accesses by the current thread to the object complete before destruction. 
Together, these two directions ensure the safety of destruction—the destructor sees a fully consistent object state, and no other thread is still accessing the object. ## Publish-Subscribe Flag: Relaxed Counter + Acquire-Release Flag ### Pattern Description -This is a highly practical combination pattern: a ``relaxed`` atomic counter for statistics (not requiring precise synchronization), plus an ``acquire-release`` atomic flag for notification. A typical scenario is a task queue—worker threads fetch and execute tasks from the queue, increment the counter after completing each task, and set the flag to notify the main thread when all are done. +This is a very practical combined pattern: a `relaxed` atomic counter for statistics (no precise synchronization needed) plus an `acquire-release` atomic flag for notification. A typical scenario is a task queue—worker threads take tasks from a queue to execute, increment a counter after completing each task, and set a flag to notify the main thread when all are done. 
```cpp -#include -#include -#include -#include - -std::atomic tasks_completed{0}; -std::atomic all_done{false}; - -void worker(int num_tasks) -{ - for (int i = 0; i < num_tasks; ++i) { - // 模拟任务处理 - std::this_thread::sleep_for(std::chrono::milliseconds(1)); - tasks_completed.fetch_add(1, std::memory_order_relaxed); - } -} - -int main() -{ - constexpr int kNumWorkers = 4; - constexpr int kTasksPerWorker = 25; - constexpr int kTotalTasks = kNumWorkers * kTasksPerWorker; - - std::vector threads; - for (int i = 0; i < kNumWorkers; ++i) { - threads.emplace_back(worker, kTasksPerWorker); - } - - // 主线程等待所有任务完成 - while (!all_done.load(std::memory_order_acquire)) { - std::cout << "Progress: " << tasks_completed.load(std::memory_order_relaxed) - << "/" << kTotalTasks << "\n"; - if (tasks_completed.load(std::memory_order_relaxed) >= kTotalTasks) { - all_done.store(true, std::memory_order_release); +struct ProgressTracker { + std::atomic completed{0}; // Relaxed + std::atomic done{false}; // Acquire/Release + + void worker_complete() { + completed.fetch_add(1, std::memory_order_relaxed); + if (is_all_done()) { + done.store(true, std::memory_order_release); } - std::this_thread::sleep_for(std::chrono::milliseconds(10)); } - for (auto& t : threads) { - t.join(); + void main_wait() { + while (!done.load(std::memory_order_acquire)) { + // spin or wait + } + // Now safe to read 'completed' + print_stats(completed.load(std::memory_order_relaxed)); } - std::cout << "All " << kTotalTasks << " tasks completed!\n"; - return 0; -} +}; ``` -The key to this pattern is the separation of concerns. ``tasks_completed`` is only used for displaying progress—it does not need precise synchronization, so ``memory_order_relaxed`` is sufficient. Even if the main thread occasionally reads a "stale" count (off by one or two), it has no impact on user experience. 
``all_done`` is the true synchronization point—it uses ``acquire-release`` to guarantee that when the main thread sees ``all_done == true``, all modifications to shared data by worker threads are already visible. +The key to this pattern is the separation of concerns. `completed` is only for displaying progress—it doesn't need precise synchronization, so `relaxed` is enough. Even if the main thread occasionally reads an "old" count (off by 1 or 2), it has no impact on user experience. `done` is the true synchronization point—it uses `acquire-release` to guarantee that when the main thread sees `done == true`, all modifications to shared data by worker threads are visible. -This combination of "relaxed statistics + strict synchronization" is very common in engineering. As another example: a network server uses a relaxed counter to record the number of processed requests (losing an update or two occasionally does not matter), and uses an acquire-release flag to notify a shutdown signal (which must guarantee all request processing is complete before shutting down). +This combination of "relaxed statistics + strict synchronization" is very common in engineering. Another example: a network server uses a relaxed counter to record processed requests (losing an occasional update is fine), and an acquire-release flag to signal a shutdown (which must guarantee all requests are processed before closing). -## Lock-Free Max/Min Tracking: CAS Loop +## Lock-Free Max/Min Tracking: CAS Loop ### Pattern Description -Maintaining a global maximum or minimum value and updating it lock-free in a multithreaded environment is a classic CAS (compare-and-swap) usage pattern. For example, a network server might want to track the slowest request latency, or a sensor system might want to record extreme temperatures. +Maintaining a global maximum or minimum value and updating it in a lock-free manner in a multithreaded environment is a classic CAS (compare-and-swap) usage pattern.
For example, a network server might want to track the slowest request latency, or a sensor system might record extreme temperatures. ```cpp -#include -#include -#include -#include -#include -#include - class MaxTracker { + std::atomic max_val_{0}; public: - explicit MaxTracker(double initial) - : max_value_(initial) - {} - - /// 如果新值大于当前最大值,更新最大值 - void update(double candidate) - { - double current = max_value_.load(std::memory_order_relaxed); + void update(uint64_t candidate) { + uint64_t current = max_val_.load(std::memory_order_relaxed); while (candidate > current) { - if (max_value_.compare_exchange_weak( - current, candidate, - std::memory_order_relaxed, - std::memory_order_relaxed)) { - break; // CAS 成功,更新完成 + // Try to update if current hasn't changed + if (max_val_.compare_exchange_weak(current, candidate, + std::memory_order_relaxed)) { + break; // Success } - // CAS 失败,current 被自动更新为当前值,继续循环 + // Failure: current updated by CAS, loop continues } } - - double get() const - { - return max_value_.load(std::memory_order_relaxed); - } - -private: - std::atomic max_value_; }; - -int main() -{ - MaxTracker tracker(0.0); - constexpr int kNumThreads = 4; - constexpr int kUpdatesPerThread = 100000; - - auto worker = [&](int seed) { - std::mt19937 rng(seed); - std::uniform_real_distribution dist(0.0, 100.0); - for (int i = 0; i < kUpdatesPerThread; ++i) { - tracker.update(dist(rng)); - } - }; - - std::vector threads; - for (int i = 0; i < kNumThreads; ++i) { - threads.emplace_back(worker, i + 42); - } - - for (auto& t : threads) { - t.join(); - } - - std::cout << "Max value tracked: " << tracker.get() << "\n"; - return 0; -} ``` -The CAS loop is the core of this pattern. We first load the current maximum value; if the candidate value is not greater than the current value, we do nothing and return directly. If the candidate value is larger, we try to use CAS to replace the current value with the candidate. 
CAS might fail—because another thread might have already updated the maximum between our load and CAS. On failure, ``compare_exchange_weak`` updates ``current`` to the latest value, and we recompare to decide whether to try again. +The CAS loop is the core of this pattern. We first load the current maximum. If the candidate value is not greater than the current value, we do nothing and return. If the candidate is larger, we attempt to replace the current value with the candidate using CAS. CAS might fail—because another thread may have updated the maximum between our load and CAS. Upon failure, `compare_exchange_weak` updates `current` to the latest value, and we re-compare to decide if we need to try again. -Using ``compare_exchange_weak`` instead of ``strong`` here is a common optimization—in a loop, the occasional spurious failure of the ``weak`` version just means one extra iteration, but it is more efficient than ``strong`` on certain platforms (especially LL/SC architectures like ARM and PowerPC). +Using `compare_exchange_weak` instead of `compare_exchange_strong` here is a common optimization—in a loop, the occasional spurious failure of the `weak` version just means one extra iteration, but it is more efficient on certain platforms (especially ARM, PowerPC, and other LL/SC architectures) than the `strong` version. -All memory orders use ``relaxed``—because we only care about the correctness of the single variable (the maximum value) itself, and do not need to establish synchronization relationships with other variables. This holds if the max tracking is only used for statistics or monitoring and does not require strict happens-before guarantees. +All memory orders use `relaxed`—because we only care about the correctness of the single variable (the maximum value) itself, and do not need to establish synchronization relationships with other variables. If max tracking is purely for statistics or monitoring, strict happens-before guarantees are not needed. 
-However, note that CAS operations on ``std::atomic`` are not lock-free on most platforms—because ``double`` is 64-bit, while CAS on some 32-bit platforms can only handle 32 bits. If your target is a 32-bit embedded platform, this pattern might not be as efficient as expected. On 64-bit platforms, 64-bit CAS is usually lock-free. +However, note that the CAS operation for `uint64_t` is not guaranteed to be lock-free on every platform—because `uint64_t` is 64-bit, and CAS on some 32-bit platforms can only handle 32 bits. If your target is a 32-bit embedded platform, this pattern might not be as efficient as expected. On 64-bit platforms, 64-bit CAS is usually lock-free. -## Stop Flag: The Correct Usage of atomic +## Stop Flag: Correct Usage of atomic ### Basic Pattern -The stop flag is perhaps the simplest atomic pattern—a background thread periodically checks the flag, and the main thread sets the flag and waits for the thread to exit. It looks simple, but there are details worth discussing: +The stop flag is likely the simplest atomic pattern—a background thread periodically checks the flag, and the main thread sets the flag and waits for the thread to exit. It looks simple, but there are details worth discussing: ```cpp -#include -#include -#include -#include - -std::atomic should_stop{false}; - -void background_task() -{ - int count = 0; - while (!should_stop.load(std::memory_order_acquire)) { - // 做一些工作 - ++count; - std::this_thread::sleep_for(std::chrono::milliseconds(100)); +std::atomic stop_{false}; + +void worker() { + while (!stop_.load(std::memory_order_acquire)) { + // Do work } - std::cout << "Task stopped after " << count << " iterations\n"; } -int main() -{ - std::thread t(background_task); - - std::this_thread::sleep_for(std::chrono::seconds(2)); - should_stop.store(true, std::memory_order_release); - t.join(); - std::cout << "Main: thread joined\n"; - return 0; +void shutdown() { + // Update shared data...
+ stop_.store(true, std::memory_order_release); } ``` -Using ``memory_order_acquire`` and ``memory_order_release`` instead of ``relaxed`` here has reasons worth explaining. If the background thread reads some shared data after checking the stop flag (for example, reading the latest configuration after ``sleep_for``), then ``acquire`` guarantees it can see all modifications to the shared data made by the thread setting the flag prior to that point. Similarly, ``release`` guarantees that all writes by the main thread before setting the flag (such as updating the configuration) are visible to the background thread. +Using `acquire` and `release` here instead of `relaxed` is worth explaining. If the background thread reads some shared data after checking the stop flag (e.g., reading the latest config after the loop), `acquire` ensures it sees all modifications to the shared data made by the thread setting the flag prior to that point. Similarly, `release` ensures that all writes by the main thread before setting the flag (like updating config) are visible to the background thread. -If your stop flag is purely a boolean signal—the background thread does not need to read any other shared data—then ``relaxed`` is also safe. But there is no harm in making a habit of using ``acquire/release``, and the performance difference is negligible (on x86, a load is a normal read regardless of memory order; on ARM, an acquire load is just a single ``ldar`` instruction). +If your stop flag is purely a boolean signal—the background thread doesn't need to read any other shared data—then `relaxed` is also safe. But forming the habit of using `acquire-release` does no harm, and the performance difference is negligible (on x86, loads are normal reads regardless of memory order; on ARM, an acquire load is just one extra instruction). 
-### Implementing Low-Latency Stopping with atomic_wait +### Low-Latency Stopping with atomic_wait -In the previous chapter, we introduced ``std::atomic::wait/notify``. Here we can upgrade the stop flag to a "wait-based stop"—the background thread blocks and waits on the flag instead of polling it: +In the previous chapter, we introduced `atomic_wait`. Here, we can upgrade the stop flag to a "wait-style stop"—the background thread blocks waiting on the flag instead of polling it: ```cpp -#include -#include -#include -#include - -std::atomic should_stop{false}; +void worker() { + while (!stop_.load(std::memory_order_acquire)) { + // Do periodic work + if (need_stop) break; -void waiting_task() -{ - int count = 0; - while (!should_stop.load(std::memory_order_acquire)) { - ++count; - std::cout << "Working... iteration " << count << "\n"; - - // 等待 100ms 或被 notify 唤醒 - should_stop.wait(false, std::memory_order_acquire); + // Wait for signal or timeout + stop_.wait(false); } - std::cout << "Task stopped after " << count << " iterations\n"; } -int main() -{ - std::thread t(waiting_task); - - std::this_thread::sleep_for(std::chrono::seconds(2)); - should_stop.store(true, std::memory_order_release); - should_stop.notify_one(); - - t.join(); - std::cout << "Main: thread joined\n"; - return 0; +void shutdown() { + stop_.store(true, std::memory_order_release); + stop_.notify_one(); } ``` -In this version, ``wait(false)`` blocks while ``should_stop`` is still ``false``, consuming no CPU at all. When the main thread executes ``store(true) + notify_one()``, the background thread is woken up immediately and exits. But there is a problem: ``wait`` has no timeout—if the background thread needs to do some work periodically between ``wait`` calls (such as checking a sensor every 100 ms), a pure ``wait`` is not appropriate. 
In this case, a hybrid approach combining ``sleep_for`` + ``notify`` is more practical: use ``sleep_for`` for periodic work most of the time, and use ``notify`` to wake the thread when immediate stopping is needed. +In this version, `stop_.wait` blocks while `stop_` is still `false`, consuming no CPU. When the main thread calls `store(true)` followed by `notify_one()`, the background thread wakes immediately and exits. However, there is an issue: `wait` has no timeout—if the background thread needs to do work periodically between two `wait` calls (e.g., checking a sensor every 100ms), pure `wait` is not suitable. In this case, a hybrid solution combining `sleep_for` + `notify` is more practical: use `sleep_for` for periodic work most of the time, and use `notify` to wake the thread when immediate stopping is needed. ## Spinlock: Educational Implementation and Applicable Scenarios ### Basic Implementation -The spinlock is the simplest mutual exclusion primitive—a thread that fails to acquire the lock does not block, but repeatedly tries in a tight loop. It is usually not suitable for production environments (we will explain why later), but it is an excellent teaching tool—because it demonstrates the usage of ``atomic_flag`` and the basic principles of lock-free synchronization with the minimum amount of code. +A spinlock is the simplest mutual exclusion primitive—a thread that fails to acquire it doesn't block, but retries in a tight loop. It is generally not suitable for production environments (explained later), but it serves as an excellent educational tool because it demonstrates the usage of `std::atomic` and the basic principles of lock-free synchronization with minimal code.
```cpp -#include -#include -#include - class SpinLock { + std::atomic locked_{false}; public: - SpinLock() : locked_(false) {} - - void lock() - { + void lock() { + // Keep trying until we successfully swap false to true while (locked_.exchange(true, std::memory_order_acquire)) { - // exchange 返回旧值:如果是 true,说明锁已经被占用,继续自旋 - // 如果是 false,说明我们成功获取了锁 + // Spin } } - void unlock() - { + void unlock() { locked_.store(false, std::memory_order_release); } - -private: - std::atomic locked_; }; - -int main() -{ - SpinLock spinlock; - int counter = 0; - - auto work = [&](int times) { - for (int i = 0; i < times; ++i) { - spinlock.lock(); - ++counter; - spinlock.unlock(); - } - }; - - std::thread t1(work, 1000000); - std::thread t2(work, 1000000); - - t1.join(); - t2.join(); - - std::cout << "counter = " << counter << "\n"; // 2000000 - return 0; -} ``` -The ``exchange(true, acquire)`` in ``lock()`` is a clever operation: it atomically sets ``locked_`` to ``true`` while returning the previous value. If the old value is ``false``, it means the lock was not previously held, and we successfully acquired it. If the old value is ``true``, it means the lock is already held by someone else, and we continue looping. ``acquire`` semantics guarantee that operations after acquiring the lock are not reordered before ``exchange``—modifications made by other threads before releasing the lock are visible to the current thread. +The `exchange` in `lock` is a clever operation: it atomically sets `locked_` to `true` while returning the previous value. If the old value is `false`, the lock was free and we successfully acquired it. If the old value is `true`, the lock is already held by someone else, so we continue looping. The `acquire` semantics guarantee that operations after acquiring the lock are not reordered before the `exchange`—modifications by other threads before releasing the lock are visible to the current thread. 
-The ``release`` semantics in ``unlock()`` guarantee that all writes inside the critical section complete before releasing the lock—the next thread to acquire the lock will see these modifications. +The `release` semantics in `unlock` guarantee that all writes in the critical section complete before releasing the lock—the next thread to acquire the lock will see these modifications. -### Why Spinlocks Are Usually Unsuitable for Production +### Why Spinlocks Are Usually Not Suitable for Production -The biggest problem with spinlocks is that they consume CPU while waiting. If the critical section is very short (a few instructions), the overhead of spin-waiting might be lower than the context switch overhead of a mutex. But if the critical section is slightly longer, or if multiple threads are competing for the same lock, spinlocks cause CPU time to be heavily wasted on "spinning." Even worse, on single-core systems, spinlocks are completely pointless—the thread occupies the CPU while spinning, and the thread holding the lock has no chance to run and release it, resulting in a dead lock. +The biggest problem with spinlocks is that they consume CPU while waiting. If the critical section is very short (a few instructions), the overhead of spinning might be lower than the context switch overhead of a mutex. But if the critical section is slightly longer, or if multiple threads are competing for the same lock, spinlocks lead to a massive waste of CPU time on "spinning." Worse, on single-core systems, spinlocks are completely meaningless—the thread holds the CPU while spinning, so the thread holding the lock never gets a chance to run to release it, resulting in deadlock. -In real projects, prefer using ``std::mutex`` or ``std::shared_mutex``. Only consider a spinlock when all of the following conditions are met simultaneously: the critical section is extremely short (no more than a few dozen instructions), contention is low, and it runs on a multi-core system. 
The Linux kernel makes extensive use of spinlocks in preemptible kernels—but the kernel has special scheduling guarantees (disabling preemption), which user space does not have. +In actual projects, prioritize `std::mutex` or `std::shared_mutex`. Only consider a spinlock when all the following conditions are met: the critical section is extremely short (no more than a few dozen instructions), contention is low, and it runs on a multi-core system. The Linux kernel uses spinlocks extensively in preemptible kernels—but the kernel has special scheduling guarantees (preemption is disabled), which user-space lacks. -### A Better Version Using atomic_flag +### Better Version Using atomic_flag -The ``SpinLock`` above is implemented using ``std::atomic``, but a more canonical approach is to use ``std::atomic_flag``—it is the only atomic type guaranteed by the standard to be lock-free (``std::atomic`` might theoretically not be lock-free): +The `SpinLock` above uses `std::atomic`, but a more canonical approach uses `std::atomic_flag`—it is the only atomic type guaranteed by the standard to be lock-free (`std::atomic` is theoretically not required to be lock-free): ```cpp -class SpinLockFlag { +class SpinLock { + std::atomic_flag locked_ = ATOMIC_FLAG_INIT; public: - SpinLockFlag() { flag_.clear(); } - - void lock() - { - while (flag_.test_and_set(std::memory_order_acquire)) { - // test_and_set 原子地设置 flag 为 true 并返回旧值 + void lock() { + while (locked_.test_and_set(std::memory_order_acquire)) { + // Spin } } - void unlock() - { - flag_.clear(std::memory_order_release); + void unlock() { + locked_.clear(std::memory_order_release); } - -private: - std::atomic_flag flag_ = ATOMIC_FLAG_INIT; }; ``` -``test_and_set`` and ``clear`` are the two core operations of ``atomic_flag``—the former atomically sets the flag to ``true`` and returns the old value, while the latter atomically sets the flag to ``false``. 
This version is semantically completely equivalent to the ``atomic`` version, but guarantees lock-free behavior. +`test_and_set` and `clear` are the two core operations of `std::atomic_flag`—the former atomically sets the flag to `true` and returns the old value, the latter atomically sets the flag to `false`. This version is semantically equivalent to the `std::atomic` version but guarantees lock-free behavior. ## Decision Guide for Pattern Selection -Having learned about so many patterns, how do we choose when actually coding? We can make decisions based on the characteristics of the critical section. +With so many patterns understood, how do we choose when coding? We can decide based on the characteristics of the critical section. -If the critical section is just a simple variable read or update—such as a counter, a flag, or a maximum value—directly using ``std::atomic`` RMW operations (``fetch_add``, CAS, etc.) is sufficient. No mutex or spinlock is needed. This is the lightest choice and offers the best performance. The choice of memory order depends on whether synchronization with other variables is needed: if not, ``relaxed`` is fine; if yes, use ``acquire/release``. +If the critical section is just a simple variable read or update—like a counter, a flag, or a maximum value—direct RMW operations on `std::atomic` (`fetch_add`, CAS, etc.) are sufficient. No mutex, no spinlock. This is the lightest choice with the best performance. The choice of memory order depends on whether synchronization with other variables is needed: if not, `relaxed` is fine; if yes, use `acquire-release`. -If the critical section involves coordinated modifications to multiple variables—such as inserting an element into a map while updating a counter—then ``std::atomic`` is no longer enough (unless you can pack multiple variables into a single struct updated via CAS), and you should honestly use a ``std::mutex``. 
Although a mutex has context switch overhead, it guarantees correctness, and the overhead is very low when contention is low (Linux's ``futex`` completes entirely in user space when uncontended). +If the critical section involves coordinated modification of multiple variables—like inserting an element into a map while updating a counter—`std::atomic` is not enough (unless you can pack multiple variables into a struct updated via CAS), so honestly use a `std::mutex`. Although a mutex has context switch overhead, it guarantees correctness, and overhead is low when contention is low (Linux's `std::mutex` is entirely in user-space when uncontested). -If reads vastly outnumber writes, and the data is trivially copyable—SeqLock is a good choice. It keeps readers completely lock-free, at the cost of only occasional retries. The Linux kernel uses it in many high-frequency read scenarios. +If the read frequency is far higher than the write frequency, and the data is trivially copyable—SeqLock is a good choice. It keeps readers completely lock-free, at the cost of occasional retries. The Linux kernel uses this in many high-frequency read scenarios. -If you need lazy initialization or a "check-lock-recheck" pattern—DCLP has been correct since C++11. But if it is just a singleton, prefer Meyers' Singleton (``static`` local variable), as it is simpler and less error-prone. +If you need lazy initialization or a "check-lock-check" pattern—DCLP has been correct since C++11. But if it's just a Singleton, prioritize Meyers' Singleton (`static` local variable); it is simpler and less error-prone. -If you need to wait for a condition to be met—use ``std::atomic::wait/notify`` instead of busy-waiting or `condition_variable`. It uses futex on Linux, its latency is an order of magnitude lower than `condition_variable`, and it does not require an additional mutex. +If you need to wait for a condition to be met—use `atomic_wait` instead of busy-waiting or `condition_variable`. 
On Linux, it uses futex, with latency an order of magnitude lower than `condition_variable`, and no extra mutex is needed. ## Summary -In this chapter, we applied all the tools learned in ch03—the ``std::atomic`` operation set, memory orders, fences, ``wait/notify``, and ``atomic_ref``—comprehensively across seven classic concurrency patterns. +In this chapter, we applied all the tools learned in ch03—the `std::atomic` operation set, memory orders, fences, `wait`/`notify`, and `atomic_ref`—to seven classic concurrency patterns. -SeqLock uses the parity of a sequence number to allow readers to lock-free detect write interference, suited for "many reads, few writes, trivially copyable data" scenarios. Double-Checked Locking finally has a correct, portable implementation under the C++11 memory model—the core is the ``acquire`` load and ``release`` store of ``std::atomic``. The reference counting pattern demonstrates the combination of ``fetch_add`` with ``relaxed`` and ``fetch_sub`` with ``acq_rel``—the former only cares about atomicity, while the latter also guarantees visibility at destruction time. The publish-subscribe flag separates relaxed count statistics from strict synchronization notifications—each gets what it needs without dragging the other down. Lock-free max/min tracking uses a CAS loop to implement a lock-free "compare and update." The stop flag is the simplest atomic pattern, but combined with ``wait/notify``, it can also achieve low-latency stop signals. The spinlock is a classic teaching example, but should be used cautiously in production environments. +SeqLock allows readers to detect writer interference lock-free via sequence parity, suitable for "read-many-write-few, trivially copyable data" scenarios. Double-Checked Locking finally has a correct, portable implementation under the C++11 memory model—the core is the `acquire` load and `release` store.
The reference counting pattern demonstrates the combination of `relaxed` for increment and `acq_rel` for decrement—the former cares only about atomicity, the latter ensures visibility at destruction. The publish-subscribe flag separates relaxed count statistics from strict synchronization notifications—each gets what it needs without dragging the other down. Lock-free min/max tracking uses a CAS loop to implement lock-free "compare-and-update." The stop flag is the simplest atomic pattern, but combined with `atomic_wait`, it can also achieve low-latency stop signals. The spinlock is a classic teaching tool but should be used cautiously in production. -These patterns are not isolated—they are often used in combination. A SeqLock might use a spinlock internally to protect writers; a DCLP uses an acquire-release synchronization pair internally; the destruction of a reference-counted pointer might trigger a publish-subscribe notification. Understanding the core idea of each pattern and flexibly combining them in specific scenarios is the real goal. +These patterns are not isolated—they are often combined. A SeqLock might use a spinlock internally to protect writers; a DCLP uses an acquire-release synchronization pair internally; the destruction of a reference-counted pointer might trigger a publish-subscribe notification. Understanding the core idea of each pattern and flexibly combining them in specific scenarios is the real goal. -In the next chapter, we leave the atomic world of ch03 and move on to a new topic. But before that, we recommend completing the exercises in this chapter—especially the implementations of SeqLock and DCLP, as they are high-frequency interview topics and the litmus test for whether you truly understand memory orders. +The next chapter leaves the atomic world of ch03 and enters a new topic. 
But before that, I suggest doing the exercises in this chapter—especially the implementations of SeqLock and DCLP, as they are high-frequency topics in interviews and the touchstone for testing whether you truly understand memory ordering. ## Exercises -### Exercise 1: Implement a SeqLock +### Exercise 1: Implement SeqLock -Based on the ``SeqLock`` class above, write a complete program: one writer thread updates a struct containing three ``double`` fields at 10 ms intervals, and four reader threads each read and print the data at 1 ms intervals. After running for a while, observe whether the readers always obtain consistent data (the values of all three fields come from the same write). If the data is inconsistent (for example, the temperature is from the 5th write but the humidity is from the 6th), check whether your ``read_begin`` / ``read_validate`` are used correctly. +Based on the `SeqLock` class above, write a complete program: one writer thread updates a struct containing three `double` fields at 10ms intervals, and four reader threads read and print the data at 1ms intervals. Run for a while and observe if readers always obtain consistent data (values for all three fields come from the same write). If data is inconsistent (e.g., temperature is from the 5th write, but humidity is from the 6th), check if your `read_begin` / `read_validate` usage is correct. -### Exercise 2: Implement a DCLP Singleton +### Exercise 2: Implement DCLP Singleton -Use the DCLP pattern to implement a thread-safe configuration manager. Requirements: +Implement a thread-safe configuration manager using the DCLP pattern. Requirements: -1. Use the classic DCLP structure of ``std::atomic`` + ``std::mutex`` -2. Correctly use ``memory_order_acquire`` and ``memory_order_release`` in ``instance()`` -3. Write a multithreaded test: 8 threads simultaneously call ``ConfigManager::instance()``, verifying that all threads receive the same instance +1.
Use the classic DCLP structure of `std::atomic` + `std::mutex` +2. Correctly use `memory_order_acquire` and `memory_order_release` in `get_instance` +3. Write a multi-threaded test: 8 threads call `get_instance` simultaneously, verifying that all threads get the same instance -Extra challenge: compare the performance difference between your DCLP implementation and a Meyers' Singleton (``static`` local variable) implementation. Use ``std::chrono`` to measure the time taken by both implementations under 1 million ``instance()`` calls. +**Extra Challenge**: Compare the performance of your DCLP implementation with Meyers' Singleton (`static` local variable). Use `std::chrono` to measure the time taken for 1 million `get_instance` calls under both implementations. ### Exercise 3: Lock-Free Minimum Tracker -Implement a ``MinTracker`` class that uses a CAS loop to track a minimum value of ``double`` type. Then use 4 threads to each generate random numbers and call ``update()``, and finally verify that ``get()`` indeed returns the minimum value among all numbers generated by the threads. +Implement a `MinTracker` class that uses a CAS loop to track a minimum value of type `double`. Then, have 4 threads generate random numbers and call `update`, finally verifying that the value returned by `get_min` is indeed the minimum of all numbers generated by the threads. -Hint: you need to be aware of whether atomic operations on floating-point numbers are lock-free on your current platform. Check with ``std::atomic::is_lock_free()``. If it is not lock-free, performance might not be as expected. +**Hint**: You need to check if atomic operations for floating-point numbers are lock-free on your current platform. Use `std::atomic::is_always_lock_free`. If not lock-free, performance may not be as expected. 
-> 💡 Complete example code is available at [Tutorial_AwesomeModernCPP](https://github.com/Awesome-Embedded-Learning-Studio/Tutorial_AwesomeModernCPP), visit ``code/volumn_codes/vol5/ch03-atomic-memory-model/``. +> 💡 Complete example code is available at [Tutorial_AwesomeModernCPP](https://github.com/Awesome-Embedded-Learning-Studio/Tutorial_AwesomeModernCPP), visit `code/volumn_codes/vol5/ch03-atomic-memory-model/`. ## References diff --git a/documents/en/vol5-concurrency/ch06-async-io-coroutine/01-async-programming-evolution.md b/documents/en/vol5-concurrency/ch06-async-io-coroutine/01-async-programming-evolution.md index 9171fa29d..c9162d037 100644 --- a/documents/en/vol5-concurrency/ch06-async-io-coroutine/01-async-programming-evolution.md +++ b/documents/en/vol5-concurrency/ch06-async-io-coroutine/01-async-programming-evolution.md @@ -1,8 +1,8 @@ --- -title: 'Async Programming Evolution: From Callback Hell to Coroutines' -description: Tracing the evolution of the asynchronous programming paradigm—callbacks, - future chains, and coroutines—and understanding the motivation, pain points, and - implementation forms of each model in C++. +title: 'Asynchronous Programming Evolution: From Callback Hell to Coroutines' +description: Tracing the evolution of asynchronous programming paradigms—callbacks, + future chains, and coroutines—to understand the motivation, pain points, and implementation + forms of each model in C++.
chapter: 6 order: 1 tags: @@ -27,219 +27,123 @@ related: - 异步 I/O 与事件循环 translation: source: documents/vol5-concurrency/ch06-async-io-coroutine/01-async-programming-evolution.md - source_hash: d0ffebacd5f4e338cd302c5f19546db80e39dbd6e5e1192ad00342c148a035c6 - translated_at: '2026-05-20T04:44:37.332058+00:00' + source_hash: 69bdb786dac2ba9a89659ceb3dbc19b6a9c686db20dd2b1800f19ad72d3bf599 + translated_at: '2026-06-13T11:51:48.247704+00:00' engine: anthropic - token_count: 3709 + token_count: 3751 --- -# Async Programming Evolution: From Callback Hell to Coroutines +# Evolution of Asynchronous Programming: From Callback Hell to Coroutines -Honestly, reaching this point in the series brings a sense of reflection. In previous chapters, we have been working closely with threads, locks, and atomic operations. These tools give us precise control—but the cost is that you must manage everything yourself. Thread creation and destruction, synchronization mechanism design, moving results back to the main thread, propagating exceptions—every concurrent task repeats this entire workflow. In ch05, we used `std::async` and `std::future` to simplify some of this work, but you will quickly discover a problem: when you need to chain multiple async operations—read a file, parse the data, then write back the result—managing future chains becomes incredibly clumsy. +> 📖 **Prerequisites**: This article uses C++20 coroutines. If you haven't yet encountered the underlying mechanisms of `co_await`, `co_yield`, and `co_return`, you might want to review [Volume 4 · Coroutine Basics](../../vol4-advanced/01-coroutine-basics.md) first—it breaks down how the "skeleton" of a coroutine is constructed from scratch. -This is the core problem async programming aims to solve: **how to elegantly organize and compose multiple async operations**. This is not a problem unique to C++; almost every language goes through the same evolution—from callbacks to future/promise chains, and finally to coroutines. 
In this chapter, we will trace this evolution from start to finish, examining the motivation behind each model, what problems it solves, what new problems it introduces, and ultimately why C++20 coroutines are widely considered "the right way to do async programming." +To be honest, writing this piece brings up some mixed feelings. In previous chapters, we dealt extensively with threads, locks, and atomic operations. These tools give us precise control—but the cost is that you have to manage everything yourself. Thread creation and destruction, synchronization mechanism design, moving results from worker threads back to the main thread, and exception propagation—every time you write a concurrent task, you repeat this entire process. In Chapter 5, we used `std::async` and `std::future` to simplify some of this work, but you quickly discover a limitation: when you need to chain multiple asynchronous operations—read a file, parse data, write back results—managing `future` chains becomes very clumsy. + +This is the core problem that asynchronous programming aims to solve: **how to elegantly organize and compose multiple asynchronous operations**. This problem isn't unique to C++; almost every language has undergone the same evolution—from callbacks to future/promise chains, and finally to coroutines. In this article, we will trace this evolution from start to finish, examining the motivation behind each model, the problems they solve, the new issues they introduce, and finally, why C++20 coroutines are widely considered "the right way to do asynchronous programming." ## Environment -Before we dive in, let us clarify the setup. All code in this chapter uses the pure standard library with no platform dependencies, so it runs on Linux, macOS, and Windows. 
On the compiler side, the callback and future sections only require C++11, but the coroutine examples need C++20 support—you will need GCC 12+, Clang 15+, or MSVC 19.34+, with the `-std=c++20 -Wall -Wextra` compiler flag added. Honestly, compiler support for C++20 coroutines has been quite mature since 2024, and the versions mentioned above can correctly compile the full set of coroutine language features. However, note one thing: the standard library's `` was only introduced in C++23 and is not yet fully supported across all implementations. Therefore, in this chapter's code, we use a hand-written generator type and do not rely on standard library headers. +Before we get our hands dirty, let's clarify the environment. All code in this article uses the pure standard library with no platform dependencies, so it runs on Linux, macOS, and Windows. Regarding compilers, the callback and `future` sections only require C++11, but the coroutine examples need C++20 support—you will need GCC 12+, Clang 15+, or MSVC 19.34+. Simply add the `-std=c++20` compiler flag. To be honest, compiler support for C++20 coroutines has been quite mature since 2024, and the versions mentioned above can correctly compile the full set of coroutine language features. However, note that `std::generator` was introduced in C++23, and not all implementations fully support it yet. Therefore, the code in this article uses a hand-written `generator` type and does not rely on standard library headers. ## A Scenario: 1000 Concurrent Connections -Let us start with a concrete scenario. Suppose you are writing a network server that needs to handle 1000 client connections simultaneously. The lifecycle of each connection roughly follows: accept connection → read request → process request → send response → close connection. Throughout this process, reading and writing are I/O operations, and I/O operations are slow—a single network read might take a few milliseconds or even hundreds of milliseconds. 
+Let's start with a concrete scenario. Suppose you are writing a network server that needs to handle 1000 client connections simultaneously. The lifecycle of each connection is roughly: accept connection → read request → process request → send response → close connection. Throughout this process, reading and writing are I/O operations, and I/O is slow—a single network read might take a few milliseconds or even hundreds of milliseconds. -The most intuitive approach is "one thread per connection": whenever a new connection comes in, we spawn a new thread dedicated to handling it. This approach is simple to write, but the problems are obvious—1000 connections mean 1000 threads. Each thread has its own stack (8MB by default on Linux), so just the stack space alone would consume nearly 8GB of memory. Furthermore, the operating system's overhead for scheduling 1000 threads is not trivial—context switches, cache invalidation, and lock contention all consume significant CPU time. More critically, these 1000 threads spend most of their time not computing, but waiting for I/O—waiting for data to arrive on the network card, waiting for the TCP buffer to free up space. While a thread is waiting for I/O, the memory and scheduling resources it occupies are entirely wasted. +The most intuitive approach is "one connection, one thread": whenever a new connection arrives, we spawn a new thread dedicated to handling it. This scheme is simple to write, but the problems are obvious—1000 connections mean 1000 threads. Each thread has its own stack (8MB by default on Linux), so just the stack space would consume nearly 8GB of RAM. Furthermore, the overhead of the OS scheduling 1000 threads is not negligible—context switches, cache invalidation, and lock contention all consume significant CPU time. More critically, these 1000 threads spend most of their time not computing, but waiting for I/O—waiting for data to arrive on the network card or for the TCP buffer to free up space. 
While a thread waits for I/O, the memory and scheduling resources it occupies are completely wasted. -This is the fundamental problem with synchronous blocking I/O: **threads waste resources while waiting for I/O, and you cannot repurpose those resources to do other work**. +This is the fundamental problem with synchronous blocking I/O: **threads occupy resources while waiting for I/O, and you cannot use those resources to do anything else**. -The core idea behind async programming is: do not let threads sit idle. When encountering an I/O operation, go do something else first, and come back to continue processing once the I/O completes. But "go do something else and come back later" is easy to say—how do we organize this at the code level? This is the question that the three models we are about to explore—callbacks, future chains, and coroutines—each answer in their own way. +The core idea of asynchronous programming is: don't let the thread wait stupidly. When you encounter an I/O operation, go do something else first, and come back to continue processing when the I/O is complete. But "go do something else and come back later" is easy to say, but how do we organize this at the code level? This is the question that the next three models—callbacks, future chains, and coroutines—each attempt to answer. -## The Callback Model: The Most Primitive Form of Async +## Callback Model: The Most Primitive Asynchrony -Let us start with the most intuitive approach—the callback model. The idea is straightforward: when you initiate an async operation, you also pass in a function (the callback), telling the system "call this function for me when the operation is done." +We start with the most intuitive approach—the callback model. The idea is straightforward: when you initiate an asynchronous operation, you also pass in a function (a callback), telling the system "call this function when the operation is complete." -Let us first get a feel for this with a simplified example. 
Suppose we want to implement the following flow: "asynchronously read file contents, then asynchronously process the data, and finally asynchronously write back the result." To avoid introducing a real async I/O library, we use `std::thread` to simulate async operations: +Let's use a simplified example to get a feel for it. Suppose we want to implement the flow: "asynchronously read file content, then asynchronously process the data, and finally asynchronously write the result back." To avoid introducing a real asynchronous I/O library, we use `std::thread` to simulate asynchronous operations: ```cpp -#include -#include -#include -#include - -// 模拟异步读取文件内容 -void async_read_file(const std::string& path, - std::function on_complete) -{ - std::thread([path, on_complete] { - // 模拟 I/O 延迟 - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - std::string content = "file content from " + path; - on_complete(content); - }).detach(); -} - -// 模拟异步处理数据 -void async_process(const std::string& input, - std::function on_complete) -{ - std::thread([input, on_complete] { - std::this_thread::sleep_for(std::chrono::milliseconds(50)); - std::string result = "processed(" + input + ")"; - on_complete(result); - }).detach(); -} - -// 模拟异步写回结果 -void async_write(const std::string& data, - std::function on_complete) -{ - std::thread([data, on_complete] { - std::this_thread::sleep_for(std::chrono::milliseconds(80)); - std::cout << " [write] 写入: " << data << "\n"; - on_complete(true); +void process_file_callback() { + std::thread([] { + // Step 1: Async read + std::string data = read_file(); + std::thread([data] { + // Step 2: Async process + std::string result = process(data); + std::thread([result] { + // Step 3: Async write + write_file(result); + }).detach(); + }).detach(); }).detach(); } - -int main() -{ - std::cout << "开始异步处理流程...\n"; - - async_read_file("data.txt", [](std::string content) { - std::cout << " [read] 读到: " << content << "\n"; - - async_process(content, 
[](std::string processed) { - std::cout << " [process] 结果: " << processed << "\n"; - - async_write(processed, [](bool success) { - std::cout << " [write] 写入" - << (success ? "成功" : "失败") << "\n"; - std::cout << "全部完成!\n"; - }); - }); - }); - - // 等待异步操作完成(仅用于演示,生产代码别这么干) - std::this_thread::sleep_for(std::chrono::seconds(1)); - return 0; -} ``` -Do you see the problem? Three levels of nested lambdas—this is the so-called **callback hell**. Every additional async step adds another level of indentation. If you have five or ten async operations, the code's readability drops drastically, and the indentation on the right runs right off the screen. Moreover, nesting doesn't just affect readability; the deeper issues lie in fragmented control flow, scattered error handling, and complex lifetime management—these are the real pain points of the callback model. +Do you see the problem? Three levels of nested lambdas—this is so-called **callback hell**. With every additional asynchronous step, the indentation goes deeper. If you have 5 or 10 steps, readability drops precipitously, and the indentation runs off the screen. Furthermore, the nesting affects more than just readability; the deeper issues are the fragmentation of control flow, scattered error handling, and complex lifetime management—these are the real pain points of the callback model. -> ⚠️ This code uses `detach()` to simplify the demonstration. In production code, you should use a thread pool or `join()` to manage thread lifetimes, rather than letting threads run unmanaged. +> ⚠️ This code uses `std::thread::detach` to simplify the demonstration. In production code, you should use a thread pool or `std::async` to manage the thread lifecycle, rather than letting threads run uncontrolled. -The pain points of the callback model go far beyond "too much indentation." 
Let us first discuss the issue of fragmented control flow—what was originally a linear process (read, process, write) gets torn into three independent functions, where each function only knows its own piece of logic. You cannot see the order of the entire flow at a glance, because the order is hidden within the nested callback registrations. When you need to understand "how does the entire flow run," you have to start from the outermost callback and jump inward layer by layer—this is completely different from the cognitive model of reading normal sequential code. +The pain points of the callback model go far beyond "indentation too deep." First, let's discuss the fragmentation of control flow—a process that was originally linear (read, process, write) is split into three independent functions, each knowing only its own piece of logic. You cannot see the order of the entire flow at a glance because the order is hidden in the nested callback registrations. When you need to understand "how the whole flow runs," you have to start from the outermost callback and jump in layer by layer—this is completely different from the cognitive model of reading normal sequential code. -Next is the error handling problem. Every step can fail, and the callback model has no unified error handling mechanism. You typically need to check the previous step's result in each callback, then decide whether to continue or report an error. If you have five steps, you write five pieces of error handling code, and these error handling logic blocks are also nested and fragmented. Without a centralized error handling mechanism like `try/catch`, you can only fend for yourself in each callback. +Next is the error handling problem. Every step can fail, and the callback model lacks a unified error handling mechanism. You usually need to check the result of the previous step in each callback and decide whether to continue or report an error. 
If there are 5 steps, you write 5 pieces of error handling code, and these error handling logics are also nested and fragmented. Without a centralized error handling mechanism like `try-catch`, you can only fight on your own in each callback. -The trickiest part is actually lifetime management. A callback is a closure that captures variable references from the outer scope. What if those variables have already been destroyed by the time the callback is invoked asynchronously? Dangling references, use-after-free—these bugs are especially prone to appearing in the callback model. You also have to worry about whether a callback gets called multiple times, whether it never gets called at all, and how to propagate exceptions out of a callback—these problems simply do not exist in synchronous code, but in the callback model, you must handle them one by one. +The trickiest part is actually lifetime management. A callback is a closure that captures references to variables in the outer scope. What if those variables are invalid when the callback is asynchronously invoked? Dangling references, use-after-free—these bugs are particularly prone to occur in the callback model. You also have to worry about whether the callback was called multiple times, or not called at all, and how to propagate exceptions out of the callback—these problems don't exist in synchronous code at all, but in the callback model, you must handle them one by one. -Frankly, the callback model uses "function pointers" to express "what to do next," but a function pointer is a low-level primitive—it has no composability, no error propagation, and no resource management. This is why all languages are looking beyond callbacks for better solutions. +Basically, the callback model uses "function pointers" to express "what to do next," but a function pointer is a low-level primitive—it lacks composability, error propagation, and resource management. 
This is why all languages are looking for better solutions beyond callbacks. -## Future/Promise Chains: A Step Up from Callbacks +## Future/Promise Chains: A Bit Better Than Callbacks -Now that we have seen the pain points of callbacks, let us look at the second approach—the future/promise model. It is the first layer of improvement over callbacks, with the core idea being: an async operation returns a `future`—a token representing "a value that will be available at some point in the future." You can block and wait for the result via `get()`, or register a follow-up operation to execute "when the value is ready" through some mechanism. +Now that we've seen the pain points of callbacks, let's look at the second approach—the future/promise model. It is the first layer of improvement over callbacks. The core idea is: an asynchronous operation returns a `std::future`—a voucher representing "a value that will exist at some point in the future." You can use `get()` to block waiting for the result, or use some method to register a follow-up operation to be executed "when the value is ready." -C++11 introduced `std::future` and `std::promise`, but the standard library's `std::future` has a major limitation: **it does not support `.then()` —meaning you cannot directly register a follow-up operation on a future**. If you want to implement "asynchronously read a file, then process the data," you have to manually orchestrate it: +C++11 introduced `std::future` and `std::promise`, but the standard library's `std::future` has a major limitation: **it does not support continuations (`.then()`)**—that is, you cannot directly register a follow-up operation on a future. 
If you want to implement "read file asynchronously, then process data," you have to orchestrate it manually: ```cpp -#include -#include -#include -#include - -// 模拟异步读取文件 -std::future async_read_file(const std::string& path) -{ - return std::async(std::launch::async, [&path] { - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - return "file content from " + path; - }); -} - -// 模拟异步处理数据 -std::future async_process(const std::string& input) -{ - return std::async(std::launch::async, [&input] { - std::this_thread::sleep_for(std::chrono::milliseconds(50)); - return "processed(" + input + ")"; - }); -} - -// 模拟异步写入 -std::future async_write(const std::string& data) -{ - return std::async(std::launch::async, [&data] { - std::this_thread::sleep_for(std::chrono::milliseconds(80)); - std::cout << " [write] 写入: " << data << "\n"; - return true; - }); -} - -int main() -{ - std::cout << "开始 future 链式处理...\n"; - - // 第一步:异步读取文件 - std::future f1 = async_read_file("data.txt"); - - // 手动编排链式调用——等待 f1 完成,然后启动下一步 - // 注意:标准 std::future 没有 .then(),只能手动串联 - std::string content = f1.get(); - std::cout << " [read] 读到: " << content << "\n"; - - // 第二步:处理数据 - std::future f2 = async_process(content); - std::string processed = f2.get(); - std::cout << " [process] 结果: " << processed << "\n"; - - // 第三步:写入结果 - std::future f3 = async_write(processed); - bool success = f3.get(); - std::cout << " [write] 写入" << (success ? 
"成功" : "失败") << "\n"; - std::cout << "全部完成!\n"; - - return 0; +void process_file_future_blocking() { + // Step 1: Async read + std::future read_future = std::async([] { return read_file(); }); + std::string data = read_future.get(); // Block until read completes + + // Step 2: Async process + std::future process_future = std::async([data] { return process(data); }); + std::string result = process_future.get(); // Block until processing completes + + // Step 3: Async write + std::future write_future = std::async([result] { write_file(result); }); + write_future.get(); // Block until write completes } ``` -You will notice that the nesting in this code is gone—each async step is linear: first `get()` to get the previous step's result, then start the next step. Compared to the callback model, future chains offer a clear improvement in readability: the control flow changes from a "nested callback pyramid" to a "flat linear sequence." +You will notice that the nesting in this code has disappeared—each asynchronous step is linear: first use `get()` to get the result of the previous step, then start the next step. Compared to the callback model, the future chain has significantly improved readability: the control flow has changed from a "nested callback pyramid" to a "flat linear sequence." -But the problem is also obvious: **the main thread blocks at every step**. `f1.get()` blocks until the file read completes, and `f2.get()` blocks until the processing finishes—how is this any different from synchronous code? If you want to truly achieve the effect of "the main thread doesn't block, and async steps are automatically chained," you need `.then()`—automatically invoking the registered function after the future's value becomes ready, returning a new future and forming a chain. +But the problem is also obvious: **the main thread blocks at every step**. 
`read_future.get()` blocks until the file is read, `process_future.get()` blocks until processing is complete—how is this different from synchronous code? If you want to truly achieve "non-blocking main thread, automatic chaining of asynchronous steps," you need continuations (`.then()`)—automatically calling the registered function when the future's value is ready, returning a new future, forming a chain call. -`std::future::then()` first appeared in C++'s Concurrency TS (Technical Specification) as part of `std::experimental::future`, and Boost.Asio's `boost::future` also implements full `.then()` support. However, the Concurrency TS was ultimately not merged into the international C++ standard—as of C++23, the standard `std::future` still lacks `.then()`. The C++ committee's stance is: rather than patching `std::future`, it is better to push the Sender/Receiver model (proposal P2300, i.e., `std::execution`, which was officially merged into the C++26 working draft at the St. Louis meeting in July 2024). So in standard C++, although `std::execution` is just around the corner, chaining with the current `std::future` remains a clumsy affair. +`.then()` first appeared in C++'s Concurrency TS (Technical Specification) as part of `std::experimental::future`, and Boost.Thread's `boost::future` also implements complete continuation support. However, the Concurrency TS was ultimately not merged into the C++ international standard—as of C++23, the standard `std::future` still lacks `.then()`. The C++ Committee's attitude is: rather than patching `std::future`, it's better to push the Sender/Receiver model (proposal P2300, i.e., `std::execution`, which was officially merged into the C++26 working draft at the St. Louis meeting in July 2024). So in standard C++, although `std::execution` is coming, chaining with the current `std::future` remains a clumsy task.
-> ⚠️ If you need future chaining, you can refer to Boost.Asio's `boost::future::then()`, or use third-party libraries like `thousandeyes-futures`. But the standard C++ `std::future` temporarily lacks this capability. +> ⚠️ If you need future chaining, you can refer to Boost.Thread's `boost::future::then()` or use third-party libraries like `thousandeyes-futures`. But standard C++'s `std::future` currently lacks this capability. -Future/Promise chains are indeed an improvement over callbacks, but they introduce their own problems. A future itself involves heap allocation—each future internally has a shared state, used to pass values and exceptions between the write end (promise/async) and the read end (future). This shared state is typically heap-allocated, so when you chain multiple futures, you incur multiple heap allocations. Exception propagation is also not very intuitive—if a step in the chain throws an exception, the exception is caught and stored in the future's shared state, only to be re-thrown when you call `get()`. This means you must check for exceptions at every step, otherwise subsequent steps in the chain might start in an exceptional state. +Future/Promise chains are certainly an improvement over callbacks, but they introduce their own problems. Futures themselves involve heap allocation—every future has a shared state internally, used to pass values and exceptions between the write end (promise/async) and the read end (future). This shared state is usually heap-allocated, so when you link multiple futures, you have multiple heap allocations. Exception propagation is also not very intuitive—if a step in the chain throws an exception, the exception is caught and stored in the future's shared state, only to be re-thrown when you call `get()`. This means you must check for exceptions at every step, otherwise subsequent steps in the chain might start in an exceptional state.
-## Coroutines: Writing Async Code Like Sync Code +## Coroutines: Writing Asynchronous Code Like Synchronous Code -Callbacks are too fragmented, and future chains are too clumsy—so is there a way to make async code **read exactly like synchronous code**, while executing asynchronously? In other words, the code looks like a linear flow: read file, process, write back, with no callbacks, no nesting, and no manual orchestration, but the underlying execution is automatically async? +Callbacks are too fragmented, and future chains are too clumsy. Is there a way to make asynchronous code **look exactly like synchronous code**, but execute asynchronously? That is, the code looks like a linear flow: read file, process, write back, with no callbacks, no nesting, no manual orchestration, but the underlying execution is automatically asynchronous? -This is the core selling point of C++20 coroutines. Let us look at the code first, and then explain what it does. The following code implements the same "read → process → write back" flow as before, but using the coroutine style. +This is the core selling point of C++20 coroutines. Let's look at the code first, then explain what it does. The following code implements the same "read → process → write back" flow as before, but using the coroutine style. -Do not be intimidated by the amount of code—we will break it down from the start. The first piece is the `Task` struct, which defines the coroutine's return type. C++20 coroutines require the return type to internally contain a nested type called `promise_type`, which the compiler uses to customize various behavioral policies of the coroutine. 
You can see that `promise_type` contains several functions with fixed names: `get_return_object()` creates the Task object returned to the caller, `initial_suspend()` determines whether the coroutine suspends at the very beginning (here it returns `suspend_never`, meaning the coroutine starts executing immediately), `final_suspend()` determines the behavior after the coroutine finishes (returning `suspend_always` means the coroutine suspends there after finishing, waiting for external destruction), `return_void()` handles the case of `co_return` or normal function completion, and `unhandled_exception()` handles uncaught exceptions. These functions form the basic skeleton of the coroutine lifecycle. +Don't be intimidated by the amount of code—we'll break it down from the beginning. The first block is the `Task` struct, which defines the return type of the coroutine. C++20 coroutines require that the return type internally contains a nested type named `promise_type`. The compiler customizes various behavior policies of the coroutine through this type. You see several fixed-name functions inside `promise_type`: `get_return_object` creates the `Task` object returned to the caller; `initial_suspend` determines whether the coroutine suspends at the very beginning (here it returns `std::suspend_never`, meaning the coroutine starts executing immediately); `final_suspend` determines the behavior after the coroutine ends (returns `std::suspend_always`, meaning the coroutine suspends there after completion, waiting for external destruction); `return_void` handles a bare `co_return` or the coroutine body running off its end; `unhandled_exception` handles uncaught exceptions. These functions constitute the basic skeleton of the coroutine lifecycle. -Next are three awaitable types—`AsyncRead`, `AsyncProcess`, and `AsyncWrite`. 
Each implements three key functions: `await_ready()` returns `false` indicating "the operation is not yet complete, needs to suspend"; `await_suspend()` is called when the coroutine suspends, where we start a new thread to simulate async I/O, and the thread calls `h.resume()` to resume the coroutine upon completion; `await_resume()` is called when the coroutine resumes, and its return value becomes the result of the `co_await` expression. You will notice that each awaitable is essentially a "descriptor for an async operation"—it tells the coroutine "when the operation will be ready," "what to do when suspending," and "what result to give you when resuming." +Next are three awaitable types—`AsyncRead`, `AsyncProcess`, `AsyncWrite`. Each implements three key functions: `await_ready` returns `false` to indicate "the operation is not complete yet, needs to suspend"; `await_suspend` is called when the coroutine suspends—here we start a new thread to simulate asynchronous I/O, and the thread calls `coroutine_handle::resume` to resume the coroutine when done; `await_resume` is called when the coroutine resumes, and its return value becomes the result of the `co_await` expression. You will find that each awaitable is essentially a "descriptor for an asynchronous operation"—it tells the coroutine "when the operation is ready," "what to do when suspending," and "what result to give when resuming." -Finally, there is the `process_file` coroutine function. Look at this code—if you ignore the `co_await` keyword, it looks no different from an ordinary synchronous function. A linear flow, stepping through one by one, no callbacks, no nesting, no `get()` blocking. But its execution is asynchronous: whenever it encounters `co_await`, the coroutine suspends, control returns to the caller, and the underlying thread can go do other things; when the async operation completes, the coroutine resumes from the suspension point and continues executing. 
+Finally, there is the `process_file_coroutine` coroutine function. Look at this code—if you ignore the `co_await` keyword, it looks no different from a normal synchronous function. Linear flow, step by step, no callbacks, no nesting, no `get()` blocking. But its execution is asynchronous: whenever it encounters `co_await`, the coroutine suspends, control is returned to the caller, and the underlying thread can go do other things; when the asynchronous operation completes, the coroutine resumes from the suspension point and continues executing. ```cpp #include -#include -#include #include -#include - -// ---- 一个最简的协程任务类型 ---- - -struct Task -{ - struct promise_type - { - Task get_return_object() - { - return Task{std::coroutine_handle::from_promise( - *this)}; +#include +#include + +// Custom coroutine return type +struct Task { + struct promise_type { + Task get_return_object() { + return Task{std::coroutine_handle::from_promise(*this)}; } std::suspend_never initial_suspend() { return {}; } std::suspend_always final_suspend() noexcept { return {}; } @@ -247,124 +151,93 @@ struct Task void unhandled_exception() { std::terminate(); } }; - std::coroutine_handle handle; -}; - -// ---- 模拟异步操作的 awaitable ---- - -struct AsyncRead -{ - std::string path; - std::string result; + std::coroutine_handle h; + Task(std::coroutine_handle handle) : h(handle) {} + ~Task() { if (h && !h.done()) h.destroy(); } - bool await_ready() { return false; } // 总是挂起 + // Non-copyable + Task(const Task&) = delete; + Task& operator=(const Task&) = delete; +}; - void await_suspend(std::coroutine_handle<> h) - { - // 在新线程上模拟异步 I/O - std::thread([this, h] { - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - result = "file content from " + path; - h.resume(); // I/O 完成,恢复协程 +// Awaitable: Async Read +struct AsyncRead { + std::string data; + bool await_ready() { return false; } // Always suspend + void await_suspend(std::coroutine_handle<> handle) { + std::thread([handle, this] { + data 
= read_file(); // Simulate async I/O + handle.resume(); }).detach(); } - - std::string await_resume() { return std::move(result); } + std::string await_resume() { return data; } }; -struct AsyncProcess -{ +// Awaitable: Async Process +struct AsyncProcess { std::string input; std::string result; - bool await_ready() { return false; } - - void await_suspend(std::coroutine_handle<> h) - { - std::thread([this, h] { - std::this_thread::sleep_for(std::chrono::milliseconds(50)); - result = "processed(" + input + ")"; - h.resume(); + void await_suspend(std::coroutine_handle<> handle) { + std::thread([handle, this] { + result = process(input); + handle.resume(); }).detach(); } - - std::string await_resume() { return std::move(result); } + std::string await_resume() { return result; } }; -struct AsyncWrite -{ - std::string data; - +// Awaitable: Async Write +struct AsyncWrite { + std::string content; bool await_ready() { return false; } - - void await_suspend(std::coroutine_handle<> h) - { - std::thread([this, h] { - std::this_thread::sleep_for(std::chrono::milliseconds(80)); - std::cout << " [write] 写入: " << data << "\n"; - h.resume(); + void await_suspend(std::coroutine_handle<> handle) { + std::thread([handle, this] { + write_file(content); + handle.resume(); }).detach(); } - - bool await_resume() { return true; } + void await_resume() {} }; -// ---- 协程函数:看起来跟同步代码一模一样 ---- - -Task process_file(const std::string& path) -{ - std::cout << "开始处理 " << path << "...\n"; - - // co_await:挂起协程,等待异步操作完成 - std::string content = co_await AsyncRead{path}; - std::cout << " [read] 读到: " << content << "\n"; - - std::string processed = co_await AsyncProcess{content}; - std::cout << " [process] 结果: " << processed << "\n"; - - bool success = co_await AsyncWrite{processed}; - std::cout << " [write] 写入" << (success ? 
"成功" : "失败") << "\n"; - - std::cout << "全部完成!\n"; -} - -int main() -{ - process_file("data.txt"); - - // 等待异步操作完成(演示用) - std::this_thread::sleep_for(std::chrono::seconds(1)); - return 0; +// Coroutine function +Task process_file_coroutine() { + // Step 1: Async read + std::string data = co_await AsyncRead{}; + // Step 2: Async process + std::string result = co_await AsyncProcess{data}; + // Step 3: Async write + co_await AsyncWrite{result}; } ``` -This is the magic of coroutines: **async code is written as straightforwardly as sync code, but the execution model is fully asynchronous**. +This is the magic of coroutines: **asynchronous code is written as straightforwardly as synchronous code, but the execution model is fully asynchronous**. -Looking back, C++20 introduced a total of three keywords for coroutines. `co_await expr` suspends the current coroutine, waits for the async operation represented by `expr` to complete, and the operation's result becomes the return value of the `co_await` expression—this is the one we use the most, and every async operation in the example above suspends and resumes through it. `co_yield expr` yields a value and suspends the coroutine—this is the foundation of generators, which we will see later. `co_return expr` returns a value and ends the coroutine. As long as any of these three keywords appears in the function body, the compiler treats it as a coroutine—no special function declarations or modifiers are needed. This design is indeed elegant. +Looking back, C++20 introduced three keywords for coroutines. `co_await` suspends the current coroutine, waiting for the asynchronous operation represented by the awaitable to complete, and the result of the operation becomes the return value of the `co_await` expression—this is what we use most often, and every asynchronous operation in the example above uses it to suspend and resume. 
`co_yield` yields a value and suspends the coroutine—this is the foundation of generators, which we will see later. `co_return` returns a value and ends the coroutine. As long as any of these three keywords appears in the function body, the compiler treats it as a coroutine—no special function declaration or modifiers are needed. This design is indeed elegant. -> ⚠️ Coroutine return types have strict requirements: they must contain a nested `promise_type` type. The compiler uses this `promise_type` to customize various coroutine behaviors. We will dive deep into this mechanism in the next chapter. +> ⚠️ Coroutine return types have strict requirements: they must contain a nested `promise_type`. The compiler customizes various coroutine behaviors through this `promise_type`. We will dissect this mechanism in depth in the next article. -In this example, we hand-wrote `Task`, `AsyncRead`, `AsyncProcess`, and `AsyncWrite`—these helper types make the code look substantial. But in real projects, this infrastructure is typically provided by frameworks (such as Boost.Asio's `awaitable` or cppcoro's `task`), and you only need to write the linear logic inside `process_file`. C++20 coroutines provide a language-level mechanism, while libraries are responsible for providing easy-to-use wrappers—this is a combined design of "language feature + library support." +In this example, we hand-wrote `Task`, `AsyncRead`, `AsyncProcess`, and `AsyncWrite` helper types, which looks like a lot of code. But in actual projects, this infrastructure is usually provided by frameworks (like Boost.Asio's `awaitable` or cppcoro's `task`), and you only need to write the linear logic inside the coroutine function. C++20 coroutines provide a language-level mechanism, and the library is responsible for providing easy-to-use wrappers—this is a "language feature + library support" design. -## Comparing the Three Models +## Comparison of the Three Models -Now let us look at all three models side by side. 
+Now let's look at the three models together. -The callback model produces the most "fragmented" code—the linear flow is torn into nested callback functions, and the control flow is no longer a straight top-to-bottom line but jumps around following callback registration relationships. Error handling must be dealt with individually in each callback, and there is no unified exception propagation mechanism. However, callbacks themselves have almost zero runtime overhead—they are essentially just a function pointer plus a captured closure, making them the highest in performance. But debugging a callback chain is a nightmare: the call stack is broken. When something goes wrong in the fifth level of callbacks, your debugger can only see that single callback's stack frame; all the calling relationships above are lost. +The callback model code is the most "fragmented"—the linear flow is split into nested callback functions, and the control flow is no longer a straight line from top to bottom but jumps according to callback registration relationships. Error handling needs to be handled separately in each callback, and there is no unified exception propagation mechanism. However, callbacks themselves have almost no runtime overhead—they are essentially just a function pointer plus a captured closure, so performance is the highest. But debugging a callback chain is a nightmare: the call stack is broken, and when the 5th layer callback has a problem, your debugger can only see that one callback's stack frame; the calling relationships above are all lost. -Future/Promise chains are much better than callbacks in terms of readability. Through `.then()` (or manual `get()` chaining), the flow can be written as a linear chain of calls. Exceptions automatically propagate through the future's shared state—if a step throws an exception, it travels along the chain to the final `get()` call. 
But performance-wise, there is an overhead that cannot be ignored: every future involves a heap allocation (for the shared state), so chaining ten async operations means ten heap allocations. Debugging difficulty is moderate—at least the call stack is continuous, but future chain error messages are usually not very friendly; you see a `std::future_error` rather than which specific step in the chain failed and why. +Future/Promise chains are much better than callbacks in terms of readability. Through `.then()` (or manual `get()` chaining), the flow can be written as a linear chain call. Exceptions propagate automatically through the future's shared state—if a step throws an exception, it travels along the chain to the final `get()` call. But performance-wise, there is a non-negligible overhead: every future involves a heap allocation (shared state), so when you link 10 asynchronous operations, that's 10 heap allocations. Debugging difficulty is moderate—at least the call stack is continuous, but future chain error messages are usually not very friendly; you see a `broken_promise`, not what specifically went wrong at which step in the chain. -Coroutines offer the best readability of the three models—a coroutine function looks exactly like a synchronous function, the control flow is linear, and the cognitive burden of reading and understanding it is the lowest. For error handling, you can use `try/catch`, and exceptions propagate normally within the coroutine, behaving exactly like synchronous code. Performance-wise, coroutine frames are typically heap-allocated, but the compiler can perform "coroutine elision" optimizations, embedding the frame into the caller's stack frame. Each suspension point only involves saving/restoring registers and the coroutine state, which is much lighter than a thread context switch. 
The debugging experience is close to that of synchronous code—the call stack is complete, and you can set a breakpoint on the `co_await` line; when the coroutine resumes execution, the debugger will correctly stop there. +Coroutines are the best of the three models in terms of readability—the coroutine function looks exactly like a synchronous function, the control flow is linear, and the cognitive burden of reading and understanding is the lowest. Error handling can use `try-catch`, and exceptions propagate normally within the coroutine, behaving exactly like synchronous code. Performance-wise, coroutine frames are usually heap-allocated, but the compiler can perform "coroutine elision" optimization to embed the frame into the caller's stack frame. Each suspension point only involves saving/restoring registers and coroutine state, which is much lighter than a thread context switch. The debugging experience is close to synchronous code—the call stack is complete, you can set a breakpoint on the `co_await` line, and the debugger will correctly stop there when the coroutine resumes execution. -But coroutines also come with their own cost—the C++20 coroutine mechanism is quite complex. The collaborative relationships among `promise_type`, `coroutine_handle`, `awaitable`, and `awaiter` require time to understand. The compiler performs extensive transformations on coroutine functions, and if something goes wrong, the error messages can be very cryptic. The good news is that once you understand this mechanism, using it feels very natural—and in the next chapter, we will dive deep into breaking down this mechanism. +But coroutines also have their costs—the C++20 coroutine mechanism is quite complex. `co_await`, `co_yield`, `co_return`, `promise_type`, the collaboration between these concepts requires time to understand. The compiler performs massive transformations on coroutine functions, and if something goes wrong, the error messages can be very obscure. 
The good news is that once you understand the mechanism, using it feels very natural—and in the next article, we will dissect this mechanism in depth. ## Where We Are -In this chapter, we traveled three stops along the evolutionary path of async programming. The callback model uses function pointers to express "what to do next"—simple but fragmented, with readability and maintainability dropping drastically as nesting deepens. Future/Promise chains replace nested callbacks with "value containers + chain composition," making the control flow linear, but standard C++'s `std::future` lacks `.then()` support (the Concurrency TS's `.then()` was never merged into the international standard), making chain composition still clumsy, and every future incurs a heap allocation overhead. Coroutines make async code read as straightforwardly as sync code—C++20 provides coroutine support at the language level through three keywords: `co_await`/`co_yield`/`co_return`, and the underlying suspend/resume mechanism is jointly implemented by the compiler and the promise_type. +In this article, we walked through three stops along the evolutionary path of asynchronous programming. The callback model uses function pointers to express "what to do next"—simple but fragmented, and readability and maintainability drop precipitously as nesting deepens. Future/Promise chains replace nested callbacks with "value containers + chain composition," making the control flow linear, but standard C++'s `std::future` lacks `.then()` support (the Concurrency TS's `.then()` was never merged into the international standard), chain composition remains clumsy, and every future incurs a heap allocation overhead. Coroutines make asynchronous code read as straightforwardly as synchronous code—C++20 provides coroutine support at the language level through three keywords: `co_await`/`co_yield`/`co_return`, and the underlying suspend/resume mechanism is implemented jointly by the compiler and `promise_type`. 
-But "looking simple" does not mean "simple underneath." The internal mechanism of C++20 coroutines is quite intricate—the compiler transforms a coroutine function into a state machine, where each `co_await` is a state transition point; `promise_type` customizes the coroutine's various behavioral policies; and `coroutine_handle` is a non-owning handle to the coroutine frame, responsible for resuming and destroying it. In the next chapter, we will tear this mechanism apart inside and out: what transformations does the compiler actually perform on a coroutine function? What is stored inside the coroutine frame? How does `coroutine_handle` manage lifetimes? We will also implement an integer generator from scratch that supports `co_yield`, tying all the concepts together. +But "looking simple" doesn't mean "simple behind the scenes." The internal mechanism of C++20 coroutines is quite ingenious—the compiler transforms the coroutine function into a state machine, every `co_await` is a state transition point; `promise_type` customizes the coroutine's various behavior policies; `coroutine_handle` is a non-owning handle to the coroutine frame, responsible for resumption and destruction. In the next article, we will dissect this mechanism inside and out: What exactly does the compiler do to the coroutine function? What is stored in the coroutine frame? How does `coroutine_handle` manage the lifecycle? We will also implement a `generator` that can `co_yield` integers from scratch, tying all the concepts together. -> 💡 The complete example code is in [Tutorial_AwesomeModernCPP](https://github.com/Awesome-Embedded-Learning-Studio/Tutorial_AwesomeModernCPP), visit `code/volumn_codes/vol5/ch06-async-io-coroutine/`. +> 💡 Complete example code is available in [Tutorial_AwesomeModernCPP](https://github.com/Awesome-Embedded-Learning-Studio/Tutorial_AwesomeModernCPP), visit `vol5-async`. 
## References diff --git a/documents/en/vol5-concurrency/ch07-actor-channel/index.md b/documents/en/vol5-concurrency/ch07-actor-channel/index.md index 07e932a43..a701a72ff 100644 --- a/documents/en/vol5-concurrency/ch07-actor-channel/index.md +++ b/documents/en/vol5-concurrency/ch07-actor-channel/index.md @@ -1,24 +1,25 @@ --- title: Actor Model and CSP -description: Exploring the "shared-nothing" concurrency paradigm — message passing - in the Actor model and channel communication in CSP +description: Exploring the "shared-nothing" concurrency paradigm—message passing in + the Actor model and channel communication in CSP translation: source: documents/vol5-concurrency/ch07-actor-channel/index.md source_hash: 3e41c426a720badc99a32416c29acdb379053566adaa7f22ff98abcf7c77ce70 - translated_at: '2026-05-20T04:48:45.124964+00:00' + translated_at: '2026-06-13T11:55:25.310489+00:00' engine: anthropic token_count: 197 --- -#柯达明锐PRO的内饰设计确实展现出了大众集团在紧凑型轿车领域的成熟功底。整体风格偏向实用与规整,但在细节处又融入了诸多提升科技感与质感的元素。 +# The Actor Model and CSP -最引人注目的当属中控台上的悬浮式大屏,这不仅紧跟当下的智能化潮流,也有效拉伸了车内的视觉宽度。屏幕下方的触控式空调面板虽然在一定程度上牺牲了盲操的便利性,但极大地提升了整体的美观度和科技氛围。 +In previous chapters, we used tools like mutexes, atomics, and futures to protect shared state and coordinate execution order between threads. However, shared memory with locks is just one paradigm of concurrent programming—another school of thought advocates "don't share memory," using message passing to replace locks. -方向盘与仪表盘的组合也是一大亮点。多功能方向盘握感扎实,按键布局合理,配合全液晶数字仪表盘,能够清晰地显示丰富的行车信息,驾驶者可以根据个人喜好切换不同的显示模式,互动体验出色。 +In this chapter, we dive into two "shared-nothing" concurrency models: the Actor model and CSP (Communicating Sequential Processes). The Actor model, proposed by Carl Hewitt in 1973, organizes concurrency using identified Actors and asynchronous message passing, and has been proven at scale in Erlang and Akka. 
CSP, proposed by Tony Hoare in 1978, connects independent sequential processes using anonymous channels; Go's goroutine + channel is the classic implementation of CSP. -在用料方面,明锐PRO也展现出了足够的诚意。中控台上方采用了大面积的软性材质包裹,触感细腻;门板处也运用了软性材质与皮质包裹的拼接工艺,辅以细腻的缝线处理,整体质感在同级别车型中属于上乘水平。 +We will use C++ to implement the core components of an Actor framework (mailbox, message loop, supervisor) and Go-like communication pipelines (buffered/unbuffered, close semantics, select mode) from scratch. We will understand their design motivations and implementation principles, and discuss how to choose the appropriate concurrency abstraction in real-world projects. -空间方面,得益于其掀背式的车身设计,明锐PRO的后排头部和腿部空间都表现得十分宽裕。后备箱的开口巨大,装载能力极强,这也是斯柯达品牌一直以来的核心卖点之一,对于家庭用户来说非常实用。 +## Chapter Contents -当然,内饰设计也并非完美无缺。比如部分硬塑料的使用依然存在,车机系统的流畅度和功能丰富度虽然够用,但与同级别一些主打智能化的自主品牌车型相比,在UI设计和生态拓展方面还有提升的空间。 - -总体而言,斯柯达明锐PRO的内饰在传承大众系实用主义基因的基础上,通过悬浮大屏、全液晶仪表等配置成功营造出了更强的科技感与高级感,用料扎实,空间实用,是一套非常均衡且符合大众审美的内饰设计方案。 + + The Actor Model and Message Passing + Channel and the CSP Model + diff --git a/documents/en/vol5-concurrency/ch08-debug-testing-perf/02-concurrency-benchmarks.md b/documents/en/vol5-concurrency/ch08-debug-testing-perf/02-concurrency-benchmarks.md index 92e770296..bfc8a2230 100644 --- a/documents/en/vol5-concurrency/ch08-debug-testing-perf/02-concurrency-benchmarks.md +++ b/documents/en/vol5-concurrency/ch08-debug-testing-perf/02-concurrency-benchmarks.md @@ -1,7 +1,7 @@ --- -title: Concurrency Performance Testing and Benchmarks +title: Concurrency Performance Testing and Benchmarking description: Master the usage of Google Benchmark, avoid common pitfalls in concurrent - benchmarking, and learn to use performance counters to pinpoint bottlenecks. + benchmarking, and learn to use performance counters to locate bottlenecks. 
chapter: 8 order: 2 tags: @@ -25,80 +25,62 @@ related: - CPU cache 与 OS 线程 translation: source: documents/vol5-concurrency/ch08-debug-testing-perf/02-concurrency-benchmarks.md - source_hash: affc82a449231135acc36f7d7c00bc0aca70c6dc83b95c57c99da4ee44494c82 - translated_at: '2026-05-20T04:49:33.543564+00:00' + source_hash: cec196de6157ff04ef51de1d14d828f4ea56457f99f926eaa2d0894e6f54d349 + translated_at: '2026-06-13T11:52:15.714726+00:00' engine: anthropic - token_count: 4220 + token_count: 4251 --- # Concurrency Performance Testing and Benchmarking -In the previous article, we tackled correctness—using TSan to catch data races, Helgrind to check lock ordering, and Clang TSA to prevent thread safety violations at compile time. However, a correct concurrent program is not necessarily an efficient one. We have seen too many scenarios like this: someone spends three days replacing a mutex with a lock-free queue, excitedly announces a "3x performance boost," but a look at the benchmark methodology reveals a single run, no warmup, the compiler almost optimized away the entire loop, and not even `UseRealTime` was added. The "3x boost" you measured might just be measurement noise. +> 📖 **Deep Dive**: This article focuses on benchmarking in concurrent scenarios. For more general performance engineering—benchmarking methodology, cache friendliness, SIMD/AVX, and assembly reading—check out [Volume 6: Performance Engineering](../../vol6-performance/index.md). -The core question we will address in this article is: how do we scientifically measure the performance of concurrent programs? We will start with the basics of Google Benchmark, then dive into the design pitfalls of concurrent benchmarking (there are far more traps than you might imagine), walk through a real-world case comparing the actual performance differences of various synchronization schemes, and finally introduce `perf stat`, a Linux performance counter tool that can tell you exactly why your program is slow. 
+In the previous article, we solved the correctness problem—using TSan to catch data races, Helgrind to check lock order, and Clang TSA to prevent thread safety violations at compile time. However, a correct concurrent program is not necessarily an efficient concurrent program. I have seen too many scenarios where someone spends three days replacing a mutex with a lock-free queue, excitedly announcing a "3x performance boost," only to find the benchmark methodology flawed: a single run, no warm-up, the compiler optimizing away the entire loop, and even missing `DoNotOptimize`. The "3x boost" you measured might just be measurement error. + +In this article, our core problem to solve is: how to scientifically measure the performance of concurrent programs. We will start with the basic usage of Google Benchmark, then dive into the design traps of concurrent benchmarking (there are more pitfalls than you can imagine), followed by a real-world case study comparing the real performance differences of different synchronization schemes. Finally, we will introduce `perf stat`, a performance counter tool on Linux that can tell you exactly where your program is slow. ## Google Benchmark Basics ### Installation -Google Benchmark (referred to as GBench below) is the most mainstream microbenchmarking framework in the C++ ecosystem, open-sourced and maintained by Google. There are several ways to install it, but the easiest is using CMake's FetchContent: +Google Benchmark (hereinafter referred to as GBench) is the most mainstream micro-benchmarking framework in the C++ ecosystem, open-sourced and maintained by Google. 
There are several ways to install it; the simplest is using CMake's FetchContent: ```cmake -cmake_minimum_required(VERSION 3.20) -project(concurrency_benchmarks CXX) - -set(CMAKE_CXX_STANDARD 17) - +# In your CMakeLists.txt include(FetchContent) FetchContent_Declare( - benchmark - GIT_REPOSITORY https://github.com/google/benchmark.git - GIT_TAG v1.9.0 + benchmark + GIT_REPOSITORY https://github.com/google/benchmark.git + GIT_TAG v1.8.3 ) -# 不让 GBench 自己跑测试,节省编译时间 -set(BENCHMARK_ENABLE_TESTING OFF) FetchContent_MakeAvailable(benchmark) + +# Link to your target +target_link_libraries(your_target benchmark::benchmark benchmark::benchmark_main) ``` If you prefer a system-level installation: ```bash # Ubuntu/Debian -sudo apt install libbenchmark-dev +sudo apt-get install libbenchmark-dev -# macOS +# macOS (brew) brew install google-benchmark - -# vcpkg -vcpkg install benchmark ``` ### Your First Benchmark -The core idea behind GBench is: you write a function, and the framework automatically determines how many iterations to run to achieve statistically reliable results. Let's write a simplest example to get familiar with its API: +The core idea of GBench is: you write a function, and the framework automatically decides how many iterations to run to get statistically reliable results. 
Let's write a simple example to get familiar with its API: ```cpp #include -#include -#include - -// 一个简单的累加 benchmark -static void bm_vector_sum(benchmark::State& state) -{ - // Setup 阶段:不在计时范围内 - std::vector data(10000, 42); - - // 计时循环:框架会反复执行这段代码 - for (auto _ : state) { - int sum = std::accumulate(data.begin(), data.end(), 0); - // 防止编译器优化掉 sum - benchmark::DoNotOptimize(sum); - } - - // 可选:报告额外信息 - state.SetItemsProcessed(state.iterations() * data.size()); -} -BENCHMARK(bm_vector_sum); +static void BM_StringCreation(benchmark::State& state) { + for (auto _ : state) { + std::string create_string("Hello, World!"); // This code gets timed + } +} +BENCHMARK(BM_StringCreation); BENCHMARK_MAIN(); ``` @@ -106,452 +88,288 @@ BENCHMARK_MAIN(); Compile and run: ```bash -clang++ -O2 -std=c++17 benchmark_demo.cpp -lbenchmark -lpthread -o demo -./demo +g++ -O3 -std=c++23 main.cpp -lbenchmark -lpthread -o benchmark +./benchmark ``` The output will look something like this: ```text -------------------------------------------------------- -Benchmark Time CPU Iterations -------------------------------------------------------- -bm_vector_sum 1234 ns 1234 ns 567890 +-------------------------------------------------------------- +Benchmark Time CPU Iterations +-------------------------------------------------------------- +BM_StringCreation 5.3 ns 5.3 ns 100000000 ``` -The meaning of each column: `Time` is the wall time, `CPU` is the CPU time (the actual time the process spent on the CPU, including user and kernel mode), and `Iterations` is how many iterations the framework ran. For single-threaded benchmarks, Time and CPU should be very close; but for multi-threaded benchmarks, CPU time is the sum of CPU times across all threads—which is why we need `UseRealTime`. +Meaning of each column: `Time` is the wall clock time, `CPU` is the CPU time (the actual time the process spent on the CPU, including user and kernel mode), and `Iterations` is how many times the framework ran the loop. 
For single-threaded benchmarks, Time and CPU should be very close; but for multi-threaded benchmarks, CPU time will be the sum of CPU time across all threads—which is why we need `->UseRealTime()`. ### Multi-threaded Benchmarks -GBench natively supports multi-threaded testing. We can specify the thread count using `Threads(n)`, or use `ThreadRange` to automatically iterate over different thread counts: +GBench natively supports multi-threaded testing. You can specify the thread count via `->Threads()`, or use `->ThreadRange()` to automatically iterate through different thread counts: ```cpp -#include -#include - -// 一个 atomic 计数器的多线程 benchmark -static void bm_atomic_counter(benchmark::State& state) -{ - std::atomic counter{0}; - const int num_threads = state.threads(); - - for (auto _ : state) { - // 每个线程做一次原子递增 - counter.fetch_add(1, std::memory_order_relaxed); - benchmark::ClobberMemory(); - } - - // 使用墙钟时间,否则多线程下 CPU 时间是累加的 - state.SetItemsProcessed(state.iterations()); +static void BM_MultiThreaded(benchmark::State& state) { + for (auto _ : state) { + // Simulate some work + benchmark::DoNotOptimize(state.iterations()); + } } - -// 测试 1/2/4/8 线程 -BENCHMARK(bm_atomic_counter) - ->ThreadRange(1, 8) - ->UseRealTime(); - -BENCHMARK_MAIN(); +BENCHMARK(BM_MultiThreaded)->ThreadRange(1, 8)->UseRealTime(); ``` -There are a few key points to explain here. `ThreadRange(1, 8)` makes the framework run this benchmark with 1, 2, 4, and 8 threads (powers of two). `UseRealTime()` is crucial—without it, the framework reports CPU time by default, and in multi-threaded scenarios, CPU time is the sum of all thread times. For example, if 4 threads run for 100ms of wall time, the CPU time might be 350ms (due to waiting and scheduling overhead). If you report CPU time, you might think it "got slower"—which is completely misleading. 
`ClobberMemory()` is a compiler-level memory barrier that tells the compiler "do not cache any memory state," preventing the optimizer from optimizing away our atomic operations. +Here are a few key points to explain. `ThreadRange(1, 8)` tells the framework to run this benchmark with 1, 2, 4, and 8 threads (powers of two). `UseRealTime()` is critical—without it, the framework reports CPU time by default. Under multi-threading, CPU time is the sum of all threads' time. For example, if 4 threads run for 100ms of wall time, CPU time might be 350ms (due to waiting and scheduling overhead). If you report CPU time, you might think "it got slower"—which is completely misleading. `DoNotOptimize` forces the compiler to treat its argument as used (it is not a memory barrier), so the optimizer cannot discard the work done inside the loop. The output will be similar to: ```text ------------------------------------------------------------------- -Benchmark Time CPU Iterations ------------------------------------------------------------------- -bm_atomic_counter/1 2.3 ns 2.3 ns 300000000 -bm_atomic_counter/2 1.8 ns 3.5 ns 400000000 -bm_atomic_counter/4 2.1 ns 7.8 ns 333333333 -bm_atomic_counter/8 3.5 ns 25 ns 200000000 +-------------------------------------------------------------- +Benchmark Time CPU Iterations +-------------------------------------------------------------- +BM_MultiThreaded/1:1 10.2 ms 9.8 ms 68 +BM_MultiThreaded/2:1 5.5 ms 10.6 ms 126 +BM_MultiThreaded/4:1 3.1 ms 12.1 ms 224 +BM_MultiThreaded/8:1 3.5 ms 27.8 ms 201 ``` -Notice the CPU column: the more threads, the higher the total CPU time, but the wall time (Time column) does not decrease linearly—going from 1 to 2 threads shows some speedup, but at 4 and 8 threads it actually gets slower.
This is because all threads are performing write operations on the same atomic variable, causing the cache line to bounce back and forth between cores (this mechanism is similar to false sharing, but strictly speaking, it is cache line contention under true sharing). This is a very typical pattern in concurrent performance analysis: more threads does not mean faster execution. +Look at the CPU column: the more threads, the higher the total CPU time, but the wall time (Time column) doesn't decrease linearly—there's some speedup from 1 to 2 threads, but it actually gets slower at 4 and 8 threads. This is because all threads are performing write operations on the same atomic variable, causing cache lines to bounce between cores (similar to the false sharing mechanism, but strictly speaking, it's cache line contention under true sharing). This is a very typical pattern in concurrent performance analysis: more threads doesn't always mean faster. -## Concurrent Benchmark Design Pitfalls +## Concurrent Benchmark Design Traps -Writing a correct benchmark is harder than writing a correct concurrent program—because you have to fight against compiler optimizations, CPU cache behavior, and OS scheduling policies. These factors cause trouble in single-threaded benchmarks, and become even more exacerbated in multi-threaded ones. +Writing a correct benchmark is harder than writing a correct concurrent program—because you have to fight compiler optimizations, CPU cache behavior, and OS scheduling policies. These factors cause trouble in single-threaded benchmarks, but they get even worse in multi-threaded ones. -### Warmup: Cold Start vs. Steady State +### Warm-up: Cold Start vs. Steady State -The impact of the CPU cache hierarchy (L1, L2, L3) on performance is orders of magnitude. The first time you access a piece of data, it might need to be loaded from main memory (DRAM), taking 100-300 CPU cycles; the second time, it is already in the L1 cache, taking only 3-4 cycles. 
If your benchmark does not warm up, the data loading during the first iteration will significantly inflate the average time. +The CPU's cache hierarchy (L1, L2, L3) has an order-of-magnitude impact on performance. The first time you access data, it might need to be loaded from main memory (DRAM), taking 100-300 CPU cycles; the second time, it's already in L1 cache, taking only 3-4 cycles. If your benchmark doesn't warm up, the data load from the first iteration will severely skew the average time. -GBench's `KeepRunning()` loop does a certain degree of warmup by itself—the framework runs a few iterations first to "stabilize" the results. But if you allocate a large block of memory outside the loop, that memory might not be in the cache during the first iteration. If your goal is to measure "steady-state" performance, you can manually run a few iterations before the loop: +GBench's internal loop does a certain amount of warm-up—the framework runs a few iterations first to "stabilize" the results. But if you allocate a large block of memory outside the loop, that memory might not be in the cache during the first iteration. 
If your goal is to measure "steady-state" performance, you can manually run a few loops before the main loop: ```cpp -static void bm_with_warmup(benchmark::State& state) -{ - std::vector data(10000); - - // 预热:让数据进入缓存 - for (int i = 0; i < 100; ++i) { - volatile int dummy = data[0]; - (void)dummy; - } - - for (auto _ : state) { - int sum = 0; - for (int v : data) { - sum += v; - } - benchmark::DoNotOptimize(sum); - } +static void BM_WithWarmup(benchmark::State& state) { + std::vector data(1024); + + // Manual warm-up + for (int i = 0; i < 1000; ++i) { + benchmark::DoNotOptimize(data[i % 1024]); + } + + for (auto _ : state) { + benchmark::DoNotOptimize(data[state.range(0)]); + } } +BENCHMARK(BM_WithWarmup)->Range(64, 4096); ``` -Conversely—if what you want to measure is "cold start" performance (like the first-execution latency of an operation), then you should not warm up. The key is to be clear about what you are measuring. +Conversely—if you want to measure "cold start" performance (e.g., the latency of an operation's first execution), then you shouldn't warm up. The key is to know exactly what you are measuring. ### Compiler Optimizations: Your Adversary -This is the easiest trap to fall into. The compiler's job is to make your code faster—but your goal is to measure the raw speed of the code. If the compiler finds that your calculation results are not used, it might optimize away the entire loop. If the compiler finds that you are doing the same calculation in every iteration, it might hoist the computation outside the loop and calculate it just once. +This is the easiest trap to fall into. The compiler's job is to make your code fast—but your goal is to measure the raw speed of the code. If the compiler realizes your calculation results aren't used, it might optimize away the entire loop. If it sees you doing the same calculation every loop, it might hoist it out of the loop and calculate it just once. 
GBench provides two key tools to combat these issues: ```cpp -// benchmark::DoNotOptimize(expr) -// 告诉编译器 expr 的值"可能"被外部使用,不要优化掉 -benchmark::DoNotOptimize(result); - -// benchmark::ClobberMemory() -// 告诉编译器所有内存状态都可能被外部修改 -// 相当于一个全局的读写屏障 -benchmark::ClobberMemory(); +benchmark::DoNotOptimize(x); // Prevents the compiler from optimizing away 'x' +benchmark::ClobberMemory(); // Forces the compiler to reload memory from registers ``` A practical pattern is to use them together: ```cpp -for (auto _ : state) { - int result = expensive_computation(); - benchmark::DoNotOptimize(result); - benchmark::ClobberMemory(); +static void BM_AtomicIncrement(benchmark::State& state) { + std::atomic counter{0}; + for (auto _ : state) { + counter.fetch_add(1, std::memory_order_relaxed); + benchmark::ClobberMemory(); // Prevent hoisting the loop + } + benchmark::DoNotOptimize(counter); // Prevent optimizing away the result } +BENCHMARK(BM_AtomicIncrement); ``` -`DoNotOptimize` ensures that `result` is not optimized away, and `ClobberMemory` ensures that memory reads in each iteration are not optimized to "it was already read last time, just reuse it." But be careful not to abuse `ClobberMemory`—it tells the compiler that all memory might be modified, so the compiler must conservatively reload all values cached in registers. In some scenarios, this introduces extra memory access overhead, making the performance you measure worse than actual conditions. +`DoNotOptimize` ensures `counter` isn't optimized away, and `ClobberMemory` ensures memory reads in each loop aren't optimized to "I read this last time, just reuse it." But be careful not to abuse `ClobberMemory`—it tells the compiler that all memory might have been modified, forcing it to conservatively reload all values cached in registers. In some scenarios, this introduces extra memory access overhead, making your measured performance worse than reality. 
### False Sharing: The Invisible Performance Killer -False sharing is a concurrency performance killer—two threads each modify different variables, but these variables happen to reside on the same cache line (typically 64 bytes), causing every write to invalidate the other core's cache line. Let's use a benchmark to intuitively feel its power: +False sharing is a killer of concurrent performance—two threads modifying different variables, but those variables happen to be on the same cache line (usually 64 bytes), causing every write to invalidate the other core's cache line. Let's use a benchmark to intuitively feel its power: ```cpp -#include -#include -#include - -// 有 false sharing 的版本 -struct alignas(64) PaddedCounter { - int value{0}; - // padding 到 64 字节,避免 false sharing - char padding[60]; +struct BadCounter { + std::atomic val; }; -static void bm_false_sharing(benchmark::State& state) -{ - const int num_threads = state.threads(); - - // 故意把计数器紧密排列——制造 false sharing - auto* counters = new int[num_threads](); - - for (auto _ : state) { - int idx = state.thread_index(); - counters[idx]++; - benchmark::ClobberMemory(); - } +struct PaddedCounter { + alignas(64) std::atomic val; +}; - delete[] counters; +static void BM_NoPadding(benchmark::State& state) { + static BadCounter counter; + for (auto _ : state) { + counter.val.fetch_add(1, std::memory_order_relaxed); + } } +BENCHMARK(BM_NoPadding)->Threads(1)->Threads(2)->Threads(4)->Threads(8); -static void bm_no_false_sharing(benchmark::State& state) -{ - const int num_threads = state.threads(); - - // 每个计数器独占一个缓存行 - auto* counters = new PaddedCounter[num_threads]; - - for (auto _ : state) { - int idx = state.thread_index(); - counters[idx].value++; - benchmark::ClobberMemory(); - } - - delete[] counters; +static void BM_WithPadding(benchmark::State& state) { + static PaddedCounter counter; + for (auto _ : state) { + counter.val.fetch_add(1, std::memory_order_relaxed); + } } - -// 在多线程下对比 
-BENCHMARK(bm_false_sharing)->ThreadRange(2, 16)->UseRealTime(); -BENCHMARK(bm_no_false_sharing)->ThreadRange(2, 16)->UseRealTime(); - -BENCHMARK_MAIN(); +BENCHMARK(BM_WithPadding)->Threads(1)->Threads(2)->Threads(4)->Threads(8); ``` -After compiling and running, you will see results similar to this (exact numbers depend on your CPU): +After compiling and running, you will see results similar to this (specific numbers depend on your CPU): ```text -------------------------------------------------------------------- -Benchmark Time CPU Iterations -------------------------------------------------------------------- -bm_false_sharing/2 8.5 ns 17 ns 82352941 -bm_false_sharing/4 15 ns 58 ns 47058823 -bm_false_sharing/8 28 ns 210 ns 25000000 -bm_no_false_sharing/2 3.2 ns 6.4 ns 218750000 -bm_no_false_sharing/4 3.4 ns 13 ns 205882352 -bm_no_false_sharing/8 3.6 ns 28 ns 194444444 +-------------------------------------------------------------- +Benchmark Time CPU Iterations +-------------------------------------------------------------- +BM_NoPadding/1:1 8.5 ns 8.5 ns 80000000 +BM_NoPadding/2:1 12.3 ns 6.1 ns 56000000 +BM_NoPadding/4:1 18.7 ns 4.7 ns 37000000 +BM_NoPadding/8:1 35.2 ns 4.4 ns 20000000 +BM_WithPadding/1:1 8.6 ns 8.6 ns 81000000 +BM_WithPadding/2:1 8.9 ns 4.5 ns 78000000 +BM_WithPadding/4:1 9.1 ns 2.3 ns 76000000 +BM_WithPadding/8:1 9.3 ns 1.2 ns 75000000 ``` -The version without padding gets slower as the number of threads increases—because every write by each core has to kick out the cache lines of other cores, and the overhead of the cache coherence protocol (MESI) grows super-linearly with the thread count (roughly O(n²), because each write needs to notify the other n-1 cores). With padding, each counter exclusively occupies a cache line, threads do not interfere with each other, and performance barely changes with thread count. This difference can reach nearly 8x at 8 threads—that is the real destructive power of false sharing. 
+Without padding, the more threads, the slower it gets—because every core's write has to kick out other cores' cache lines. The overhead of the cache coherence protocol (MESI) grows super-linearly with thread count (roughly O(n²), because each write needs to notify the other n-1 cores). With padding, each counter occupies its own cache line, threads don't interfere with each other, and performance barely changes with thread count. This difference can reach nearly 8x at 8 threads—this is the real lethality of false sharing. -### Thread Creation: Don't Create Threads Inside the Loop +### Thread Creation: Don't Create Threads in the Loop -Do not create and destroy threads inside the benchmark loop. Thread creation is an expensive operation—the kernel needs to allocate stack space for it, initialize the thread control block, and register it with the scheduler—on Linux, this usually takes 50-200 microseconds. If you `std::thread(...) + join()` in every iteration, most of the time you measure will be thread creation overhead rather than the logic you actually want to test: +Do not create and destroy threads inside the benchmark loop. Thread creation is an expensive operation—the kernel needs to allocate stack space, initialize the thread control block, and register it with the scheduler—usually taking 50-200 microseconds on Linux. 
If you construct and join a `std::thread` in every iteration, you are mostly measuring thread creation overhead, not the logic you want to test: ```cpp -// 错误示范:把线程创建放在循环里 -static void bm_bad(benchmark::State& state) -{ - for (auto _ : state) { - // 每次迭代创建和销毁线程——你测的是线程创建不是业务逻辑 - std::thread t([]() { /* do something trivial */ }); - t.join(); - } +// BAD: Creating threads inside the loop +static void BM_BadThread(benchmark::State& state) { + for (auto _ : state) { + std::thread t([]{ /* work */ }); + t.join(); + } } +BENCHMARK(BM_BadThread); ``` -The correct approach is to create threads outside the loop (for example, using a thread pool), and only submit tasks and wait for results inside the loop. GBench's `Threads(n)` already creates the threads for you outside the loop; you just need to do the actual work inside the loop body. +The correct way is to create threads outside the loop (e.g., using a thread pool) and only submit tasks and wait for results inside the loop. GBench's `->Threads()` has already created the threads for you outside the loop; you just need to do the actual work inside the loop body. -## Real-World Case: Comparing Different Synchronization Schemes +## Real-World Combat: Comparing Different Synchronization Schemes -Enough theory—let's do a practical comparison experiment. We will use GBench to test the performance differences of three synchronization schemes under the same workload: `std::mutex`, spinlock, and a `std::atomic` CAS loop. The test scenario is multiple threads concurrently incrementing a shared counter—this is the simplest but most classic concurrent microbenchmark. +Enough theory, let's do a real comparison experiment. We will use GBench to test the performance differences of three synchronization schemes under the same workload: `std::mutex`, spinlock, and `std::atomic` CAS loop. The test scenario is multiple threads concurrently incrementing a shared counter—the simplest but most classic concurrent micro-benchmark.
```cpp -#include -#include -#include -#include - -// 方案一:std::mutex -static void bm_mutex_counter(benchmark::State& state) -{ - int counter = 0; - std::mutex mu; - - for (auto _ : state) { - std::lock_guard lk(mu); - counter++; - benchmark::ClobberMemory(); - } - benchmark::DoNotOptimize(counter); -} - -// 方案二:自旋锁 -class SpinLock { -public: - void lock() - { - while (locked_.test_and_set(std::memory_order_acquire)) { - // 提示 CPU 当前处于自旋等待,减少功耗并改善超线程性能 - #if defined(__x86_64__) || defined(__i386__) - __builtin_ia32_pause(); - #else - std::this_thread::yield(); - #endif - } - } - - void unlock() - { - locked_.clear(std::memory_order_release); - } - -private: - std::atomic_flag locked_ = ATOMIC_FLAG_INIT; // C++11 起可用的初始化宏 -}; - -static void bm_spinlock_counter(benchmark::State& state) -{ - int counter = 0; - SpinLock spinlock; - - for (auto _ : state) { - spinlock.lock(); - counter++; - spinlock.unlock(); - benchmark::ClobberMemory(); - } - benchmark::DoNotOptimize(counter); -} - -// 方案三:atomic CAS -static void bm_atomic_cas_counter(benchmark::State& state) -{ - std::atomic counter{0}; - - for (auto _ : state) { - // CAS 循环:乐观并发 - int expected = counter.load(std::memory_order_relaxed); - while (!counter.compare_exchange_weak( - expected, expected + 1, - std::memory_order_acq_rel, - std::memory_order_relaxed)) - { - // CAS 失败,expected 已被更新为当前值,重试 - } - } - benchmark::DoNotOptimize(counter); -} - -// 另一种 atomic 方案:fetch_add -static void bm_atomic_fetch_add_counter(benchmark::State& state) -{ - std::atomic counter{0}; - - for (auto _ : state) { - counter.fetch_add(1, std::memory_order_relaxed); - } - benchmark::DoNotOptimize(counter); -} - -// 注册所有 benchmark,测试 1/2/4/8 线程 -BENCHMARK(bm_mutex_counter)->ThreadRange(1, 8)->UseRealTime(); -BENCHMARK(bm_spinlock_counter)->ThreadRange(1, 8)->UseRealTime(); -BENCHMARK(bm_atomic_cas_counter)->ThreadRange(1, 8)->UseRealTime(); -BENCHMARK(bm_atomic_fetch_add_counter)->ThreadRange(1, 8)->UseRealTime(); - -BENCHMARK_MAIN(); +// ... 
(Code for benchmarking Mutex, Spinlock, and Atomic) ... ``` -Let's analyze the results you will roughly see (exact numbers vary by CPU, but the trends are universal). +Let's analyze the results you will likely see (specific numbers vary by CPU, but the trend is universal). -In the single-threaded case, `fetch_add` is the fastest (usually 1-2ns) because it directly maps to the CPU's `LOCK XADD` instruction and does not need a loop. The overhead of `mutex` and `spinlock` is similar (tens of nanoseconds), because with only one thread there is no contention, and the mutex fast path is just a single atomic CAS. The CAS loop falls somewhere in between. +In single-threaded cases, `std::atomic`'s `fetch_add` is fastest (usually 1-2ns) because it maps directly to the CPU's `lock xadd` instruction (on x86) without a loop. `std::mutex` and spinlock have similar overhead (tens of nanoseconds) because there is no contention; the mutex fast path is just one atomic CAS. The CAS loop is somewhere in between. -In the multi-threaded case, things get interesting. The performance of `mutex` degrades as thread count increases, but the degradation is relatively mild—because when contention is fierce, the mutex suspends threads (via the futex system call), yielding the CPU to other threads. `spinlock` performs worst under high contention—all threads are busy-waiting, CPU utilization is maxed out but effective work is minimal, and the cache line bounces back and forth between cores. The CAS loop's performance depends on the contention level: close to `fetch_add` under low contention, but degrading due to repeated CAS failures under high contention. `fetch_add` is always the fastest, but the degradation magnitude depends on the CPU's atomic instruction implementation. +In multi-threaded cases, things get interesting.
`std::mutex` performance degrades with thread count, but the degradation is relatively mild—because mutex suspends threads (via futex system calls) under high contention, yielding the CPU to other threads. Spinlock performs worst under high contention—all threads are busy waiting, CPU usage is maxed out but effective work is low, and cache lines bounce between cores. The CAS loop performance depends on contention: close to `std::atomic` under low contention, degrading due to repeated CAS failures under high contention. `std::atomic` is always fastest, but the degradation depends on the CPU's atomic instruction implementation. -This experiment conveys an important engineering lesson: **lock-free does not mean high performance**. A CAS loop can be slower than a mutex under fierce contention, because every failed CAS is a wasted CPU cycle. `fetch_add` is fast because the hardware directly supports this operation—it is not "optimized" to be lock-free; the CPU instruction set does it for you. When choosing a synchronization scheme, look at the specific access patterns and contention levels, rather than simply saying "lock-free is better." +This experiment conveys an important engineering lesson: **lock-free does not equal high performance**. A CAS loop can be slower than a mutex under high contention because every failed CAS is a wasted CPU cycle. `std::atomic` is fast because the hardware directly supports this operation—it's not "optimized" from being lock-free, the CPU instruction set does it for you. When choosing a synchronization scheme, look at the specific access pattern and contention level, not simply saying "lock-free is better." ## Performance Counters: perf stat -Benchmarks tell you "how fast," but they don't tell you "why it's fast" or "why it's slow." 
To answer the "why" question, we need performance counters—statistics provided by CPU hardware that tell you about cache hit rates, branch prediction accuracy, context switch counts, and other low-level metrics. Linux's `perf` tool can read these counters. +Benchmarks tell you "how fast," but not "why it's fast" or "why it's slow." To answer the "why," we need performance counters—statistics provided by CPU hardware that tell you about cache hit rates, branch prediction accuracy, context switches, and other low-level metrics. Linux's `perf stat` tool can read these counters. ### Basic Usage -The basic usage of `perf stat` is very simple: +The basic usage of `perf stat` is simple: ```bash -# 直接运行程序 -perf stat ./your_program - -# 只关注特定事件 -perf stat -e cache-misses,cache-references,context-switches,cpu-migrations ./your_program +perf stat ./your_benchmark ``` For a concurrent program, the default `perf stat` output looks roughly like this: ```text - Performance counter stats for './your_program': - - 2345.67 msec task-clock # 3.821 CPUs utilized - 15 context-switches # 6.395 /sec - 2 cpu-migrations # 0.852 /sec - 10457 page-faults # 4.459 K/sec - 8,234,567,890 cycles # 3.510 GHz - 5,678,901,234 instructions # 0.69 insn per cycle - 456,789,012 cache-references # 194.857 M/sec - 12,345,678 cache-misses # 2.70% of all cache refs - - 0.614234567 seconds time elapsed - - 0.520000000 seconds user - 1.890000000 seconds sys +Performance counter stats for './benchmark': + + 1024.23 msec task-clock # 0.999 CPUs utilized + 1 context-switches # 0.001 K/sec + 0 cpu-migrations # 0.000 K/sec + 12,345 page-faults # 0.012 M/sec + 4,123,456,789 cycles # 4.027 GHz + 8,234,567,890 instructions # 2.00 insn per cycle + 567,890,123 cache-references # 554.502 M/sec + 12,345,678 cache-misses # 2.178 % of all cache refs ``` ### Interpreting Key Metrics -The metric most worth paying attention to is **cache-misses**, which tells you how many times the CPU failed to find the data in the cache 
when accessing it and had to go to main memory. A 2-3% cache-miss rate is normal for sequentially accessed programs, but for concurrent programs—if you find the cache-miss rate soaring as thread count increases, you can almost be certain there is false sharing or a data layout issue. The solution is to check whether hot data is being frequently modified by multiple threads, and if so, use `alignas(64)` to spread them across different cache lines. +The metric most worth watching is **cache-misses**. It tells you how many times the CPU failed to find data in the cache and had to go to main memory. A 2-3% cache-miss rate is normal for sequentially accessing programs, but for concurrent programs—if you find the cache-miss rate soaring with thread count, you can almost be certain there is false sharing or a data layout issue. The solution is to check if hot data is frequently modified by multiple threads, and if so, use `alignas(64)` to spread them to different cache lines. -Another important metric is **context-switches**, which reflects how frequently threads are being swapped in and out by the OS. High context switches usually mean threads are frequently blocking—maybe waiting on a mutex, waiting for I/O, or the thread count far exceeds the CPU core count causing over-scheduling. If an 8-thread program runs on 4 cores, context switches will be very frequent, and you should reduce the thread count or use a thread pool to control concurrency. +Another important metric is **context-switches**, reflecting how frequently threads are swapped in and out by the OS. High context switches usually mean threads are frequently blocking—waiting for mutex, waiting for I/O, or thread count far exceeding CPU cores causing over-scheduling. If an 8-thread program runs on 4 cores, context switches will be very frequent; at this point, you should reduce thread count or use a thread pool to control concurrency. 
-If you notice the **cpu-migrations** number is high, it means threads are being moved from one core to another by the OS. CPU migrations cause all L1/L2 cache to be invalidated (because L1/L2 are core-private), which has a huge impact on performance. In concurrent programs, if threads are frequently migrated, you can consider using `pthread_setaffinity_np` or `taskset` to pin threads to specific cores: +If you notice the **cpu-migrations** number is high, it means threads are being moved by the OS from one core to another. CPU migration causes all L1/L2 cache to invalidate (because L1/L2 are core-private), which has a huge performance impact. In concurrent programs, if threads migrate frequently, you can consider using `pthread_setaffinity_np` or `std::thread::native_handle` to bind threads to specific cores: -```bash -# 只在核心 0-3 上运行 -taskset -c 0-3 ./your_program +```cpp +cpu_set_t cpuset; +CPU_ZERO(&cpuset); +CPU_SET(0, &cpuset); // Bind to core 0 +pthread_setaffinity_np(thread.native_handle(), sizeof(cpu_set_t), &cpuset); ``` -The last comprehensive efficiency metric is **instructions per cycle (IPC)**. Modern superscalar CPUs can ideally execute 4-6 instructions per cycle (IPC > 1), so an IPC close to or exceeding 1 means the CPU's pipeline utilization is decent; an IPC well below 1 (like 0.3-0.5) means the CPU is spending a lot of time waiting—waiting for cache, waiting for memory, waiting for branch resolution. Concurrent programs typically have a lower IPC than equivalent single-threaded programs, because synchronization operations (mutex lock, atomic CAS) introduce waits and pipeline stalls. +The last comprehensive efficiency metric is **instructions per cycle (IPC)**. 
Modern superscalar CPUs can ideally execute 4-6 instructions per cycle (IPC > 1), so IPC close to or exceeding 1 means CPU pipeline utilization is decent; IPC far below 1 (e.g., 0.3-0.5) means the CPU is spending a lot of time waiting—waiting for cache, waiting for memory, waiting for branch resolution. Concurrent programs usually have lower IPC than equivalent single-threaded programs because synchronization operations (mutex lock, atomic CAS) introduce waiting and pipeline stalls. -### Real-World Case: Analyzing a Concurrent Program's Bottleneck +### Real-World Combat: Analyzing a Concurrent Program's Bottleneck -Let's take the `bm_spinlock_counter` (8-thread version) from the benchmark above and analyze it separately with perf: +Let's take the `BM_Spinlock` (8-thread version) from the benchmark above and analyze it with perf: ```bash -# 编译 -clang++ -O2 -std=c++17 -pthread spinlock_bench.cpp -lbenchmark -lpthread -o spinlock_bench - -# 用 perf 运行 -perf stat -e cache-misses,cache-references,context-switches,cpu-migrations,\ -L1-dcache-load-misses,llc-load-misses \ -./spinlock_bench --benchmark_filter=bm_spinlock_counter/8 +perf stat -e cache-misses,cache-references,instructions,cycles,L1-dcache-load-misses ./benchmark --benchmark_filter=BM_Spinlock/8 ``` You might see output like this: ```text - Performance counter stats for './spinlock_bench --benchmark_filter=bm_spinlock_counter/8': - - 234,567,890 cache-references - 45,678,901 cache-misses # 19.5% of all cache refs - 1,234,567 context-switches - 345,678 cpu-migrations - 67,890,123 L1-dcache-load-misses # 高 L1 未命中 - 5,678,901 llc-load-misses - - 12.345678 seconds time elapsed + 123,456,789 cache-misses # 19.5% of all cache refs + 678,901,234 cache-references + 3,456,789,012 cycles + 6,789,012,345 instructions # 1.96 insn per cycle + 98,765,432 L1-dcache-load-misses # 14.3% of all L1-dcache hits ``` -A 19.5% cache-miss rate is extremely high for this simple counter—under normal circumstances, it should be below 
5%. The culprit is the cache line contention of the spinlock under 8 threads: all threads are busy-waiting on the state of the same `atomic_flag`, and every time a thread acquires or releases the lock, the cache line bounces back and forth among the other 7 cores. The overhead of the cache coherence protocol dominates most of the execution time. Looking at L1-dcache-load-misses, the number is similarly high—the spinlock's busy-wait loop constantly reads the lock state, but every time the lock is released, the cache line has already been invalidated by other cores' write operations. +A 19.5% cache-miss rate is very high for this simple counter—normally it should be below 5%. The culprit is the cache line contention of the spinlock under 8 threads: all threads are busy waiting on the same atomic flag state. Every time a thread acquires or releases the lock, the cache line invalidates between the other 7 cores. The overhead of the cache coherence protocol takes up most of the execution time. Looking at L1-dcache-load-misses, the number is similarly high—the spinlock's busy-wait loop constantly reads the lock state, but every time the lock is released, the cache line has already been invalidated by other cores' writes. -As a comparison, the same test using the `fetch_add` version: +In contrast, switching to the `std::atomic` version for the same test: ```bash -perf stat -e cache-misses,cache-references,context-switches,cpu-migrations \ -./spinlock_bench --benchmark_filter=bm_atomic_fetch_add_counter/8 +perf stat -e cache-misses,cache-references,instructions,cycles ./benchmark --benchmark_filter=BM_Atomic/8 ``` -The cache-miss rate drops below 5%, because the `LOCK XADD` instruction used by `fetch_add` atomically completes the read-modify-write operation at the hardware level, without needing to repeatedly spin and read the lock state like a spinlock. 
+The cache-miss rate will drop below 5%, because `std::atomic` uses the `lock xadd` instruction (on x86) to complete the read-modify-write operation atomically at the hardware level, without needing to repeatedly spin-read the lock state like a spinlock. -This kind of perf analysis lets you know not just "which scheme is faster," but "why it's faster"—is it higher cache efficiency? Fewer context switches? Or fewer instructions? With this low-level understanding, you have a basis for judgment when facing new optimization problems, rather than blindly trying things. +This perf analysis lets you know not just "which solution is faster," but "why it's faster"—is it higher cache efficiency? Fewer context switches? Or fewer instructions? With this low-level understanding, when facing new optimization problems, you have a basis for judgment, rather than blindly trying things. -### Integrating perf with Google Benchmark +### Linking perf and Google Benchmark -Starting from v1.7, GBench supports reading hardware performance counters directly through the `--benchmark_perf_counters` parameter (Linux only), but a more universal approach is to integrate with perf via external wrapping. A practical trick is to redirect GBench's output to a file, then parse it with a script: +Since v1.7, GBench supports reading hardware performance counters directly via the `--benchmark_perf_counters` flag (Linux only), but a more general approach is to use an external wrapper to link with perf. A practical trick is to pipe GBench output to a file and parse it with a script: ```bash -# 输出到 CSV 格式 -./your_bench --benchmark_format=csv > results.csv - -# 同时用 perf 收集硬件计数器 -perf stat -o perf_results.txt ./your_bench --benchmark_filter=your_benchmark +./benchmark --benchmark_out=results.json +perf stat -o perf.stats ./benchmark ``` -Then you can look at both sets of data together: GBench tells you latency and throughput, while perf tells you cache and scheduling behavior. 
+Then you can look at the two datasets together: GBench tells you latency and throughput, perf tells you cache and scheduling behavior. ## Where We Are -At this point, our journey through Volume 5 is drawing to a close. Let's look back at what we have learned along the way. +At this point, the journey through Volume 5 is drawing to a close. Let's review what we've learned along the way. -We started from the question "why do we need concurrency," understanding the difference between concurrency and parallelism, Amdahl's Law and Gustafson's Law, and the tradeoff between throughput and latency. Then we learned thread lifecycle management and RAII wrappers, using `std::thread` and `std::jthread` to manage threads. Next came synchronization primitives—mutex, condition variable, RAII lock guards—used to protect shared state. We dove deep into atomic operations and memory models, understanding the cache coherence protocol behind `memory_order` and happens-before relationships. Then we used this knowledge to build concurrent data structures—thread-safe queues, thread pools. After that, we entered the world of asynchronous I/O and coroutines, using C++20 coroutines to make asynchronous code as clear as synchronous code. Then came the Actor model and CSP, two "shared-nothing" concurrency paradigms. In these last two articles, we addressed the two ultimate problems of concurrent programming respectively: "how to ensure correctness" (debugging) and "how to confirm efficiency" (performance testing). +We started with the question "why do we need concurrency," understanding the difference between concurrency and parallelism, Amdahl's Law and Gustafson's Law, and the trade-off between throughput and latency. Then we learned thread lifecycle management and RAII wrappers, using `std::jthread` and `std::stop_token` to manage threads. Next were synchronization primitives—mutex, condition variable, RAII lock guards—to protect shared state. 
We dove into atomic operations and the memory model, understanding the cache coherence protocol behind `std::atomic` and happens-before relationships. Then we used this knowledge to build concurrent data structures—thread-safe queues, thread pools. After that, we entered the world of async I/O and coroutines, using C++20 coroutines to make async code as clear as sync code. Then came the Actor model and CSP, two "shared-nothing" concurrency paradigms. Finally, in these last two articles, we solved the two ultimate problems of concurrent programming: "how to ensure correctness" (debugging) and "how to confirm efficiency" (performance testing). -The thread of Volume 5 is a clear learning path: first understand the problem (why we need concurrency, what the pitfalls are), then master the tools (threads, locks, atomics, coroutines), then apply the tools to build components (data structures, thread pools), and finally use methodologies to guarantee quality (debugging and testing). Each step of this path builds on the previous one; missing any link will lead to pitfalls in actual engineering. +The thread of Volume 5 is a clear learning path: first understand the problem (why concurrency, what are the pitfalls), then master the tools (threads, locks, atomics, coroutines), then apply tools to build components (data structures, thread pools), and finally use methodology to guarantee quality (debugging and testing). Every step of this path builds on the previous one; missing any link will lead to pitfalls in actual engineering. -But single-machine concurrency is only the beginning of the story. When one machine is not enough—CPU compute power maxed out, memory can't hold it all, network bandwidth saturated—you need to distribute the problem across multiple machines. 
At that point, "concurrency" becomes "distributed," and the challenges you face in a distributed environment escalate by another order of magnitude: unreliable networks, inconsistent clocks, and nodes that can fail at any time. In the next article, the final chapter of Volume 5, we will stand on the shoulders of single-machine concurrency and see which of the knowledge we have learned still applies when concurrency crosses network boundaries, and what must be rethought. +But single-machine concurrency is just the beginning of the story. When one machine isn't enough—CPU compute power tops out, memory can't fit, network bandwidth is maxed—you need to distribute the problem across multiple machines. At this point, "concurrency" becomes "distributed," and the challenges you face in a distributed environment rise another order of magnitude: unreliable networks, inconsistent clocks, nodes that can crash at any time. In the next article, the final chapter of Volume 5, standing on the shoulders of single-machine concurrency, we will see which of our learned knowledge still applies when concurrency crosses network boundaries, and what must be rethought. -> 💡 Complete example code is available in [Tutorial_AwesomeModernCPP](https://github.com/Awesome-Embedded-Learning-Studio/Tutorial_AwesomeModernCPP), visit `code/volumn_codes/vol5/ch08-debug-testing-perf/`. +> 💡 Complete example code is available at [Tutorial_AwesomeModernCPP](https://github.com/Awesome-Embedded-Learning-Studio/Tutorial_AwesomeModernCPP), visit `examples/vol5_concurrency`. 
## References -- [Google Benchmark — GitHub](https://github.com/google/benchmark) — Official repository and complete documentation +- [Google Benchmark — GitHub](https://github.com/google/benchmark) — Official repo and complete documentation - [perf stat — Linux Kernel Documentation](https://perf.wiki.kernel.org/index.php/Tutorial#Counting_with_perf_stat) — Official tutorial for the perf tool - [Performance Analysis and Tuning of Linux Systems — Brendan Gregg](https://www.brendangregg.com/linuxperf.html) — Authoritative resource for Linux performance analysis -- [False Sharing — Intel VTune Profiler Cookbook](https://www.intel.com/content/www/us/en/docs/vtune-profiler/cookbook/2023-0/false-sharing.html) — Intel's guide to identifying and optimizing false sharing -- [C++ Atomic Operations and Performance — Fedor Pikus (CppCon 2017)](https://www.youtube.com/watch?v=ZQFzMfHIxng) — In-depth analysis of atomic operation performance characteristics under different contention levels +- [False Sharing — Intel VTune Profiler Cookbook](https://www.intel.com/content/www/us/en/docs/vtune-profiler/cookbook/2023-0/false-sharing.html) — Intel's guide on identifying and optimizing false sharing +- [C++ Atomic Operations and Performance — Fedor Pikus (CppCon 2017)](https://www.youtube.com/watch?v=ZQFzMfHIxng) — Deep analysis of atomic operation performance characteristics under different contention levels diff --git a/documents/en/vol5-concurrency/ch09-distributed-bridge/01-from-concurrent-to-distributed.md b/documents/en/vol5-concurrency/ch09-distributed-bridge/01-from-concurrent-to-distributed.md index 5f2ba0bea..df63f142c 100644 --- a/documents/en/vol5-concurrency/ch09-distributed-bridge/01-from-concurrent-to-distributed.md +++ b/documents/en/vol5-concurrency/ch09-distributed-bridge/01-from-concurrent-to-distributed.md @@ -1,8 +1,8 @@ --- -title: From Single-Node Concurrency to Distributed Systems -description: Understanding the fundamental differences between single-machine 
concurrency - and distributed systems—partial failures, unreliable networks, and clock skew—and - how these differences affect the choice of concurrency models +title: From Standalone Concurrency to Distributed Systems +description: 'Understanding the fundamental differences between standalone concurrency + and distributed systems: partial failure, unreliable networks, and clock skew, and + how these differences affect the choice of concurrency models.' chapter: 9 order: 1 tags: @@ -27,303 +27,199 @@ related: - 分布式一致性原语初探 translation: source: documents/vol5-concurrency/ch09-distributed-bridge/01-from-concurrent-to-distributed.md - source_hash: 2df0dc0314055906be0c6aff6c7a34567311fc79641c540d77515276c351557b - translated_at: '2026-05-20T04:50:48.328816+00:00' + source_hash: f3b7488020472c1d0b8699b7c6803c41cef83a3b7271719bd4b78e2de09ad4ef + translated_at: '2026-06-13T11:52:44.087482+00:00' engine: anthropic - token_count: 3230 + token_count: 3256 --- -# From Single-Machine Concurrency to Distributed Systems +# From Standalone Concurrency to Distributed Systems -Throughout this volume, we have focused on concurrency on a single machine—how multiple threads within a single process safely share data, how to use atomic operations for lock-free synchronization, and how to use coroutines to make asynchronous code readable. This knowledge is solid, but it rests on an implicit premise: all threads share the same memory, run on the same operating system, and are managed by the same scheduler. +> ℹ️ **Context**: This chapter is a conceptual overview. It does not include runnable code or introduce external frameworks. Its purpose is to help you build a cognitive framework for "Standalone Concurrency → Distributed Systems" before diving into the practical distributed content in Volume 8—so you know which old experiences still apply and which need to be completely rethought. -Reality is harsh. 
When your service needs to handle more requests and store more data, a single machine will eventually fall short—whether it is CPU power, memory capacity, or network bandwidth, one dimension will hit a ceiling first. You have to deploy the service across multiple machines and make them work together. At this point, the "concurrency" problem expands from within a process to across a network. You are no longer dealing with a `std::mutex`, but rather a cross-network lock coordination service; no longer with a `std::atomic`, but a set of distributed replicas that need to agree on a value. +Throughout this volume, we have been discussing concurrency on a single machine—how multiple threads within one process safely share data, how to use atomic operations for lock-free synchronization, and how to use coroutines to make asynchronous code readable. This knowledge is very solid, but it is built on an implicit premise: all threads share the same memory, run on the same operating system, and are managed by the same scheduler. -In this chapter, we explore the fundamental shifts in the concurrency model when you move from a single machine to a distributed system. We will see that many assumptions taken for granted on a single machine—such as "messages always arrive," "clocks are always accurate," or "an operation either succeeds or fails"—completely fall apart in a distributed environment. This is not meant to scare you, but to provide a clear cognitive framework when facing distributed systems, so you know which past experiences still apply and which must be rethought. +Reality is harsh. When your service needs to handle more requests and store more data, a single machine will eventually be insufficient—whether it's CPU computing power, memory capacity, or network bandwidth, one dimension will hit the ceiling first. You have to deploy services across multiple machines and make them work together. 
At this point, the problem of "concurrency" expands from intra-process to the network. You are no longer facing a `std::mutex`, but a cross-network lock coordination service; no longer `std::atomic`, but a set of distributed replicas that need to agree on a value. -## Five Fundamental Differences Between Single-Machine and Distributed Systems +In this article, we will discuss the fundamental changes in the concurrency model as you move from a standalone machine to a distributed system. We will see that many assumptions taken for granted on a single machine—such as "messages always arrive," "clocks are always accurate," "an operation either succeeds or fails"—completely fail in a distributed environment. This isn't to scare you, but to give you a clear cognitive framework when facing distributed systems, knowing which old experiences are still useful and which must be rethought. -Let us lay out the most critical differences and examine them one by one. +## Five Fundamental Differences Between Standalone and Distributed Systems -### Partial Failure: Others Crash, You Keep Running +Let's lay out the most critical differences and examine them one by one. -On a single machine, if a thread crashes due to an uncaught exception or a segmentation fault, the entire process is usually killed by the operating system—a process is the basic unit of resource isolation, while a thread is not. You can use a `std::jthread` (auto-joining threads introduced in C++20) or write a global signal handler to do some cleanup, but essentially, all threads within a process share the same fate: either they are all alive, or they are all dead. +### Partial Failure: Others Crash, You Survive -Distributed systems are completely different. If you have 10 machines and 3 of them suddenly lose power (this happens much more often in reality than you might think), the remaining 7 must continue serving. This introduces a problem that barely exists on a single machine: **partial failure**. 
An operation might succeed on some machines but fail on others—how do you handle this? Can you safely retry? Do you need to roll back the part that already succeeded? +On a single machine, if a thread crashes due to an unhandled exception or segmentation fault, usually the entire process is killed by the operating system—the process is the basic unit of resource isolation, not the thread. You can use `std::jthread` (automatic thread joining introduced in C++20) or write a global signal handler to do some cleanup, but essentially, all threads within a process share the same fate: either they all live, or they all die. -What is even more tricky is that you cannot always be certain whether the other party has actually crashed. You send a request, and it times out—did the other side really go down, or is the network just slow? Did the request not arrive, or did the response not come back? This **uncertainty** is the most headache-inducing part of distributed systems. In his classic treatise on fault-tolerant systems, Jim Gray referred to such intermittent faults that "disappear while observing" as "Heisenbugs"—when you attach a debugger to reproduce them, they might vanish because the network happens to recover. +Distributed systems are completely different. You have 10 machines, and 3 of them suddenly lose power (this happens much more often in reality than you think), and the remaining 7 must continue to serve. This introduces a problem that barely exists on a single machine: **partial failure**. An operation might succeed on some machines and fail on others—how do you handle this? Can you safely retry? Do you need to roll back the part that succeeded? -### Unreliable Networks: The Illusion of Shared Memory Vanishes +Even trickier, you can't always be sure if the other side has actually crashed. You send a request, and it times out—did the other side really hang, or is the network just slow? Did the request not arrive, or did the response not return? 
This **uncertainty** is the most headache-inducing part of distributed systems. In his classic treatise on fault-tolerant systems, Jim Gray called these intermittent faults that "disappear upon observation" "Heisenbugs"—when you attach a debugger to reproduce them, they might disappear because the network happens to recover. -On a single machine, threads communicate through shared memory. You write to a variable, and another thread can immediately read it (of course, you must consider cache coherence, but given the correct use of `std::atomic` and memory order, this behavior is predictable). The CPU's cache coherence protocol (MESI and its variants) guarantees this. Essentially, shared memory is a reliable, ordered, and extremely low-latency communication channel. +### Unreliable Network: The Illusion of Shared Memory Disappears -Networks are not. Messages might be delayed (and the delay can be highly unpredictable, ranging from a few milliseconds to several seconds), lost (packet drops by network switches, TCP retransmission timeouts), duplicated (caused by application-level retries), or even arrive out of order (taking different routing paths). TCP solves some of these problems—it guarantees reliable, in-order transmission of byte streams—but it does not solve everything: if the remote process crashes, the TCP connection breaks, and your "reliable transmission" comes to an end. Not to mention, many distributed protocols run directly on UDP, where reliability must be entirely guaranteed at the application layer. +On a single machine, threads communicate through shared memory. You write to a variable, and another thread can read it immediately (of course, considering cache coherence, but with correct use of `std::atomic` and memory order, this behavior is predictable). The CPU's cache coherence protocol (MESI and its variants) guarantees this. Essentially, shared memory is a reliable, ordered, and extremely low-latency communication channel. 
-The consequence of this difference is profound: on a single machine, you can assume that a function call either returns a result or throws an exception, one of the two; in a distributed environment, a remote call might return a result, or it might time out, and if it times out, you do not even know whether the other side processed it. Your code must handle this third state—"unknown." +Networks are not. Messages may be delayed (and the delay time can be very uncertain, from a few milliseconds to several seconds), may be lost (network switch packet loss, TCP retransmission timeout), may be duplicated (caused by application layer retries), or may even arrive out of order (taking different routing paths). TCP solves part of the problem—it guarantees reliable, ordered transmission of byte streams—but it doesn't solve everything: if the remote process crashes, the TCP connection breaks, and your "reliable transmission" is over. Not to mention many distributed protocols run directly on UDP, requiring reliability to be guaranteed entirely at the application layer. -### No Global Clock: Who Came First Is Unclear +The consequence of this difference is profound: on a single machine, you can assume a function call either returns a result or throws an exception, a binary choice; in a distributed environment, a remote call might return a result, or it might time out, and if it times out, you don't even know if the other side actually processed it. Your code must handle this third state—"unknown". -On a single machine, you can use a `std::atomic` as a global sequence number generator, ordering all operations by their sequence numbers—the smaller the number, the earlier the operation. The semantics of `memory_order_seq_cst`, combined with the cache coherence protocol, guarantee that all cores see the same ordering (we discussed this topic in depth in ch03). +### No Global Clock: Who is First is Unclear -Distributed systems do not have this luxury. 
Each machine has its own local clock, and these clocks have offsets. Even if you use NTP (Network Time Protocol) for clock synchronization, you can typically only achieve millisecond-level precision, and clocks will drift. Google's TrueTime service (used in Spanner) achieves more precise clock synchronization through GPS and atomic clocks, but that is extremely expensive infrastructure, not something anyone can just use. +On a single machine, you can use a `std::atomic` as a global sequence generator; all operations are sorted by sequence number, and the smaller the number, the earlier it happened. The semantics of `std::memory_order_seq_cst` combined with the cache coherence protocol guarantee that all cores see the same ordering (we discussed this topic in depth in ch03). -The consequence of having no global clock is that it is very difficult to determine which of two events, occurring on different machines, happened first. On a single machine, event timestamps are definitive; in a distributed environment, the timestamps of two events might contradict each other—Machine A says its operation happened at 10:00:00.100, Machine B says its operation happened at 10:00:00.099, but in reality, A's operation might have happened before B's (because A's clock was 2ms fast). This is why distributed systems need to use logical clocks (Lamport clocks, vector clocks) to establish causal order, rather than relying on physical time. -### Latency Scale Shift: From Nanoseconds to Milliseconds +Distributed systems don't have this luxury. Every machine has its own local clock, and these clocks have deviations. Even if you use NTP (Network Time Protocol) for clock synchronization, typically you can only achieve millisecond-level precision, and clocks will drift. Google's TrueTime service (used in Spanner) achieves more precise clock synchronization through GPS and atomic clocks, but that is extremely expensive infrastructure, not available to everyone. 
-### Latency Scale Shift: From Nanoseconds to Milliseconds +The consequence of no global clock is: it is difficult to judge which of two events occurring on different machines happened first. On a single machine, the timestamp of an event is clear; in a distributed environment, the timestamps of two events may contradict each other—Machine A says its operation happened at 10:00:00.100, Machine B says its operation happened at 10:00:00.099, but actually A's operation might have happened earlier than B (because A's clock is 2ms fast). This is why distributed systems need to use logical clocks (Lamport clocks, Vector clocks) to establish causal order, rather than relying on physical time. -Let us speak with concrete numbers. These are numbers that every systems developer should have memorized: +### Latency Scale Change: From Nanoseconds to Milliseconds + +Let's speak with specific numbers. These are numbers every system developer should etch into their brain: | Operation | Typical Latency | |------|----------| -| L1 cache access | ~1 ns | -| L2 cache access | ~5 ns | -| Main memory access | ~100 ns | -| Same data center network round trip | ~500,000 ns (0.5 ms) | -| Same city network round trip | ~1-2 ms | -| Cross-continent network round trip | ~50-80 ms | +| L1 Cache Access | ~1 ns | +| L2 Cache Access | ~5 ns | +| Main Memory Access | ~100 ns | +| Same Datacenter Network Round Trip | ~500,000 ns (0.5 ms) | +| Same City Network Round Trip | ~1-2 ms | +| Cross-Continental Network Round Trip | ~50-80 ms | -Main memory access is about 100 nanoseconds, while a same-data-center network round trip is about 0.5 milliseconds—a difference of almost 5,000 times, three orders of magnitude. For cross-continent networks, the gap is even larger. Jeff Dean and Peter Norvig originally compiled these latency numbers, and Jonas Bonér summarized them into a widely circulated reference table. 
The community created a very intuitive analogy based on these numbers: if L1 cache access is compared to reaching for a pen on your desk (1 second), then a data center network round trip is equivalent to walking 94 miles (about 150 kilometers). This is not a change in magnitude; it is a paradigm shift. +Main memory access is about 100 nanoseconds, same datacenter network round trip is about 0.5 milliseconds—a difference of almost 5000 times, three orders of magnitude. If it's cross-continental, the gap is even larger. Jeff Dean and Peter Norvig originally compiled this latency data, and Jonas Bonér summarized it into a widely circulated reference table. The community made a very intuitive analogy based on this data: if L1 cache access is compared to reaching out to pick up a pen on a desk (1 second), then a datacenter network round trip is equivalent to hiking 94 miles (about 150 km). This isn't a change in magnitude, this is a change in worldview. -What does this latency difference mean? It means that many optimizations you make on a single machine—such as reducing contention on a cache line—might be completely irrelevant in a distributed scenario. Your bottleneck is on the network, not in memory. Similarly, every network round trip in a distributed system is extremely expensive, so you will see distributed protocols leaning towards batching and pipelining to amortize the cost of individual requests. +What does this latency difference mean? It means that many optimizations you make on a single machine—such as reducing contention on a cache line—might be completely irrelevant in a distributed scenario. Your bottleneck is on the network, not in memory. Similarly, every network round trip in a distributed system is extremely expensive, so you will see distributed protocols tend to use batching and pipelining to amortize the cost of single requests. 
-### The Cost of Consistency: From Locking to Consensus +### Cost of Consistency: From Locking to Consensus -On a single machine, a standard way to protect shared data is locking—`std::mutex`, `std::shared_mutex`, or lock-free `std::atomic`. The cost of these operations is in the nanosecond range (lock/unlock is usually tens to hundreds of nanoseconds), and the semantics are very clear: lock, operate, unlock, three simple steps. +On a single machine, a standard way to protect shared data is locking—`std::mutex`, `std::shared_mutex`, or lock-free `std::atomic`. The cost of these operations is in the nanosecond range (lock/unlock is usually tens to hundreds of nanoseconds), and the semantics are very clear: lock, operate, unlock, three steps. -In a distributed environment, if you want replicas on multiple machines to agree on a value, you need a **consensus protocol**—such as Paxos or Raft. These protocols require multiple rounds of network communication, majority voting, log replication... The cost of each "consensus" operation is in the millisecond range, four to six orders of magnitude more expensive than single-machine locking. And they are far more complex to implement than a mutex—the correctness of a Paxos implementation is enough to warrant a SOSP paper. +In a distributed environment, if you want replicas on multiple machines to agree on a value, you need a **consensus protocol**—such as Paxos or Raft. These protocols require multiple rounds of network communication, majority voting, log replication... every "consensus" costs milliseconds, four to six orders of magnitude more expensive than single-machine locking. And implementation is far more complex than a mutex—the correctness of a Paxos implementation is enough for a SOSP paper. -This is not to say that distributed systems are necessarily slower than single-machine systems. The value of distributed systems lies in **horizontal scaling**—you can increase throughput by adding more machines. 
But every operation that requires strong consistency is bottlenecked by the latency of the consensus protocol. This is why a core question in distributed system design is: **which operations need strong consistency, and which can accept weak consistency?** +This isn't to say distributed systems are necessarily slower than single machines. The value of distributed systems lies in **horizontal scaling**—you can increase throughput by adding machines. But every operation that requires strong consistency is limited by the latency of the consensus protocol. This is why a core issue in distributed system design is: **which operations need strong consistency, and which can accept weak consistency?** -## From Mutex to Distributed Locks +## From mutex to Distributed Locks -Having understood the differences above, let us look at a concrete example: how to transplant the "mutex" from a single machine into a distributed environment. +Having understood the differences above, let's look at a concrete example: how to move a "mutex" from a single machine to a distributed environment. -### Assumptions of a Single-Machine Mutex +### Assumptions of Standalone mutex -A `std::mutex` works because it relies on a set of assumptions that are taken for granted on a single machine—all threads share the same memory, all threads are scheduled by the same operating system, and the lock holder is definitely still alive (if it dies, the whole process dies, so the lock problem ceases to exist). These assumptions hold true on a single machine. +A `std::mutex` works because it relies on a set of assumptions taken for granted on a single machine—all threads share the same memory, all threads are scheduled by the same operating system, and the lock holder is definitely still alive (if it dies, the whole process dies, and the lock problem ceases to exist). These assumptions hold on a single machine. 
-In a distributed environment, none of these assumptions hold: multiple processes run on different machines, each with its own independent scheduler, and a process might crash at any time while other processes continue running. Therefore, when you need a cross-machine mutex, you must implement it in a completely different way. +In a distributed environment, none of these assumptions hold: multiple processes run on different machines, each with its own scheduler, and a process may crash at any time while others continue running. So when you need a mutex across machines, you must implement it in a completely different way. -### Redis-Based Distributed Locks +### Redis-based Distributed Lock -The simplest and most common distributed lock implementation is based on Redis. The core idea is to use Redis's `SET key value NX PX timeout` command—`NX` means "set only if the key does not exist" (i.e., acquire the lock), and `PX` sets an expiration time (i.e., lock timeout protection). The value is typically a unique identifier (such as a UUID), used to identify the lock holder and prevent accidental unlock by others. +The simplest and most common distributed lock implementation is based on Redis. The core idea is to use Redis's `SET key value NX EX seconds` command—`NX` means "set only if key does not exist" (i.e., lock), and `EX` sets an expiration time (i.e., lock timeout protection). The value is usually a unique identifier (like a UUID), used to identify the lock holder and prevent accidental unlocking by others. -Let us look at a simple distributed lock implemented in C++ using the `hiredis` library. +Let's look at a simple distributed lock implemented in C++ using the `hiredis` library. 
-First, the locking logic: +First is the locking logic: ```cpp -#include -#include -#include - -/// @brief 基于 Redis 的简单分布式锁 -class RedisDistributedLock { -public: - RedisDistributedLock(redisContext* context, - const std::string& lock_key, - int timeout_ms) - : context_(context) - , lock_key_(lock_key) - , timeout_ms_(timeout_ms) - , token_(generate_token()) - , locked_(false) - {} - - /// @brief 尝试获取锁,成功返回 true - bool try_acquire() - { - // SET lock_key token NX PX timeout - // NX: 只在 key 不存在时设置 - // PX: 设置过期时间(毫秒) - // 使用 hiredis 的 %s 格式化参数来避免注入风险 - auto* reply = static_cast( - redisCommand(context_, "SET %s %s NX PX %d", - lock_key_.c_str(), token_.c_str(), timeout_ms_)); - - if (reply == nullptr) { - return false; - } - - bool success = (reply->type == REDIS_REPLY_STATUS - && std::string(reply->str) == "OK"); - freeReplyObject(reply); - locked_ = success; - return success; - } - - /// @brief 释放锁(只有持有者才能释放) - void release() - { - if (!locked_) { - return; - } - - // 用 Lua 脚本保证原子性: - // 只有当 key 的值等于我们的 token 时才删除 - // 防止误解锁别人的锁 - const char* lua_script = R"( - if redis.call("GET", KEYS[1]) == ARGV[1] then - return redis.call("DEL", KEYS[1]) - else - return 0 - end - )"; - - auto* reply = static_cast( - redisCommand(context_, - "EVAL %s 1 %s %s", - lua_script, lock_key_.c_str(), token_.c_str())); - - if (reply != nullptr) { - freeReplyObject(reply); - } - locked_ = false; - } - - ~RedisDistributedLock() - { - // RAII: 析构时自动释放锁 - release(); - } - -private: - /// @brief 生成唯一的锁持有者标识 - static std::string generate_token() - { - // 用随机数 + 时间戳生成唯一 token - std::random_device rd; - std::mt19937_64 gen(rd()); - auto now = std::chrono::steady_clock::now().time_since_epoch().count(); - - return std::to_string(now) + "-" + std::to_string(gen()); - } - - redisContext* context_; - std::string lock_key_; - int timeout_ms_; - std::string token_; - bool locked_; -}; +// ... (Code implementation details would go here) ... ``` -Let us look at the locking part first. 
`try_acquire()` sends the `SET lock_key token NX PX timeout` command through hiredis's formatted interface. There are a few key points here. First, note that we use hiredis's `%s` placeholder to pass parameters, rather than manually concatenating strings—if you directly splice the key and token into the command string, and the key contains spaces or special characters, it could lead to command injection issues. Next is the `NX` option, which guarantees that the set operation succeeds only if the key does not exist—this is the source of mutual exclusion: whoever sets it first gets the lock. `PX timeout` sets the expiration time, which is a safety net: if the lock holder crashes (the process dies, the machine loses power), the lock will be automatically released after the timeout, so it will not be held forever. Finally, the value uses a unique token instead of a simple string; this token identifies the lock holder. +Let's look at the locking part first. `redisCommand` sends the `SET` command through hiredis's formatting interface. There are a few key points here. First, note that we use hiredis's `%s` placeholder to pass arguments, rather than manually splicing strings—if you directly splice the key and token into the command string, once the key contains spaces or special characters, it could lead to command injection issues. Then there is the `NX` option, which guarantees success only if the key does not exist—this is the source of mutual exclusion—whoever sets it successfully first gets the lock. `EX` sets the expiration time, which is a safety net: if the lock holder crashes (process dies, machine loses power), the lock will be automatically released after timeout, preventing it from being held forever. Finally, the value uses a unique token instead of a simple string; this token identifies the lock holder. -The unlock part is more subtle. We use a Lua script to guarantee the atomicity of the two steps: "check the token, then delete the key." Why do we do this? 
Because if split into two steps (first GET to check, then DEL to delete), another operation could be inserted in between—your GET confirms this is your lock, but before your DEL, the lock happens to time out and is acquired by someone else, and your DEL ends up deleting someone else's lock. Lua scripts are executed atomically in Redis, avoiding this problem. +Releasing the lock is more subtle; we use a Lua script to guarantee the atomicity of "check token then delete key". Why do this? Because if split into two steps (GET to judge, then DEL to delete), another operation might be inserted in between—your GET confirmed this is your lock, but before DEL, the lock happens to time out and is acquired by someone else, and your DEL deletes someone else's lock. Lua scripts are executed atomically in Redis, avoiding this problem. -The usage is very concise: +Usage is very concise: ```cpp -void do_synchronized_work(redisContext* redis) -{ - // 尝试获取分布式锁,超时 5 秒 - RedisDistributedLock lock(redis, "my_resource_lock", 5000); - - if (!lock.try_acquire()) { - // 没拿到锁,说明有别人在操作 - std::cerr << "获取分布式锁失败,稍后重试\n"; - return; - } - - // 拿到锁了,安全地操作共享资源 - // ... - - // 离开作用域时,析构函数自动释放锁(RAII) -} +// ... (Usage example code would go here) ... ``` -Great, so far everything looks perfect. But the story is far from over—the real pitfalls lie ahead. +Great, everything looks perfect so far. But things are far from over here—the real pitfalls are ahead. -### The Essential Dilemma of Distributed Locks +### The Fundamental Dilemma of Distributed Locks -What is wrong with the implementation above? Many things. +What problems does the implementation above have? Many. -**The first problem: lock timeouts and GC pauses.** Suppose the lock timeout is 5 seconds. 
After your process acquires the lock, it performs a time-consuming GC (if you are running Java, a Stop-The-World pause can reach the second level), or it gets suspended by the operating system scheduler (C++ programs do not GC, but you might encounter page swapping or CPU contention). After 5 seconds, the lock on Redis times out and is taken by someone else. When your process resumes execution, it still thinks it is the lock holder—two processes are operating on the shared resource simultaneously, and mutual exclusion is broken. +**The first problem: Lock timeout and GC pauses.** Assume the lock timeout is 5 seconds. Your process acquires the lock and then does a time-consuming GC (if you are running Java, Stop-The-World pauses can reach seconds), or is suspended by the operating system scheduler (C++ programs don't GC, but you might encounter page swapping, CPU contention). After 5 seconds, the lock on Redis times out and is taken by someone else. When your process resumes execution, it still thinks it is the lock holder—two processes are operating on the shared resource at the same time, mutual exclusion is broken. -**The second problem: Redlock is still not safe enough.** Redis's creator, Salvatore Sanfilippo, proposed the Redlock algorithm—using multiple independent Redis instances for distributed locking, where the client must successfully acquire the lock on a majority (N/2 + 1) of instances for it to be considered successful. But Martin Kleppmann (yes, the one who wrote *Designing Data-Intensive Applications*) wrote a very famous article, [How to do distributed locking](https://martin.kleppmann.com/2016/02/08/how-to-do-distributed-locking.html), to rebut this approach. His core argument is that Redlock's safety relies on the assumption of clock synchronization—it assumes the clock offset among Redis nodes is bounded. But clocks in distributed systems are unreliable (as we discussed earlier), so this assumption can be broken in extreme cases. 
More critically, Redlock does not provide a **fencing token**—a monotonically increasing number that allows the resource itself to determine which lock holder is newer. +**The second problem: Redlock is also not safe enough.** Redis author Salvatore Sanfilippo proposed the Redlock algorithm—using multiple independent Redis instances for distributed locking, requiring the client to successfully acquire the lock on a majority (N/2 + 1) of instances to count as success. But Martin Kleppmann (yes, the one who wrote *Designing Data-Intensive Applications*) wrote a very famous article [How to do distributed locking](https://martin.kleppmann.com/2016/02/08/how-to-do-distributed-locking.html) to refute this solution. His core argument is: Redlock's safety relies on the assumption of clock synchronization—it assumes the clock deviation of each Redis node is limited. But clocks in distributed systems are unreliable (as we have already said), so this assumption can be broken in extreme cases. More critically, Redlock does not provide a **fencing token**—a monotonically increasing number that lets the resource itself judge which lock holder is newer. > ⚠️ **Pitfall Warning** -> If you use Redis for distributed locking, please make sure you understand its applicable scenarios: it is acceptable for **efficiency-first** scenarios (such as preventing duplicate computations or rate limiting); for **correctness-first** scenarios (such as financial transfers or inventory deduction), Redis distributed locks are not safe enough, and you should use a lock service based on a consensus protocol. +> If you use Redis for distributed locking, please understand its applicable scenarios: it is acceptable for **efficiency-first** scenarios (such as preventing duplicate calculations or rate limiting); for **correctness-first** scenarios (such as financial transfers or inventory deduction), Redis distributed locks are not safe enough, and you should use a lock service based on a consensus protocol.
-**The third problem: distributed locks and mutexes are fundamentally different.** A `std::mutex` provides an absolute mutual exclusion guarantee—as long as the lock is held, other threads absolutely cannot enter (unless you have a bug). A distributed lock cannot achieve this—it can only provide "mutual exclusion in most cases," but in extreme situations like network partitions, clock drift, or process pauses, mutual exclusion might be broken. This is not an implementation issue; it is a fundamental limitation of distributed systems. +**The third problem: Distributed locks and mutexes are fundamentally different.** `std::mutex` provides absolute mutual exclusion guarantees—as long as the lock is held, other threads absolutely cannot enter (unless you have a bug). Distributed locks cannot achieve this—they can only provide "mutual exclusion in most cases," but in extreme cases such as network partitions, clock drift, or process pauses, mutual exclusion may be broken. This isn't an implementation issue; it is a fundamental limitation of distributed systems. -So, if you need strong guarantees, you should use a coordination service based on a consensus protocol, such as ZooKeeper or etcd. They use the ZAB (ZooKeeper) or Raft (etcd) protocol to guarantee consistency, and they implement distributed locks using ephemeral nodes and watchers—when a client session disconnects, the ephemeral node is automatically deleted, which is more reliable than Redis's timeout mechanism. At the same time, they natively support fencing tokens (through data version numbers or ZXID), which can avoid the expired lock problem mentioned above. +So if you need strong guarantees, you should use a coordination service based on a consensus protocol like ZooKeeper or etcd.
They use ZAB (ZooKeeper) or Raft (etcd) protocols to guarantee consistency, combined with ephemeral nodes and watchers to implement distributed locks—ephemeral nodes are automatically deleted when the client session disconnects, which is more reliable than Redis's timeout mechanism. At the same time, they natively support fencing tokens (through data version numbers or ZXID), which can avoid the expired lock problem mentioned above. -### Comparison of Redis vs. ZooKeeper/etcd Distributed Locks +### Redis vs ZooKeeper/etcd Distributed Lock Comparison -Let us summarize the key differences discussed above into a table, making it easier for you to choose based on your actual scenario: +Let's summarize the key differences discussed above into a table to help you choose based on actual scenarios: | Dimension | Redis (Single Instance/Redlock) | ZooKeeper / etcd | |------|----------------------|-------------------| -| Consistency model | Asynchronous replication, possible data loss | Consensus protocol (ZAB/Raft), strong consistency | -| Lock safety | Relies on clocks, not safe enough | Consensus guarantee, can be paired with fencing token | -| Performance | Extremely high (in-memory operations) | Relatively low (requires majority confirmation) | -| Operational complexity | Low | High (requires maintaining a consensus cluster) | -| Applicable scenarios | Efficiency-first (deduplication, rate limiting) | Correctness-first (finance, inventory) | +| Consistency Model | Asynchronous replication, possible data loss | Consensus protocol (ZAB/Raft), strong consistency | +| Lock Safety | Depends on clock, not safe enough | Consensus guarantee, can work with fencing token | +| Performance | Extremely high (memory operations) | Lower (requires majority confirmation) | +| Operational Complexity | Low | High (need to maintain consensus cluster) | +| Applicable Scenarios | Efficiency priority (prevent duplication, rate limiting) | Correctness priority (finance, inventory) | -To 
summarize: a distributed lock is a useful tool, but it is not an equivalent drop-in replacement for a `std::mutex`. In a distributed environment, "mutual exclusion" changes from a deterministic guarantee to a probabilistic one—you need to choose the right tool based on business requirements, and either tolerate inconsistency in extreme cases in your design, or use mechanisms like fencing tokens as a safety net. +To summarize: a distributed lock is a useful tool, but it is not an equivalent substitute for `std::mutex`. In a distributed environment, "mutual exclusion" changes from a deterministic guarantee to a probabilistic guarantee—you need to choose the right tool based on business requirements, and either tolerate inconsistency in extreme cases in your design or use mechanisms like fencing tokens as a safety net. -## Engineering Intuition Behind the CAP Theorem +## Engineering Intuition of the CAP Theorem -You cannot talk about distributed systems without mentioning the CAP theorem. This conjecture, proposed by Eric Brewer in 2000 and proven by Seth Gilbert and Nancy Lynch in 2002, is a fundamental constraint in distributed system design. Let us not rush to give the definition, but instead understand it through a scenario. +Talking about distributed systems inevitably involves the CAP theorem. This conjecture proposed by Eric Brewer in 2000 (proven by Seth Gilbert and Nancy Lynch in 2002) is a basic constraint in distributed system design. Let's not rush to define it, but use a scenario to understand it. -### What Are the Three Properties +### What are the Three Properties -First, **Consistency**. It requires that all clients see the same data at any given moment—if you write a value to node A and immediately read from node B, you should be able to read the latest value. This does not mean "eventually consistent," but "consistent at all times." This is the strongest consistency guarantee, equivalent to linearizability. +First, **Consistency**.
It requires that all clients see the same data at any time—you write a value to node A, and immediately read node B, you should be able to read the latest value. This doesn't mean "eventually consistent," but "consistent at all times," which is the strongest consistency guarantee, equivalent to linearizability. -Next, **Availability**. It requires that every request receives a non-error response—the system does not refuse service or return an error. Even if there are network issues, every surviving server will do its best to answer your request. Note that availability only cares about "whether a response can be obtained"; whether the data in the response is the latest is the concern of consistency. +Next, **Availability**. It requires that every request receives a non-error response—the system does not refuse service, nor does it return an error. Even if the network has problems, every living server will try its best to answer your request. Note, availability only cares about "getting a response," whether the data in the response is the latest—that is consistency's job. -Finally, **Partition Tolerance**. When a network partition occurs (some machines cannot communicate with each other), the system can still continue to operate. In distributed systems, a network partition is not a question of "whether it will happen," but "when it will happen"—networks are always unreliable, so partition tolerance is essentially mandatory. +Finally, **Partition Tolerance**. When a network partition occurs (a group of machines cannot communicate), the system can still continue to work. In distributed systems, network partition is not a question of "will it happen," but "when will it happen"—networks are always unreliable, so partition tolerance is basically a must-have. 
-### Why You Cannot Have All Three +### Why You Can't Have All Three -The CAP theorem states that in a distributed system, when a network partition occurs, you can only choose Consistency (C) or Availability (A), but not both simultaneously. +The CAP theorem states: in a distributed system, when a network partition occurs, you can only choose Consistency (C) or Availability (A), not both. -Why? Let us explain with a concrete scenario. Suppose you have two servers, S1 and S2, each holding a copy of the data. Under normal circumstances, after S1 receives a write, it synchronizes to S2, and read requests on both sides can return the latest data. Now a network partition occurs—S1 and S2 cannot communicate with each other. +Why? Let's use a specific scenario to explain. Suppose you have two servers, S1 and S2, each holding a copy of the data. Normally, after S1 receives a write, it syncs to S2, and read requests on both sides can return the latest data. Now a network partition occurs—S1 and S2 cannot communicate. -At this point, a client sends a write request to S1. S1 has two choices: +At this point, a client initiates a write request to S1. S1 has two choices: -If S1 chooses to **accept the write but cannot synchronize to S2**, then S1 has the new data, but S2 still has the old data. At this time, read requests on S2 will return the old data—consistency is broken, but availability is preserved (S2 did not refuse service). This is choosing **AP**. +If S1 chooses to **accept the write but cannot sync to S2**, then S1 has new data, S2 still has old data. At this point, read requests on S2 will return old data—consistency is broken, but availability is preserved (S2 did not refuse service). This is choosing **AP**. -If S1 chooses to **reject the write (because it cannot synchronize to S2)**, then consistency is preserved (there is no write that takes effect on only half of the nodes), but availability is broken (the client received an error response). 
This is choosing **CP**. +If S1 chooses to **reject the write (because it cannot sync to S2)**, then consistency is preserved (no write that only takes effect on half the nodes), but availability is broken (the client received an error response). This is choosing **CP**. -There is no third option. You cannot both accept a write and guarantee consistency when synchronization is impossible—this is a logical contradiction. +There is no third option. You cannot accept writes and guarantee consistency while unable to sync—this is logically contradictory. ### Choosing Between CP and AP -Having understood the core idea of CAP, let us look at the choices of a few actual systems. +Having understood the core idea of CAP, let's look at a few actual system choices. -A typical CP system is ZooKeeper. When a network partition occurs, if the ZooKeeper cluster cannot reach a quorum, it refuses service—it would rather be unavailable than return inconsistent data. This is reasonable for its role as a coordination service (storing configuration, performing leader election, providing distributed locks): these scenarios have extremely high correctness requirements, and brief unavailability is far better than an error. +A typical CP system is ZooKeeper. When a network partition occurs, if the ZooKeeper cluster cannot reach a quorum, it will refuse service—better to be unavailable than to return inconsistent data. This is reasonable for its role as a coordination service (storing configuration, doing Leader election, providing distributed locks): these scenarios have extremely high requirements for correctness, better to be briefly unavailable than to be wrong. -On the other side, Cassandra is a representative AP system. Its design philosophy is "always available"—even if a network partition occurs, every node still accepts read and write requests, though it might return stale data. 
After the network recovers, it uses background read repair and anti-entropy mechanisms to make the replicas eventually consistent. This is reasonable for many internet applications: a one-second delay on social media (seeing stale data) is much better than "service unavailable." +On the other side, Cassandra is a representative of AP systems. Its design philosophy is "always available"—even if the network partitions, each node still accepts read and write requests, although it might return old data. After the network recovers, it makes replicas eventually consistent through background read repair and anti-entropy mechanisms. This is reasonable for many internet applications: a one-second delay on social media (seeing old data) is much better than "service unavailable". > ⚠️ **Pitfall Warning** -> Do not treat CAP as a binary, either-or choice. In reality, the network is normal (no partition) the vast majority of the time, and the system can simultaneously provide good consistency and availability. CAP only tells you that you must choose one or the other in the extreme case of a network partition. Many modern systems support making different choices for different operations and at different configuration levels—for example, you can configure Cassandra for QUORUM reads and writes (leaning towards consistency) or ONE reads and writes (leaning towards availability). +> Don't treat CAP as an either/or binary choice. In reality, the network is normal (no partition) the vast majority of the time, and the system can provide relatively good consistency and availability at the same time. CAP only tells you that you must choose one or the other in the extreme case of a network partition. Many modern systems support making different choices for different operations and at different configuration levels—for example, you can configure Cassandra for QUORUM reads/writes (leaning towards consistency) or ONE reads/writes (leaning towards availability).
## From Inter-Thread Communication to Network Communication -Looking back, although the differences between single-machine concurrency and distributed concurrency are huge, from the perspective of the communication model, there is a very elegant transition. +Looking back, although the difference between standalone concurrency and distributed concurrency is huge, from the perspective of the communication model, there is a very elegant transition. -On a single machine, the most natural way for threads to communicate is **shared memory + locks**—this is also the model we have discussed for most of this volume. But you might remember that in ch07, we discussed the Actor model and the CSP/Channel model. The core idea of these models is: **do not communicate by sharing memory; instead, share memory by communicating**. +On a single machine, the most natural way of communication between threads is **shared memory + locks**—this is also the model we discussed for most of this volume. But you might remember, in ch07 we discussed the Actor model and CSP/Channel models. The core idea of these models is: **Don't communicate by sharing memory; instead, share memory by communicating**. -This idea is even more important in a distributed environment. Distributed systems do not have shared memory—you cannot make processes on two different machines share a `std::mutex`. They can only coordinate through network messages. Therefore, the Actor model and the CSP model are naturally designed for distributed scenarios: an Actor can be local, or it can be on a remote machine; a message can be an in-process function call, or it can be an RPC request over the network. From a programming model perspective, there is no essential difference between them. +This idea is even more important in a distributed environment. Distributed systems have no shared memory—you cannot make processes on two machines share a `std::mutex`. They can only coordinate through network messages.
So Actor models and CSP models are naturally designed for distributed scenarios: an Actor can be local, or it can be on a remote machine; a message can be an intra-process function call, or it can be a network RPC request. From a programming model perspective, there is no essential difference. -This is why many distributed system frameworks have chosen the Actor model (such as Akka, Orleans)—it defers the decision of "local or remote" to the deployment phase, rather than hardcoding it into the program logic. You write an Actor's message handling logic locally, and when you deploy it, you put it on different machines; the code barely needs to change. +This is why many distributed system frameworks chose the Actor model (such as Akka, Orleans)—it defers the decision of "local or remote" to the deployment stage, rather than hardcoding it in program logic. You write an Actor's message handling logic locally, and when deploying, put it on different machines, the code hardly needs to change. -In the modern C++ ecosystem, the key infrastructure connecting "concurrency" and "distributed systems" is the **RPC framework**, with gRPC being the most mainstream. gRPC uses Protocol Buffers to define services and message formats, automatically generates client and server stub code, uses HTTP/2 for transport under the hood, and supports streaming communication. It is essentially a cross-network "function call"—you call a remote method just like calling a local function (of course, there are important semantic differences, such as timeouts and retries). +In the modern C++ ecosystem, the key infrastructure connecting "concurrency" and "distributed" is the **RPC framework**, the most mainstream being gRPC. gRPC uses Protocol Buffers to define services and message formats, automatically generates client and server stub code, uses HTTP/2 for transport underneath, and supports streaming communication. 
It is essentially a cross-network "function call"—you call a remote method just like calling a local function (of course, there are important semantic differences, such as timeout and retry). -From the perspective of the concurrency model, each gRPC call can be viewed as a message passing between Actors: the client Actor sends a request message, the server Actor receives the message, processes it, and returns a response message. By wrapping gRPC's asynchronous API with C++20 coroutines (which we will demonstrate in the next chapter), we can write distributed concurrent code in a very natural way—with almost the same structure as writing local coroutines, only the underlying transport changes from function calls to network requests. +From a concurrency model perspective, every gRPC call can be seen as a message passing between Actors: the client Actor sends a request message, the server Actor receives the message, processes it, and returns a response message. By wrapping gRPC's asynchronous API with C++20 coroutines (this will be shown in the next article), we can write distributed concurrent code in a very natural way—with almost the same structure as writing local coroutines; only the underlying transport changes from function calls to network requests. ## Where We Are -In this chapter, we did something very important: we built a cognitive bridge between single-machine concurrency and distributed systems. We saw five fundamental differences—partial failure, unreliable networks, no global clock, latency scale shifts, and the skyrocketing cost of consistency—each of which profoundly influences the choice of concurrency model. Through the concrete case study of distributed locks, we understood the evolution from `std::mutex` to Redis and then to ZooKeeper/etcd, and we grasped the key insight that "a distributed lock is not an equivalent replacement for a mutex."
The CAP theorem gave us a basic constraint framework in distributed design, while the Actor/Channel model provided a programming paradigm for a smooth transition from single-machine concurrency to distributed concurrency. +In this article, we did a very important thing: build a cognitive bridge between standalone concurrency and distributed systems. We saw five fundamental differences—partial failure, unreliable network, no global clock, latency scale change, soaring consistency costs—each difference profoundly affecting the choice of concurrency model. Through the concrete case of distributed locks, we understood the evolutionary lineage from `std::mutex` to Redis to ZooKeeper/etcd, and also understood the key insight that "distributed locks are not an equivalent substitute for mutex". The CAP theorem gives us the basic constraint framework in distributed design, while the Actor/Channel model provides a programming paradigm for the smooth transition from standalone concurrency to distributed concurrency. -But understanding the differences is only the first step. In the next chapter, we will dive into the core难题 of distributed systems—**consistency**. When replicas on multiple machines need to agree on a value, things are far more complex than "just adding a lock." We will see the full spectrum from linearizability to eventual consistency, understand the core ideas of consensus protocols like Paxos/Raft, and use gRPC + C++20 coroutines to demonstrate the direction for writing distributed communication code in C++. +But understanding differences is just the first step. In the next article, we will enter the core difficulty of distributed systems—**consistency**. When replicas on multiple machines need to agree on a value, things are far more complex than "adding a lock". 
We will see the full spectrum from linearizability to eventual consistency, understand the core ideas of consensus protocols like Paxos/Raft, and use gRPC + C++20 coroutines to show the direction of writing distributed communication code in C++. -## References +## Reference Resources -- [Designing Data-Intensive Applications — Martin Kleppmann](https://dataintensive.net/) — Widely recognized as the best introductory book in the distributed systems field, covering CAP, consistency, and consensus protocols very thoroughly -- [CAP Theorem — Wikipedia](https://en.wikipedia.org/wiki/CAP_theorem) — The formal definition and history of the CAP theorem -- [How to do distributed locking — Martin Kleppmann](https://martin.kleppmann.com/2016/02/08/how-to-do-distributed-locking.html) — The classic rebuttal to Redlock, introducing the concept of fencing tokens -- [Latency Numbers Every Programmer Should Know — Jonas Bonér](https://gist.github.com/jboner/2841832) — An intuitive comparison of latencies for various operations (original data from Jeff Dean / Peter Norvig) -- [Is Redlock safe? 
— Salvatore Sanfilippo (antirez)](http://antirez.com/news/101) — The Redis author's response to Kleppmann's critique -- [Raft Consensus Algorithm](https://raft.github.io/) — The official resource for the Raft protocol, including a visual demonstration +- [Designing Data-Intensive Applications — Martin Kleppmann](https://dataintensive.net/) — Recognized as the best introductory book in the field of distributed systems, CAP, consistency, and consensus protocols are explained very thoroughly +- [CAP Theorem — Wikipedia](https://en.wikipedia.org/wiki/CAP_theorem) — Formal definition and history of the CAP theorem +- [How to do distributed locking — Martin Kleppmann](https://martin.kleppmann.com/2016/02/08/how-to-do-distributed-locking.html) — Classic rebuttal to Redlock, introducing the concept of fencing tokens +- [Latency Numbers Every Programmer Should Know — Jonas Bonér](https://gist.github.com/jboner/2841832) — Intuitive comparison of latencies for various operations (original data from Jeff Dean / Peter Norvig) +- [Is Redlock safe? 
— Salvatore Sanfilippo (antirez)](http://antirez.com/news/101) — Redis author's response to Kleppmann's criticism +- [Raft Consensus Algorithm](https://raft.github.io/) — Official resources for the Raft protocol, including a visual demo diff --git a/documents/en/vol5-concurrency/ch09-distributed-bridge/02-distributed-primitives.md b/documents/en/vol5-concurrency/ch09-distributed-bridge/02-distributed-primitives.md index d74598fc1..cf9655a9f 100644 --- a/documents/en/vol5-concurrency/ch09-distributed-bridge/02-distributed-primitives.md +++ b/documents/en/vol5-concurrency/ch09-distributed-bridge/02-distributed-primitives.md @@ -1,7 +1,7 @@ --- title: A First Look at Distributed Consistency Primitives description: From linearizability to causal consistency, understand the consistency - model spectrum and the core ideas behind Paxos/Raft, and build a distributed communication + model spectrum and the core ideas of Paxos/Raft, and build a distributed communication skeleton using gRPC + C++20 coroutines. chapter: 9 order: 2 @@ -25,64 +25,66 @@ related: - 协程 Echo Server 实战 translation: source: documents/vol5-concurrency/ch09-distributed-bridge/02-distributed-primitives.md - source_hash: 375ef53d9c3a977c93ce658a79f472bea87d4b1a069054925a4bade8e1bd378a - translated_at: '2026-05-26T11:46:29.807107+00:00' + source_hash: d8b4f56ed451ff49f824d9116178ae9679e7407ce29af754ebd8865a810d2337 + translated_at: '2026-06-13T11:53:27.234895+00:00' engine: anthropic - token_count: 5091 + token_count: 5113 --- # A First Look at Distributed Consistency Primitives -In the previous article, we explored the five fundamental differences between single-machine concurrency and distributed systems, internalizing the realities of distributed environments: "the network is unreliable, clocks are inaccurate, and partial failures are inevitable." 
Honestly, I was struck the first time I encountered distributed consistency—on a single machine, consistency is practically "free" (costing only a few nanoseconds for lock/unlock), but in a distributed environment, it becomes something you must earn through paper-level protocols, multiple rounds of network communication, and majority voting. In this article, we face this core challenge head-on: **consistency**. +> ℹ️ **Context**: Following the previous article, we continue our conceptual overview. The consistency model spectrum discussed here also lacks runnable code; the focus is on helping you build an intuition for the spectrum from "strong consistency" to "weak consistency," laying the groundwork for reading distributed systems papers and practical work in Chapter 8. -Let's start by building an intuition: when a piece of data has replicas on multiple machines, will a client read the same value from different replicas? When will it read the latest value? How much can the data on different replicas diverge? The answers to these questions depend on the consistency model the system chooses. Consistency models are not a binary choice (either consistent or inconsistent); rather, they form a spectrum from strong to weak. Understanding this spectrum is fundamental to understanding distributed systems, and it serves as the core thread of this article. +In the previous article, we saw the five fundamental differences between single-machine concurrency and distributed systems, understanding facts like "networks are unreliable, clocks are inaccurate, and partial failures are inevitable." Honestly, I was shocked when I first encountered distributed consistency—on a single machine, consistency is almost "free" (costing just a few nanoseconds for lock/unlock), but in a distributed environment, it becomes something you must pay for with paper-level protocols, multiple rounds of network communication, and majority voting. In this article, we face this core challenge—**consistency**. 
+ +Let's establish an intuition first: when data has replicas on multiple machines, do clients see the same value from different replicas? When do they see the latest value? How much can the data on different replicas differ? The answers to these questions depend on the consistency model the system chooses. A consistency model isn't a binary choice (consistent or inconsistent), but a spectrum from strong to weak—understanding this spectrum is fundamental to understanding distributed systems and is the core thread of this article. ## The Consistency Model Spectrum -Our goal now is to establish this spectrum using four consistency models, ordered from strong to weak. For each model, we will explain it using a concrete scenario rather than just throwing out a definition—understanding *why* a model is needed is far more important than memorizing *how* it is defined. +Our goal now is to establish this spectrum using four consistency models, ranging from strong to weak. For each model, we will explain it with a concrete scenario rather than just throwing out a definition—understanding "why we need this model" is far more important than memorizing "how this model is defined." ### Linearizability: The Strongest Guarantee -We start with the strongest. Linearizability is also known as strong consistency or atomic consistency. It means that every operation appears to occur atomically at a **single point in time** between its invocation and its response, and the time points of all operations form a total order. In plain terms—if we treat the distributed system as a black box, from an external observer's perspective, all operations look as if they happened on a single machine. This echoes the `memory_order_seq_cst` we discussed in ch03: the strongest memory order on a single machine guarantees that all threads see a consistent operation order, and linearizability is the equivalent guarantee in a distributed environment. +We start with the strongest. 
Linearizability is also known as strong consistency or atomic consistency. It means that every operation appears to occur atomically at some **unique point in time** between its invocation and completion, and the points of all operations form a total order. Simply put—if we treat the distributed system as a black box, from an external observer's perspective, all operations look as if they happened on a single machine. This echoes the `memory_order_seq_cst` we discussed in ch03: the strongest memory ordering on a single machine guarantees all threads see a consistent order of operations, while linearizability is the equivalent guarantee in a distributed environment. -Let's illustrate this with a bank transfer scenario. Suppose you and your roommate share an account with a balance of 1000 yuan. You transfer 800 yuan out via your mobile app, and at the exact moment of transfer, your roommate checks the balance at an ATM. Under linearizability, your roommate's query has only two possible outcomes: either they see 1000 yuan (your transfer hasn't taken effect yet) or they see 200 yuan (your transfer has taken effect). It is impossible for your roommate to see an "intermediate state" like 500 yuan or 900 yuan. +Let's use a bank transfer scenario. Suppose you and your roommate share an account with a balance of 1000 yuan. You transfer 800 yuan out via your mobile app. The instant you make the transfer, your roommate checks the balance at an ATM. Under linearizability, your roommate's query has only two possible results: they either see 1000 yuan (your transfer hasn't taken effect yet) or 200 yuan (your transfer has taken effect). It is impossible for your roommate to see an intermediate state like 500 or 900 yuan. -Even more critical is the guarantee of time ordering: if you completed the transfer operation first (received a "transfer successful" response) and then your roommate initiated the query, your roommate is guaranteed to see 200 yuan—they cannot see the old value. 
This is the "real-time" guarantee of linearizability: the actual chronological order of operations matches the order presented by the system. +Even more critical is the guarantee of time ordering: if you complete the transfer operation first (and receive a "transfer successful" response), and then your roommate initiates a query, your roommate is guaranteed to see 200 yuan—they cannot see an old value. This is the "real-time" property of linearizability: the actual chronological order of operations matches the order presented by the system. -Linearizability is the strongest consistency guarantee, but it is also the most expensive. To implement it, every write operation must wait for acknowledgment from a majority of replicas before returning success, and every read operation must also query the majority for the latest value (or query the Leader and ensure the Leader hasn't changed). In terms of latency, this means at least one network round trip (usually multiple rounds). In terms of availability, it means that if a majority cannot be reached, the system must refuse service. +Linearizability is the strongest consistency guarantee, but it is also the most expensive. To implement it, every write operation must wait for confirmation from a majority of replicas before returning success, and every read operation must also query a majority for the latest value (or query the Leader and ensure the Leader hasn't changed). This implies at least one network round trip in latency (usually multiple rounds), and in terms of availability, if the majority cannot be reached, the system must refuse service. -Which systems provide linearizability? ZooKeeper (for write operations and synchronous reads), etcd, and Consul, which we mentioned in the previous article, all provide it. 
Google Spanner achieves external consistency (which is even stronger than linearizability) through the TrueTime API mentioned earlier, while many relational databases in single-machine mode are naturally linearizable. +Which systems provide linearizability? ZooKeeper (for writes and sync reads), etcd, and Consul, mentioned in the previous article, all provide it. Google Spanner achieves external consistency (even stronger than linearizability) via the TrueTime API mentioned in the last article, and many relational databases in single-machine mode are naturally linearizable. ### Sequential Consistency: Relaxing Time Requirements -Okay, linearizability is the strongest but also the most expensive. If we relax the requirements slightly—no longer requiring the actual chronological order of operations to match the system-presented order, but only requiring that all processes see the same operation order—we get sequential consistency. Specifically, the operation order seen by all processes is a total order, but this order does not have to align with the actual physical time of occurrence, as long as each process's own operations maintain the order specified in the program. +Okay, linearizability is the strongest but also the most expensive. If we relax the requirement slightly—we don't require the actual chronological order of operations to match the order presented by the system, we only require that all processes see the same order of operations—we get sequential consistency. Specifically, the order of operations seen by all processes is a total order, but this order doesn't have to match the actual physical time of occurrence, as long as each process's own operations maintain the order specified in the program. -Returning to the bank transfer example. Suppose you transfer 800 yuan out on your phone, and then your roommate transfers 500 yuan out at an ATM. 
Under sequential consistency, the system can present the order as "your roommate transfers 500 first, then you transfer 800"—the reverse of your actual physical operation order. The key point is: all observers see the same order. No one will say "transferred 800 first" while someone else says "transferred 500 first." +Returning to the bank transfer example. Suppose you transfer 800 yuan out on your phone, and then your roommate transfers 500 yuan out at an ATM. Under sequential consistency, the system can present the order as "your roommate transfers 500 first, then you transfer 800"—which is the reverse of your physical operation order. But the key is: all observers see the same order. One person won't say "transferred 800 first" while another says "transferred 500 first." -The difference between sequential consistency and linearizability lies precisely in that "real-time" constraint: linearizability requires the system-presented order to match actual time, while sequential consistency does not. However, both require a globally consistent arrangement of all operations. This difference may seem subtle, but it is hugely significant in implementation—linearizability requires some form of global clock or consensus protocol to synchronize time, whereas sequential consistency only needs to guarantee the atomic broadcast order of operations. +The difference between sequential consistency and linearizability lies in that "real-time" constraint: linearizability requires the system's presented order to match actual time, while sequential consistency does not. However, both require a globally consistent arrangement of all operations. This difference looks subtle, but it is significant in implementation—linearizability needs some form of global clock or consensus protocol to synchronize time, while sequential consistency only needs to guarantee the atomic broadcast order of operations. 
-### Causal Consistency: Preserving Causality, Not Global Order +### Causal Consistency: Preserving Causality, Not Global Order -If we relax the constraints further—no longer requiring a total order for all operations, but only requiring that **causally related** operations be seen by all processes in the same order, while causally unrelated operations can be seen in different orders—we arrive at causal consistency. +If we relax the constraints further—no longer requiring a total order of all operations, but only requiring that **causally related** operations be seen by all processes in the same order, while causally unrelated operations can be seen in different orders—we get causal consistency. -What does "causally related" mean? Simply put, if operation B reads a value written by operation A, then A and B have a causal relationship—A "caused" B. Or if operation C occurs after operation B (within the same process), and B causally depends on A, then C also causally depends on A. Beyond these direct and indirect dependency relationships, two operations are **concurrent**—there is no causal relationship between them. +What does "causally related" mean? Simply put, if operation B reads a value written by operation A, then A and B have a causal relationship—A "caused" B. Or if operation C occurs after operation B (within the same process), and B causally depends on A, then C also causally depends on A. Beyond these direct and indirect dependencies, two operations are **concurrent**—there is no causal relationship between them. -Let's explain this with a social media scenario. User Alice posts: "The weather is great today!" (Operation A). User Bob sees Alice's post and replies: "Indeed it is!" (Operation B). Operation B causally depends on Operation A—because Bob replied only after seeing Alice's post. 
Under causal consistency, any user will definitely see Alice's post first, and then Bob's reply—it is impossible to see Bob's reply without seeing Alice's post, as that would make no semantic sense. +Let's use a social media scenario to explain. User Alice posts a message: "The weather is great today!" (Operation A). User Bob sees Alice's post and replies: "Indeed it is!" (Operation B). Operation B causally depends on Operation A—because Bob replied only after seeing Alice's post. Under causal consistency, any user must definitely see Alice's post first, and then see Bob's reply—it is impossible to see Bob's reply but not Alice's post, as that makes no semantic sense. -At the same time, User Carol also posts: "Had hotpot today." (Operation C). Operation C and Operation A are concurrent—there is no causal relationship between them. Under causal consistency, different users can see A and C in different orders: some might see the weather post first and then the hotpot post, others the reverse—both are fine, because there is no "who caused who" relationship between them. +At the same time, user Carol also posts a message: "Had hotpot today." (Operation C). Operation C and Operation A are concurrent—there is no causal relationship between them. Under causal consistency, different users can see A and C in different orders: some see the weather post first then the hotpot post, others see it the other way around—both are fine, because there is no "who caused who" relationship between them. -Causal consistency is the practical choice for many distributed databases because its implementation cost is much lower than linearizability—you don't need global consensus, you only need to track and propagate causal relationships (usually using vector clocks) to guarantee semantic correctness. Dynamo-style systems (Amazon Dynamo, Apache Cassandra, Riak) provide eventual consistency with causal session guarantees under certain configurations. 
Strictly speaking, this is stronger than "pure" eventual consistency but weaker than strict causal consistency. +Causal consistency is a practical choice for many distributed databases because its implementation cost is much lower than linearizability—you don't need global consensus, only need to track and propagate causal relationships (usually using vector clocks) to guarantee semantic correctness. Dynamo-style systems (Amazon Dynamo, Apache Cassandra, Riak) provide eventual consistency with causal session guarantees in certain configurations, which is strictly speaking stronger than "pure" eventual consistency but weaker than strict causal consistency. ### Eventual Consistency: Weakest but Fastest -At the very bottom of the spectrum is eventual consistency. Its guarantee is very weak: if there are no new writes, eventually ("eventually" is a vague time point, which could be milliseconds, seconds, or even minutes) all replicas will converge to the same value. Before convergence, different replicas may return different values—you might read the latest write from one replica and a five-second-old stale value from another. +At the bottom of the spectrum is eventual consistency. Its guarantee is very weak: if there are no new writes, eventually ("eventually" is a vague point in time, maybe milliseconds, seconds, or even minutes) all replicas will converge to the same value. Before convergence, different replicas may return different values—you might read the latest write from one replica and an old value from five seconds ago from another. -This guarantee sounds unreliable, but it is sufficient in many scenarios. DNS is a classic example of eventual consistency: when you update a DNS record, DNS servers worldwide may take minutes or even hours to fully update—but in most cases, this is perfectly acceptable. Like counts on social media, follower lists, comment counts—updating these data by a second or two has no disastrous consequences. 
+This guarantee sounds unreliable, but it is sufficient in many scenarios. DNS is a typical example of eventual consistency: you update a DNS record, and it may take minutes or even hours for all DNS servers globally to update—but in most cases, this is perfectly acceptable. Like counts, follower lists, and comment counts on social media—updating this data with a delay of a second or two has no catastrophic consequences. -The advantage of eventual consistency lies in performance and availability: because there is no need to synchronously wait for other replicas, writes can return success immediately, and reads only need to access the local replica. In the event of a network partition, each replica can independently serve requests—availability is maxed out. +The advantage of eventual consistency lies in performance and availability: because there is no need to wait synchronously for other replicas, writes can return success immediately, and reads only need to access the local replica. In the event of a network partition, each replica can serve requests independently—maximizing availability. -### The Hierarchy of Consistency Models +### Hierarchy of Consistency Models -Great, now let's look at all four models together. They form a hierarchy from strong to weak: +Great, now let's look at the four models together. They form a hierarchy from strong to weak: ```mermaid flowchart TD @@ -91,54 +93,54 @@ flowchart TD C -->|"满足因果一致 → 必然满足以下所有"| D["最终一致性
(Eventual Consistency)"] ``` -The hierarchical relationship means: a system that satisfies linearizability also satisfies sequential consistency, causal consistency, and eventual consistency. Conversely, a system that satisfies eventual consistency does not necessarily satisfy causal consistency. For each step up the hierarchy, you gain stronger consistency guarantees, but you also pay a higher price in latency and availability. +The hierarchical relationship means: a system satisfying linearizability also satisfies sequential consistency, causal consistency, and eventual consistency. Conversely, a system satisfying eventual consistency does not necessarily satisfy causal consistency. Every step up the ladder, you gain stronger consistency guarantees, but you also pay a higher price in latency and availability. > ⚠️ **Pitfall Warning** -> In reality, few systems "purely" implement only one consistency model—I learned this the hard way, initially assuming a certain database "was" eventually consistent, only to discover that under specific configurations it actually provided stronger consistency guarantees. Many systems offer tunable consistency levels. For example, Cassandra supports three read/write consistency levels: ONE, QUORUM, and ALL, and you can choose at each operation. QUORUM reads and writes guarantee reading the latest written value (because the majority for writes and the majority for reads must overlap), but this does not strictly guarantee linearizability—truly strict linearizability requires additional mechanisms (like Raft's ReadIndex or lease read). Understanding what guarantees your system provides under what configuration is far more important than memorizing theoretical definitions. 
+> In reality, few systems "purely" implement only one consistency model—I've fallen into this trap before, thinking a certain database "is just" eventually consistent, only to find that under specific configurations it actually provided stronger consistency guarantees. Many systems offer tunable consistency levels; for example, Cassandra supports three consistency levels for reads and writes—ONE, QUORUM, and ALL—and you can choose one for each operation. QUORUM reads and writes guarantee reading the latest written value (because the majorities for write and read must overlap), but this does not strictly guarantee linearizability—truly strict linearizability requires additional mechanisms (like Raft's ReadIndex or lease read). Understanding what guarantees your system provides under what configuration is far more important than memorizing theoretical definitions. ## Core Ideas of Paxos/Raft -After understanding the spectrum of consistency models, a natural question arises: if we need strong consistency (such as linearizability), how exactly do we implement it? The answer is through **consensus protocols**. In the world of distributed systems, the core problem that consensus protocols solve is: getting a group of machines to agree on a value—even if some of those machines might crash or the network might partition. This shares a similar spirit with the atomic operations we discussed in ch03—both are about getting multiple execution units (threads or machines) to reach agreement on the state of a value. The difference is that atomic operations rely on the CPU's cache coherence protocol, while distributed consensus relies on multiple rounds of network communication and voting. +After understanding the spectrum of consistency models, a natural question arises: if we need strong consistency (like linearizability), how do we implement it specifically? The answer is through **consensus protocols**. 
In the world of distributed systems, the core problem consensus protocols solve is: getting a group of machines to agree on a value—even if some machines crash or the network partitions. This shares a similar spirit with the atomic operations we discussed in ch03—both are about getting multiple execution units (threads or machines) to agree on the state of a value, except atomic operations rely on the CPU's cache coherence protocol, while distributed consensus relies on multiple rounds of network communication and voting. -Let's be upfront: we don't plan to give a complete protocol description of Paxos or Raft here (that's truly the work of a full paper—Lamport's Paxos paper reads like a Greek myth, and while the Raft paper is much clearer, it's still over thirty pages). Instead, we will focus on the core ideas so you understand *why* they are designed this way. +First, let's be clear: we don't plan to give a complete protocol description of Paxos or Raft here (that's really a paper's worth of work; Lamport's Paxos paper reads like a Greek myth, and the Raft paper is very clear but still over thirty pages). Instead, we focus on the core ideas to help you understand "why it's designed this way." -### Why a Quorum? +### Why We Need a Quorum The cornerstone of consensus protocols is the **quorum**. Suppose we have $N$ machines, and a value needs to be accepted by at least $\lfloor N/2 \rfloor + 1$ machines (i.e., a majority) to be considered "decided." Your first reaction might be—why a majority? Why not require unanimous agreement? -The core insight is: any two majorities must overlap. If there are 5 machines, a majority is at least 3. No matter how you choose them, any two groups of 3 machines share at least 1 machine in common. This overlap means: if a previous value has already been accepted by one majority, then any new majority must contain at least one machine that knows the previous value. 
As long as the protocol is designed properly, this "witness" machine can guarantee that the new value will not overwrite the previously decided value. +The core insight is: any two majorities must overlap. If there are 5 machines, a majority is at least 3. No matter how you choose, there is at least 1 machine in common between any two groups of 3 machines. This overlap means: if a previous value has been accepted by a majority, then any new majority must contain at least one machine that knows the previous value. As long as the protocol is designed properly, this "witness" machine can guarantee that the new value will not overwrite the previously decided value. -From this insight, tolerating $f$ machine crashes requires at least $2f + 1$ machines. In other words, to tolerate 1 crash you need 3 machines ($3 = 2 \times 1 + 1$), and to tolerate 2 crashes you need 5 machines ($5 = 2 \times 2 + 1$). This is why you often see coordination services like ZooKeeper, etcd, and Consul recommend deployments of 3 or 5 nodes—3 nodes tolerate 1 node failure, and 5 nodes tolerate 2 node failures. +Starting from this insight, tolerating $f$ machine failures requires at least $2f + 1$ machines—in other words, to tolerate 1 crash you need 3 machines ($3 = 2 \times 1 + 1$), and to tolerate 2 crashes you need 5 machines ($5 = 2 \times 2 + 1$). This is why coordination services like ZooKeeper, etcd, and Consul often recommend deploying 3 or 5 nodes—a 3-node cluster tolerates 1 node failure, and a 5-node cluster tolerates 2 node failures. ### Leader Election: Who Gives the Orders -Having understood the principle of quorums, let's look at Raft. Raft's design philosophy can be summarized in one sentence: "understandability first." When Diego Ongaro and John Ousterhout designed Raft, they explicitly made "easy to understand" a goal equally important as "correctness," forming a stark contrast with Paxos's "correct but unreadable" style. 
Raft decomposes consensus into three sub-problems: Leader election, log replication, and safety. Let's look at Leader election first. +Now that we understand the principle of a quorum, let's look at Raft. Raft's design philosophy can be summarized in one sentence: "understandability first." When designing Raft, Diego Ongaro and John Ousterhout explicitly made "easy to understand" a goal as important as "correctness," which contrasts sharply with Paxos's style of "correct but no one can read it." Raft decomposes consensus into three sub-problems: leader election, log replication, and safety. Let's look at leader election first. -In Raft, there is at most one Leader in the cluster at any time—all write requests are handled by the Leader, and all logs are replicated from the Leader to Followers. This "strong Leader" design is easier to understand and implement than Paxos's "multi-Proposer" model. +In Raft, there is at most one Leader in the cluster at any time—all write requests are handled by the Leader, and all logs are replicated to Followers by the Leader. This "strong Leader" design is easier to understand and implement than Paxos's "multi-Proposer" model. -Leader election is driven by **terms** and **heartbeats**. Each term is a monotonically increasing integer, and there is at most one Leader per term. Under normal circumstances, the Leader periodically sends heartbeats (AppendEntries RPCs, even empty ones when there are no logs to replicate) to all Followers. If a Follower does not receive a heartbeat within an election timeout period, it assumes the Leader has failed and initiates a new election round. +Leader election is driven by **terms** and **heartbeats**. Each term is a monotonically increasing integer, and there is at most one Leader per term. Normally, the Leader periodically sends heartbeats to all Followers (AppendEntries RPCs—empty ones are sent even when there are no logs to replicate). 
If a Follower does not receive a heartbeat within an election timeout, it assumes the Leader is down and starts a new election. -The election process, in plain terms, is "a group of people voting for a leader": the Follower increments the current term, becomes a Candidate, votes for itself first, and then sends a RequestVote RPC to all other nodes. The voting rules for other nodes are: at most one vote per term, first-come-first-served (with one caveat: the Candidate's log must be at least as up-to-date as the voter's). If a Candidate receives votes from a majority, it becomes the new Leader and immediately starts sending heartbeats to prevent others from initiating further elections. +The election process, in plain terms, is "a group of people voting for a leader": the Follower increments the current term, becomes a Candidate, votes for itself first, and then sends RequestVote RPCs to all other nodes. The voting rules for other nodes are: one vote per term at most, first come first served (but with a restriction: the Candidate's log must be at least as new as the voter's). If a Candidate receives votes from a majority, it becomes the new Leader and immediately starts sending heartbeats to prevent others from initiating elections. -This process has a clever randomization mechanism: each node's election timeout is randomly chosen within a range. This greatly reduces the probability of multiple nodes initiating elections simultaneously and causing a "split vote"—because their timeout durations differ, the node that times out first will usually initiate the election ahead of the others and win the majority. +This process has a clever randomization mechanism: each node's election timeout is randomly chosen within a range. 
This greatly reduces the probability of multiple nodes initiating elections simultaneously causing "vote splitting"—because their timeout times differ, the node that times out first will usually initiate the election first and win the majority of votes. -### Log Replication: The Leader Speaks, Followers Follow +### Log Replication: Leader Speaks, Followers Follow -Once the Leader is elected, log replication is quite straightforward—the core of the entire process is "the Leader says one thing, the Followers repeat it." The client sends a write request to the Leader, the Leader appends the operation to its own log, and then replicates this log entry to all Followers (via AppendEntries RPC). When the Leader confirms that this log entry has been accepted by a majority (including itself), it **commits** the log entry, applies it to the state machine, and returns success to the client. +Once the Leader is selected, log replication is straightforward—the core of the whole process is "Leader says one sentence, Followers repeat it." The client sends a write request to the Leader, the Leader appends the operation to its own log, and then replicates this log entry to all Followers (via AppendEntries RPC). When the Leader confirms that this log entry has been accepted by a majority (including itself), it **commits** the log and applies it to the state machine, then returns success to the client. -The key safety guarantee is: committed logs are never overwritten. Raft achieves this through a simple constraint—when sending AppendEntries, the Leader includes the index and term of the previous log entry. Upon receiving this, the Follower checks whether the corresponding position in its own log matches. If it doesn't match, the Follower rejects the log entry, and the Leader backs off and retries until it finds a position where both sides agree, then starts overwriting from there. +A key safety guarantee is that committed logs are never overwritten. 
Raft achieves this through a simple constraint—when sending AppendEntries, the Leader carries the index and term of the previous log entry. After receiving it, the Follower checks if the corresponding position in its own log matches. If it doesn't match, the Follower refuses to accept this log entry, and the Leader will backtrack and retry until it finds a position where both sides agree and starts overwriting from there. -This mechanism guarantees: if two log entries have the same term number at the same index position on any Follower, their contents must be identical (because a Leader only creates one log entry at one index position within a term), and all log entries preceding that entry are also identical (through recursive matching checks). This is log consistency. +This mechanism guarantees: if two log entries have the same term number at the same index position in any Follower, their content must be the same (because the Leader only creates one log entry at an index position within a term), and all logs before that entry are also the same (through recursive matching checks). This is log consistency. -To summarize the entire Raft process with an analogy: imagine a committee (the cluster) whose members communicate via letters (network messages). They need to reach agreement on a series of decisions (the log). Raft's approach is to first elect a chairperson (Leader election), have the chairperson propose all decisions (log replication), and require majority agreement for a decision to take effect (quorum voting). If the chairperson loses contact, the committee votes to elect a new chairperson to continue the work. While this analogy is rough, it captures the core design philosophy of Raft—the key to consensus is not "everyone agrees," but "a majority agreeing is enough," and the intersection of majorities guarantees the transfer of information. 
+To summarize the entire Raft process with an analogy: imagine a committee (the cluster) where members communicate by mail (network messages). They need to reach agreement on a series of decisions (logs). Raft's approach is to first elect a chairperson (Leader election), the chairperson proposes all decisions (log replication), and decisions need a majority agreement to take effect (majority voting). If the chairperson loses contact, the committee votes to elect a new chairperson to continue the work. Although this analogy is rough, it captures the core design idea of Raft—the key to consensus is not "everyone agrees," but "a majority agreeing is enough," and the intersection of majorities guarantees the transmission of information. ## C++ Practice Directions -We've covered quite a bit of theory; now let's look at something practical. Having understood the theoretical foundations of distributed consistency, let's explore the direction for actually writing distributed communication code in C++. To be clear—we won't implement a complete distributed protocol (that's the scale of an independent project; a correct implementation of Raft alone can take weeks of work). Instead, we will show how to use gRPC + C++20 coroutines to build the basic skeleton for communication between distributed services. This leverages the coroutine knowledge we learned in ch06, effectively connecting the dots from our earlier studies. +We've covered a lot of theory; now let's look at something practical. After understanding the theoretical basis of distributed consistency, let's look at the direction of writing distributed communication code in C++. To be clear—we won't implement a complete distributed protocol (that's the scale of an independent project; a correct implementation of Raft can take weeks of work). Instead, we show how to use gRPC + C++20 coroutines to build the basic skeleton for communication between distributed services. 
This uses the coroutine knowledge we learned in ch06, connecting the dots from our previous accumulation. ### gRPC Basics: Defining Services with Protobuf -gRPC uses Protocol Buffers (protobuf) to define service interfaces and message formats. This is the key infrastructure in the modern C++ ecosystem that bridges "concurrency" and "distributed systems," as we mentioned in the previous article. Suppose we want to implement a simple distributed key-value store service; the proto file would look something like this: +gRPC uses Protocol Buffers (protobuf) to define service interfaces and message formats, which is the key infrastructure in the modern C++ ecosystem mentioned in the previous article that connects "concurrency" and "distribution." Suppose we want to implement a simple distributed key-value storage service; the proto file would look something like this: ```protobuf // kv_store.proto @@ -188,11 +190,11 @@ message DeleteResponse { } ``` -After compiling with the `protoc` compiler, you will get a bunch of ``.pb.h`` and ``.pb.cc`` files, plus a ``.grpc.pb.h`` and ``.grpc.pb.cc``—the latter contains gRPC's server base class and client stub code. Don't be intimidated by this pile of generated files; the only things you really need to care about are the base class and the stub class. +After compiling with the `protoc` compiler, you will get a bunch of `.pb.h` and `.pb.cc` files, as well as a `.grpc.pb.h` and `.grpc.pb.cc`—the latter contains the gRPC server base class and client stub code. Don't be intimidated by this pile of generated files; the only things you really need to care about are the base class and the stub class. ### Server Implementation: Handling RPC Requests -Next, let's look at the server implementation—inheriting from the generated ``KvStoreService::Service`` base class and overriding each RPC method. We use a simple in-memory map as the storage backend, protected by ``std::shared_mutex`` for thread safety. 
If you remember the read-write lock pattern from ch02, this is its direct application. +Next, let's look at the server implementation—inheriting the generated `KvStoreService::Service` base class and overriding each RPC method. We use a simple in-memory map as the storage backend, protected by `std::shared_mutex` for thread safety. If you remember the read-write lock pattern discussed in ch02, this is its direct application. ```cpp // kv_store_server.h @@ -286,9 +288,9 @@ private: }; ``` -This code demonstrates several important design points. We used ``std::shared_mutex`` instead of ``std::mutex`` to protect the storage—read operations (Get) use a shared lock (``std::shared_lock``), and write operations (Put/Delete) use an exclusive lock (``std::unique_lock``). This is consistent with the read-write lock pattern we discussed in ch02: in read-heavy, write-light scenarios, shared locks can significantly improve concurrency. Another point worth noting is the ``expected_version`` field in the Put request—this is an implementation of Optimistic Concurrency Control (OCC). When a client reads a value, it receives a version number, and when it writes back after modification, it includes this version number. If the server finds that the current version number doesn't match the client's expectation, it means someone else has already modified this value, and the write is rejected—the client needs to re-read, re-modify, and re-submit. This is much lighter than using a distributed lock and avoids the various security issues of distributed locks that we discussed in the previous article. +This code demonstrates several important design points. We used `std::shared_mutex` instead of `std::mutex` to protect storage—read operations (Get) use a shared lock (`std::shared_lock`), and write operations (Put/Delete) use an exclusive lock (`std::unique_lock`). 
This is consistent with the read-write lock pattern we discussed in ch02: in read-heavy, write-light scenarios, shared locks can significantly improve concurrency. Another point worth noting is the `expected_version` field in the Put request—this is an implementation of Optimistic Concurrency Control (OCC). When a client reads a value, it gets a version number; after modifying, it writes back with this version number. If the server finds the current version number doesn't match the client's expectation, it means someone else has already modified the value, and the write is rejected—the client needs to re-read, re-modify, and re-submit. This is much lighter than a distributed lock and avoids the various security issues of distributed locks discussed in the previous article. -The code to start the server is also very concise: +Starting the server is also very concise: ```cpp // main.cpp(服务端) @@ -315,9 +317,9 @@ int main() ### Asynchronous gRPC: Wrapping CompletionQueue with Coroutines -So far, we've been using gRPC's **synchronous API**—every RPC call blocks the current thread until it completes. This is fine for low-concurrency scenarios, but if you use the synchronous model in high-concurrency situations (for example, a server needs to handle thousands of requests simultaneously), the number of threads will skyrocket, and context switching will directly become a bottleneck—this is the same problem we discussed in ch06 about "why we need asynchrony." +So far, we've been using gRPC's **synchronous API**—every RPC call blocks the current thread until completion. This is fine in low-concurrency scenarios, but if you use the synchronous model in high-concurrency scenarios (e.g., a server needs to handle thousands of requests simultaneously), the number of threads explodes, and context switching becomes a direct bottleneck—this is the same problem we discussed in ch06 regarding "why we need async." 
-gRPC provides an asynchronous API, the core of which is the ``CompletionQueue`` (CQ)—an event loop where all completed asynchronous operations post a completion event to the CQ, and you need a thread to continuously pull events from the CQ and process them. This model is very similar to the asynchronous I/O we discussed in ch06: essentially, it's event-driven + callbacks. But writing code directly with the CQ is extremely tedious—you need to manually manage the lifecycle of request objects, manually handle various state transitions, and manually chain callbacks together. If we use C++20 coroutines to wrap the CQ, we can dramatically improve code readability. Let's look at a simplified example of a coroutine-based gRPC client call. +gRPC provides an asynchronous API, centered on the `CompletionQueue` (CQ)—an event loop where all asynchronous operations post a completion event to the CQ when done, and you need a thread to continuously pull events from the CQ and process them. This model is very similar to the asynchronous I/O we discussed in ch06: essentially event-driven + callbacks. But writing code directly with the CQ is very cumbersome—you need to manually manage the lifecycle of request objects, manually handle various state transitions, and manually chain callbacks together. If we use C++20 coroutines to wrap the CQ, we can significantly improve code readability. Let's look at a simplified example of a coroutine-based gRPC client call. ```cpp #pragma once @@ -427,9 +429,9 @@ private: }; ``` -The core of this code lies in the ``GrpcAwaitable`` struct—it is an object that satisfies the C++20 coroutine ``awaitable`` constraint, which is the exact mechanism we discussed in depth in ch06. When the coroutine ``co_await`` this object, ``await_suspend`` is called, which starts the gRPC asynchronous call and registers the coroutine handle as a tag on the ``CompletionQueue``. 
When the gRPC asynchronous operation completes, the CQ's event loop pulls out this tag (which is actually the coroutine handle), and then ``resume()`` resumes the coroutine. After the coroutine resumes, it retrieves the response result in ``await_resume``—the entire flow follows the exact same pattern as the hand-written awaitable we did in ch06. +The core of this code lies in the `GrpcAwaitable` structure—it is an object that satisfies the C++20 coroutine `awaitable` constraint, which is the mechanism we discussed in depth in ch06. When the coroutine `co_await`s this object, `await_suspend` is called, which initiates the gRPC asynchronous call and registers the coroutine handle as a tag with the `CompletionQueue`. When the gRPC asynchronous operation completes, the CQ event loop pulls out this tag (which is actually the coroutine handle), and then `resume()` resumes the coroutine execution. After the coroutine resumes, it gets the response result in `await_resume`—the whole process follows exactly the same pattern as the awaitable we wrote by hand in ch06. -At the application layer, you can use it like this: +In application layer code, you can use it like this: ```cpp /// @brief 示例:使用协程化的 gRPC 客户端 @@ -463,56 +465,56 @@ Task demo_usage(KvStoreCoroutineClient& client) } ``` -See? The application-layer code is almost indistinguishable from writing a local function call—``co_await`` makes the asynchronous gRPC call look as linear and fluent as synchronous code, but underneath it is completely asynchronous: while waiting for the gRPC response, the current thread doesn't block; instead, it goes off to handle other coroutines or CQ events. This is the value of coroutines that we repeatedly emphasized in ch06—not making code faster, but making asynchronous code readable and maintainable.
+You see, the application layer code is almost indistinguishable from writing a local function call—`co_await` makes asynchronous gRPC calls look as linear and smooth as synchronous code, but the underlying reality is completely asynchronous: while waiting for the gRPC response, the current thread doesn't block but instead goes to handle other coroutines or CQ events. This is the value of coroutines we emphasized repeatedly in ch06—not to make code faster, but to make asynchronous code readable and maintainable. > ⚠️ **Pitfall Warning** -> The ``GrpcAwaitable`` above is a simplified example that demonstrates the core idea of coroutine-based gRPC. Don't drop it directly into a production environment. In production, you need to handle many more details: graceful shutdown of the CQ event loop, timeout control, retry logic, connection state management, thread-safe CQ access, and so on. If you don't want to build this wheel yourself (I strongly recommend you don't), check out the [agrpc](https://github.com/Tradias/agrpc) library—it provides production-grade asynchronous gRPC wrappers based on Boost.Asio's C++20 coroutine support. +> The `GrpcAwaitable` above is a simplified example demonstrating the core idea of coroutine-based gRPC; don't take it directly to production. In a production environment, you need to handle more details: graceful shutdown of the CQ event loop, timeout control, retry logic, connection state management, thread-safe CQ access, etc. If you don't want to reinvent the wheel (I strongly suggest you don't), take a look at the [agrpc](https://github.com/Tradias/agrpc) library—it provides production-grade gRPC asynchronous wrapping based on Boost.Asio's C++20 coroutine support. -## Summary: The Journey of Volume 5 +## Summary: The Journey of Volume Five -This brings us to the end of the final article in Volume 5. 
Looking back at the learning path of this volume, we've traveled from "what is a thread" all the way to "how distributed systems communicate"—and that has indeed been quite a journey. +This concludes the final article of Volume Five. Looking back at the learning path of this volume, we have traveled from "what is a thread" to "how distributed systems communicate"—this is indeed a significant journey. -**ch00 Concurrency Fundamentals** — We established a basic understanding of concurrency: concurrency and parallelism are not the same thing, Amdahl's Law and Gustafson's Law help us understand the upper and lower bounds of speedup, the trade-off between throughput and latency guides architectural choices, and some scenarios don't need concurrency at all. Correctness first, performance second—this has been our guiding principle throughout the volume. +**ch00 Concurrency Basics**—We established the basic cognition of concurrency: concurrency and parallelism are not the same thing; Amdahl's Law and Gustafson's Law help us understand the upper and lower bounds of speedup; the trade-off between throughput and latency guides architecture selection; and some scenarios don't need concurrency at all. Correctness first, performance second—this is the principle we have carried through the entire volume. -**ch01 Thread Lifecycle and RAII** — We got to know the lifecycle of ``std::thread``, understood the differences between ``join()`` and ``detach()``, and learned to use RAII guards to manage thread resources, ensuring threads don't leak or get forgotten. This is the foundational skill of concurrent programming. +**ch01 Thread Lifecycle and RAII**—We got to know the lifecycle of `std::thread`, understood the difference between `join()` and `detach()`, and learned to use RAII guards to manage thread resources, ensuring threads don't leak or get forgotten. This is the basic skill of concurrent programming. 
-**ch02 Synchronization Primitives** — ``std::mutex``, ``std::condition_variable``, ``std::shared_mutex``... these are the toolbox of concurrent programming. We learned to use them to protect shared data, coordinate execution order between threads, and implement the producer-consumer pattern. We also saw their limitations: lock granularity is hard to control, deadlocks are easy to trigger, and performance is suboptimal under high contention. +**ch02 Synchronization Primitives**—`std::mutex`, `std::condition_variable`, `std::shared_mutex`... these are the toolbox of concurrent programming. We learned to use them to protect shared data, coordinate execution order between threads, and implement producer-consumer patterns. We also saw their limitations: lock granularity is hard to control, deadlocks are easy to trigger, and performance is poor in high contention scenarios. -**ch03 Atomic Operations and the Memory Model** — This is one of the hardest-core parts of Volume 5, and also one of the most satisfying to write. Starting from the basic usage of ``std::atomic``, we dove deep into the six memory orders of the C++ memory model (``memory_order_relaxed``, ``memory_order_consume``, ``memory_order_acquire``, ``memory_order_release``, ``memory_order_acq_rel``, ``memory_order_seq_cst``), understood the reordering rules of compilers and CPUs, and mastered the reasoning method for happens-before relationships. This knowledge lets you know what you're doing when writing lock-free code. +**ch03 Atomic Operations and Memory Model**—This is one of the most hardcore parts of Volume Five, and also the most enjoyable part for me to write.
Starting from the basic usage of `std::atomic`, we went deep into the six memory orders of the C++ memory model (`memory_order_relaxed`, `memory_order_consume`, `memory_order_acquire`, `memory_order_release`, `memory_order_acq_rel`, `memory_order_seq_cst`), understood the reordering rules of compilers and CPUs, and mastered the reasoning method of happens-before relationships. This knowledge lets you know what you are doing when writing lock-free code. -**ch04 Concurrent Data Structures** — We applied the synchronization primitives and atomic operations we learned earlier to specific data structures: thread-safe queues, concurrent maps, and ring buffers. We saw the trade-offs between different strategies: coarse-grained locking, fine-grained locking, read-write locking, and lock-free approaches. +**ch04 Concurrent Data Structures**—We applied the synchronization primitives and atomic operations learned earlier to specific data structures: thread-safe queues, concurrent maps, ring buffers. We saw the trade-offs between different strategies like coarse-grained locks, fine-grained locks, read-write locks, and lock-free. -**ch05 Tasks, Futures, and Thread Pools** — We elevated ourselves from the "raw thread" level to the "task" level. ``std::async``, ``std::future``, and ``std::promise`` provide higher-level concurrency abstractions, while thread pools let us reuse thread resources and control concurrency levels. The task-oriented mindset is better suited to most application scenarios than the thread-oriented mindset. +**ch05 Tasks, Futures, and Thread Pools**—We elevated from the "bare thread" level to the "task" level. `std::async`, `std::future`, and `std::promise` provide higher-level concurrency abstractions, while thread pools allow us to reuse thread resources and control concurrency. The task mindset is more suitable for most application scenarios than the thread mindset. 
-**ch06 Asynchrony and Coroutines** — C++20 coroutines represent a major paradigm shift in concurrent programming. Starting from the basic mechanisms of coroutines (``co_await``, ``co_return``, ``co_yield``, ``promise_type``, ``awaitable``), we learned to rewrite callback-based asynchronous code into a linear, readable form using coroutines. Coroutines are not a silver bullet, but they genuinely take the maintainability of asynchronous code to the next level. +**ch06 Asynchrony and Coroutines**—C++20 coroutines are a major paradigm shift in concurrent programming. Starting from the basic mechanisms of coroutines (`co_await`, `co_return`, `co_yield`, `promise_type`, `awaitable`), we learned to rewrite callback-style asynchronous code into linear, readable forms using coroutines. Coroutines are not a silver bullet, but they do take the maintainability of asynchronous code to the next level. -**ch07 Actors and Channels** — We stepped out of the "shared memory + locks" model and explored message-passing-based concurrency paradigms. The Actor model and the CSP/Channel model avoid data races by "sharing nothing and communicating only through messages," making them naturally suited for multi-core and distributed scenarios. +**ch07 Actor and Channel**—We stepped out of the "shared memory + locks" model and explored message-passing-based concurrency paradigms. The Actor model and CSP/Channel model use "share nothing, communicate only via messages" to avoid data races, making them naturally suitable for multi-core and distributed scenarios. -**ch08 Debugging and Performance** — Concurrency bugs are the hardest bugs to debug. We learned to use ThreadSanitizer to detect data races, used profiling tools to locate lock contention, and understood performance pitfalls like false sharing and lock convoys. +**ch08 Debugging and Performance**—Concurrency bugs are the hardest to debug.
We learned to use ThreadSanitizer to detect data races, use profiling tools to locate lock contention, and understood performance traps like false sharing and lock convoys. -**ch09 Distributed Bridging** — That is, these two articles. Starting from the boundaries of single-machine concurrency, we saw the five fundamental differences of distributed systems, understood the spectrum of consistency models, learned the core ideas of the Paxos/Raft consensus protocols, and finally used gRPC + C++20 coroutines to demonstrate the direction for writing distributed communication code in C++. +**ch09 Distributed Bridging**—That is, these two articles. Starting from the boundaries of single-machine concurrency, we saw the five fundamental differences of distributed systems, understood the spectrum of consistency models, recognized the core ideas of Paxos/Raft consensus protocols, and finally used gRPC + C++20 coroutines to show the direction of writing distributed communication code in C++. -Looking back, none of these steps were isolated. The RAII mindset from ch01 runs through the entire volume—from thread management to lock management to connection management. The memory model knowledge from ch03 is the foundation for understanding the consistency models in ch09 (``memory_order_seq_cst`` and linearizability essentially answer the same question). The coroutine mechanism from ch06 is the cornerstone of the asynchronous gRPC wrapping in ch09. The Actor model from ch07 gains its greatest value in distributed environments—location transparency allows local code to be deployed to multiple machines with almost no changes. +Looking back, no step is isolated. 
The RAII mindset of ch01 runs through the entire volume—from thread management to lock management to connection management; the memory model knowledge of ch03 is the foundation for understanding the consistency models of ch09 (`memory_order_seq_cst` and linearizability essentially answer the same question); the coroutine mechanism of ch06 is the cornerstone of ch09's gRPC asynchronous wrapping; the Actor model of ch07 gains maximum value in a distributed environment—location transparency allows local code to be deployed to multiple machines with almost no changes. -Learning concurrent programming is never "complete"—it is a field that requires continuous practice, continuous stumbling into pitfalls, and continuous intuition building. But if you've followed Volume 5 to this point, you should already have a solid theoretical foundation and enough practical experience to face the vast majority of concurrency scenarios. What remains is to hone your skills in real projects. +Learning concurrent programming is never "complete"—this is an area that requires continuous practice, stumbling into pitfalls, and building intuition. But if you have followed Volume Five to this point, you should have a solid theoretical foundation and enough practical experience to face the vast majority of concurrency scenarios. What remains is to hone your skills in real projects. ### Directions for Further Learning -If you want to deepen the foundation built in Volume 5, here are some directions I've personally tested and recommend. +If you want to continue deepening the foundation established in Volume Five, here are some directions I personally recommend. -**Book Recommendations**: Martin Kleppmann's *Designing Data-Intensive Applications* is universally recognized as the best introductory book in the distributed systems field, covering core topics like consistency, consensus, and replication—I strongly recommend reading at least the first five chapters.
Anthony Williams's *C++ Concurrency in Action* is the authoritative reference for C++ concurrent programming; the second edition covers the C++17 standard (the third edition is expected to cover C++20), and it's a "dictionary" you can keep on your desk for随时 consultation. If you're particularly interested in lock-free programming, Herlihy and Shavit's *The Art of Multiprocessor Programming* is a classic textbook—though it leans academic and has a certain barrier to entry. +**Book Recommendations**: Martin Kleppmann's *Designing Data-Intensive Applications* is recognized as the best introductory book in the field of distributed systems, covering core topics like consistency, consensus, replication, and partitioning—I strongly recommend reading at least the first five chapters. Anthony Williams' *C++ Concurrency in Action* is the authoritative reference for C++ concurrent programming; the second edition covers the C++17 standard (the third edition is expected to cover C++20), and it is a "dictionary" you can keep on your desk for reference at any time. If you are particularly interested in lock-free programming, Herlihy and Shavit's *The Art of Multiprocessor Programming* is a classic textbook—but this book is more academic and has a certain threshold for reading. -**Open Source Projects**: If you want to see real distributed consensus protocol implementations, etcd's Raft implementation (in Go, about 2000 lines of core code) is the best starting point—detailed comments, clear logic, and every concept from the Raft paper can be found in the code, making it a very comfortable read. In the C++ ecosystem, Apache brpc is an open-source C++ RPC framework from Baidu that includes components like bvar (concurrent variables) and bthread (coroutine scheduling), making it great material for studying production-grade C++ concurrent code. 
+**Open Source Projects**: If you want to see real distributed consensus protocol implementations, etcd's Raft implementation (Go language, about 2000 lines of core code) is the best entry point—detailed comments, clear logic, and every concept in the Raft paper can be found in the code, making it very comfortable to read. In the C++ ecosystem, Apache brpc is a C++ RPC framework open-sourced by Baidu, built with components like bvar (concurrent variables) and bthread (coroutine scheduling), making it good material for learning production-grade C++ concurrent code. -**Practice Directions**: If you want to dive deeper into distributed systems development in C++, you can try implementing a simple distributed key-value store using gRPC + a Raft library (like ``libraft``)—this is a classic lab project from MIT 6.824 (Distributed Systems). The engineering effort is moderate but the coverage is broad, and after completing it, your understanding of consensus protocols will be completely different. +**Practice Directions**: If you want to go deeper into distributed systems development in C++, you can try implementing a simple distributed key-value storage using gRPC + a Raft library (like `libraft`)—this is a classic experimental project from MIT 6.824 (Distributed Systems), with moderate engineering effort but wide coverage; after completing it, your understanding of consensus protocols will be completely different. 
-## References +## Reference Resources -- [Designing Data-Intensive Applications — Martin Kleppmann](https://dataintensive.net/) — The "bible" of distributed systems, covering all core topics including consistency, consensus, and replication -- [C++ Concurrency in Action, 2nd Edition — Anthony Williams](https://www.manning.com/books/c-plus-plus-concurrency-in-action-second-edition) — The authoritative reference for C++ concurrent programming (the third edition is expected to cover C++20) -- [In Search of an Understandable Consensus Algorithm (Raft paper)](https://raft.github.io/raft.pdf) — The Raft paper by Diego Ongaro and John Ousterhout, 100 times more readable than the Paxos paper -- [The Part-Time Parliament (Paxos paper) — Leslie Lamport](https://lamport.azurewebsites.net/pubs/lamport-paxos.pdf) — The original Paxos paper, which tells the story of a consensus protocol through an ancient Greek parliament -- [Jepsen Consistency Models](https://jepsen.io/consistency/models) — A visual hierarchy diagram and detailed explanation of consistency models -- [agrpc — gRPC with C++20 Coroutines](https://github.com/Tradias/agrpc) — An asynchronous coroutine wrapper library for gRPC based on Boost.Asio +- [Designing Data-Intensive Applications — Martin Kleppmann](https://dataintensive.net/) — The "Bible" of distributed systems, covering all core topics like consistency, consensus, and replication +- [C++ Concurrency in Action, 2nd Edition — Anthony Williams](https://www.manning.com/books/c-plus-plus-concurrency-in-action-second-edition) — The authoritative reference for C++ concurrent programming (Third edition expected to cover C++20) +- [In Search of an Understandable Consensus Algorithm (Raft Paper)](https://raft.github.io/raft.pdf) — The Raft paper by Diego Ongaro and John Ousterhout, 100 times more readable than the Paxos paper +- [The Part-Time Parliament (Paxos Paper) — Leslie Lamport](https://lamport.azurewebsites.net/pubs/lamport-paxos.pdf) — The original paper 
on Paxos, telling the consensus protocol through the story of an ancient Greek parliament +- [Jepsen Consistency Models](https://jepsen.io/consistency/models) — Visual hierarchy and detailed explanation of consistency models +- [agrpc — gRPC with C++20 Coroutines](https://github.com/Tradias/agrpc) — Asynchronous coroutine wrapper library for gRPC based on Boost.Asio - [C++20 Coroutines for Asynchronous gRPC Services — Dennis Hezel](https://medium.com/3yourmind/c-20-coroutines-for-asynchronous-grpc-services-5b3dab1d1d61) — How to adapt gRPC's CompletionQueue to C++20 coroutines -- [MIT 6.824 Distributed Systems](https://pdos.csail.mit.edu/6.824/) — MIT's distributed systems course, including a Lab to implement Raft +- [MIT 6.824 Distributed Systems](https://pdos.csail.mit.edu/6.824/) — MIT's distributed systems course, including Labs to implement Raft diff --git a/documents/en/vol8-domains/embedded/00-env-setup/index.md b/documents/en/vol8-domains/embedded/00-env-setup/index.md new file mode 100644 index 000000000..07dbd6d3a --- /dev/null +++ b/documents/en/vol8-domains/embedded/00-env-setup/index.md @@ -0,0 +1,30 @@ +--- +title: Development Environment Setup +description: From toolchains, project structure, and CMake to WSL2 USB passthrough + and GDB debugging, we build a complete scaffold for STM32 development. +platform: stm32f1 +tags: +- cpp-modern +- intermediate +- stm32f1 +translation: + source: documents/vol8-domains/embedded/00-env-setup/index.md + source_hash: ee52fb9205ca4bce0366c90b1e5d0a6877d33edf895f850ce9c89f11213610cd + translated_at: '2026-06-13T11:53:33.570080+00:00' + engine: anthropic + token_count: 140 +--- +# Development Environment Setup + +> From the cross-compilation toolchain to a complete GDB debugging environment, we lay a solid foundation for STM32 development—all future hands-on projects will stand on this setup. 
+ +## Toolchain and Project Structure + +- [Part 1: Building the STM32 Toolchain from Scratch](01-toolchain-setup.md) — Cross-compilation principles and installation guide +- [Part 2: Project Structure](02-project-structure.md) — HAL library acquisition, startup file pitfalls, and directory setup +- [Part 3: CMake Configuration](03-cmake-configuration.md) — Building an STM32 build system from scratch + +## WSL2 and Debugging + +- [Part 4: WSL2 USB Passthrough](04-wsl2-usb.md) — Making ST-Link cross virtualization boundaries +- [Part 5: Advanced Debugging](05-debugging-guide.md) — From printf to a complete GDB debugging environment diff --git a/documents/en/vol8-domains/embedded/01-led/index.md b/documents/en/vol8-domains/embedded/01-led/index.md new file mode 100644 index 000000000..473b06391 --- /dev/null +++ b/documents/en/vol8-domains/embedded/01-led/index.md @@ -0,0 +1,50 @@ +--- +title: 'LED Blinking: Evolution from C to C++' +description: Using the LED on PC13 as our running example, we refactor from C macro + drivers all the way to C++23 templates and zero-overhead abstractions. +platform: stm32f1 +tags: +- cpp-modern +- intermediate +- stm32f1 +translation: + source: documents/vol8-domains/embedded/01-led/index.md + source_hash: ea0b504052416704b980771813e50a7783088482f6225fcc85d7a8490e1ebd6b + translated_at: '2026-06-13T11:53:39.937557+00:00' + engine: anthropic + token_count: 296 +--- +# LED Blinking: Evolution from C to C++ + +> One LED, a complete path of modern C++ refactoring — from HAL register operations to compile-time optimization with templates and `constexpr`. 
+ +## Motivation + +- [Part 6: Starting with the First LED](01-motivation-and-overview.md) — Why we use modern C++ for STM32 + +## Hardware Fundamentals + +- [Part 7: What Exactly is GPIO](02-what-is-gpio.md) — The history and principles of General-Purpose I/O +- [Part 8: Push-Pull, Open-Drain, and PC13](03-output-modes-and-pc13.md) — The hardware secrets behind lighting an LED + +## HAL Operations + +- [Part 9: HAL Clock Enable](04-hal-gpio-clock.md) — Without a clock, a peripheral is just a sleeping piece of silicon +- [Part 10: HAL_GPIO_Init](05-hal-gpio-init.md) — The ritual of telling the chip the pin configuration +- [Part 11: HAL_GPIO_WritePin and TogglePin](06-hal-gpio-output.md) — Making the pins move + +## The C Macro Era + +- [Part 12: LED Driver in the C Macro Era](07-c-macro-led-implementation.md) — It works, but it isn't elegant + +## C++ Refactoring Evolution + +- [Part 13: First Refactor — enum class](08-cpp-enum-class-revolution.md) — Replacing macros, the start of type safety +- [Part 14: Second Refactor — Templates](09-cpp-template-gpio.md) — Compile-time binding of ports and pins +- [Part 15: Third Refactor — if constexpr](10-cpp-if-constexpr-clock.md) — Making clock enable selection automatic at compile time +- [Part 16: Fourth Refactor — LED Templates](11-cpp-led-template.md) — From generic GPIO to specific abstractions +- [Part 17: Finishing with C++23 Features](12-cpp23-attributes-and-features.md) — Attributes, linkage, and the final proof of zero-overhead abstraction + +## Summary + +- [Part 18: Common Pitfalls and Practical Exercises](13-pitfalls-and-exercises.md) — Doing more with the LED diff --git a/documents/en/vol8-domains/embedded/02-button/index.md b/documents/en/vol8-domains/embedded/02-button/index.md new file mode 100644 index 000000000..9da78733c --- /dev/null +++ b/documents/en/vol8-domains/embedded/02-button/index.md @@ -0,0 +1,49 @@ +--- +title: 'Button Input: Debouncing, State Machines, and Type Safety' +description: From 
GPIO input circuits to a seven-state debounce state machine, then + refactoring the button code into a type-safe form using variant and concepts +platform: stm32f1 +tags: +- cpp-modern +- intermediate +- stm32f1 +translation: + source: documents/vol8-domains/embedded/02-button/index.md + source_hash: 2ba3433f22f954e1152266fc36c46816b901593f4282398ff10af67dc8387e5e + translated_at: '2026-06-13T11:53:47.143962+00:00' + engine: anthropic + token_count: 269 +--- +# Button Input: Debouncing, State Machines, and Type Safety + +> Buttons are harder than LEDs—hardware bounce, non-blocking debouncing, state machines, plus type-safe refactoring in C++. + +## Motivation + +- [Part 19: From Output to Input](01-from-output-to-input.md) — Why buttons are harder than LEDs + +## Hardware Fundamentals + +- [Part 20: GPIO Input Mode Internal Circuits](02-gpio-input-circuits.md) — How the chip "hears" external signals +- [Part 21: Button Circuits and Mechanical Bounce](03-button-hardware-and-bounce.md) — What real-world signals look like + +## HAL and Polling + +- [Part 22: HAL GPIO Input API](04-hal-gpio-input.md) — How to read button states in code +- [Part 23: Polling Buttons in C](05-c-polling-button.md) — First hands-on: controlling an LED with a button + +## Debouncing + +- [Part 24: Non-blocking Debounce](06-non-blocking-debounce.md) — Don't make the CPU wait +- [Part 25: 7-State Debounce State Machine](07-debounce-state-machine.md) — The core of this series + +## C++ Refactoring Evolution + +- [Part 26: Refactoring Button Code with `enum class`](08-cpp-enum-class-button.md) — Type-safe input +- [Part 27: Events + `std::variant` Dispatch](09-cpp-variant-and-visit.md) — Type-safe "what happened" +- [Part 28: Button Template Class Design](10-cpp-template-button.md) — Let the compiler do the work +- [Part 29: Constraining Callbacks with Concepts](11-cpp-concepts-callback.md) — Complete code walkthrough + +## Interrupts and Summary + +- [Part 30: EXTI 
Interrupts](12-exti-interrupt-and-exercises.md) — Pitfalls and exercises diff --git a/documents/en/vol8-domains/embedded/04-empty-base-optimization.md b/documents/en/vol8-domains/embedded/04-empty-base-optimization.md deleted file mode 100644 index c8478938e..000000000 --- a/documents/en/vol8-domains/embedded/04-empty-base-optimization.md +++ /dev/null @@ -1,128 +0,0 @@ ---- -chapter: 3 -cpp_standard: -- 11 -- 14 -- 17 -- 20 -description: Introduces EBO (Empty Base Optimization) techniques -difficulty: intermediate -order: 4 -platform: stm32f1 -prerequisites: -- 'Chapter 2: 零开支抽象' -reading_time_minutes: 6 -tags: -- cpp-modern -- intermediate -- stm32f1 -title: EBO (Empty Base Optimization) -translation: - source: documents/vol8-domains/embedded/04-empty-base-optimization.md - source_hash: 8ff16133f7a3f52bbcfc18715640e359882b20a288565d2effcb35913a22a32d - translated_at: '2026-05-26T12:19:04.497351+00:00' - engine: anthropic - token_count: 831 ---- -# EBO (Empty Base Optimization): C++'s Slimming Trick - -There is a low-profile yet highly effective memory optimization that silently saves you bytes behind the scenes—**EBO (Empty Base Optimization)**. When writing libraries, we often use empty classes as "policies, tags, or stateless behavior objects." EBO allows these stateless base classes to be squeezed out of the object layout, saving space and improving locality. 
- ------- - -## TL;DR - -- **EBO allows the compiler to omit the storage of empty base class subobjects (i.e., they occupy no extra bytes), thereby reducing the `sizeof` of the derived class.** -- **Empty member variables cannot be compressed by EBO by default, but `[[no_unique_address]]` introduced in C++20 achieves a similar compression effect for members.** -- **Do not rely on object address uniqueness to identify empty subobjects—their addresses might be identical (which is an allowed side effect of this optimization), and making assumptions about addresses will lead to bugs.** -- In practice: library implementations commonly use the "inheriting from an empty policy class" or "compressed pair" trick; C++20 makes things cleaner, but understanding traditional EBO remains highly useful. - ------- - -## Explaining the Concept with an Everyday Analogy - -Imagine a container object with two members: one is an actual storage warehouse (like an `int` or a pointer), and the other is an empty "tag" that merely represents behavior and holds no data. Intuitively, you might allocate space for each member, but the language standard allows the compiler to place the "empty tag" base class subobject in a location that requires no extra space (such as reusing the first byte of the derived object). This makes the derived object smaller overall and more cache-friendly—which is the core of EBO. - -The standard applies the "most derived object must have a non-zero size" requirement to the most derived object, but **base class subobjects are exempt from this restriction**. The compiler can treat the size of an empty base class subobject as 0 (i.e., occupying no extra bytes). This is the exact legal basis for EBO. 
- ------- - -## A Simple Example - -```cpp -struct Empty {}; // 空类 - -struct A { - Empty e; // 成员,通常会占 1 字节 - int x; -}; - -struct B : Empty { // 继承 Empty —— EBO 有机会发生 - int x; -}; - -static_assert(sizeof(A) >= sizeof(int) + 1); -static_assert(sizeof(B) == sizeof(int)); // 在支持 EBO 的编译器上通常成立 - -``` - -In the example above, `Empty e` in `A` is a data member, which by language rules must occupy a non-zero byte (to guarantee semantics like array indexing). In contrast, `B` takes `Empty` as a base class, allowing the compiler to "compress" it into `B`'s layout. As a result, `sizeof(B)` typically equals `sizeof(int)` (though details may vary across different compilers/ABIs). - ------- - -## Why Do We Often See the "Inheriting from Empty Classes" Pattern in the STL and Libraries? - -In the standard library, types like allocators, comparators, and deleters are often stateless empty classes. If we use them as members, they waste space; using them as base classes (typically via **private inheritance**) enables EBO and reduces object size. Many implementations wrap the "pointer + empty deleter" scenario into a "compressed pair" or similar utility to achieve minimal object size. Microsoft's STL blog and other implementations demonstrate the ubiquity of this approach. - ------- - -## C++20: `[[no_unique_address]]` Makes "Empty Member Optimization" Formal and Safe - -Traditional EBO can only be achieved through inheritance (members cannot be compressed). The `[[no_unique_address]]` attribute introduced in C++20 allows **members** to share addresses with other subobjects (i.e., allowing zero-size semantics), achieving an EBO-like effect using member syntax. This makes the code more intuitive and the semantics clearer. For example: - -```cpp -struct Empty {}; -struct Holder { - [[no_unique_address]] Empty e; // 现在可以和其它成员共享地址 - int x; -}; - -``` - -This looks much better in practice than private inheritance and avoids the potential exposure of interfaces that inheritance brings. 
cppreference and various implementation articles summarize the semantics and limitations of `[[no_unique_address]]`, and we strongly recommend prioritizing it wherever C++20 is available. - ------- - -## Common Misconceptions and Pitfalls (Pay Close Attention) - -- **"Empty class subobjects definitely don't have an address"—Wrong.** The standard allows a base class subobject to share the starting address of the most derived object. This means the address of the base class subobject might be identical to that of another subobject (or the object as a whole). Do not write code that relies on subobject address uniqueness. -- **Why can't `std::pair` directly leverage EBO?** Because `std::pair` treats `first` and `second` as **members** rather than empty base classes, traditional EBO cannot apply to members (unless using `[[no_unique_address]]` or refactoring the implementation into a compressed-pair style). This is exactly why internal implementation tricks like "compressed pair" exist. -- **Multiple empty base classes can sometimes interfere with each other**: If you inherit from multiple empty types, the compiler will attempt to apply EBO to all of them. However, in certain situations (such as duplicate base types, or identical types caused by ABI or nested templates), the optimization is restricted. A common practice is to make each empty base class type "unique" to the compiler (e.g., by parameterizing with templates) to ensure the compression takes effect. Some refer to this issue as "needing to differentiate base class types." - ------- - -## Practical Advice - -1. **Don't prematurely optimize by default**: Writing policy classes as empty classes using either members or inheritance is fine; prioritize readability. -2. **If you need minimal memory or are implementing a library (like smart pointers or containers), prioritize `[[no_unique_address]]` (C++20) or controlled private inheritance EBO tricks.** C++20 makes the code more intuitive. -3. 
**Don't rely on object or subobject address uniqueness**: When writing debugging, serialization, or comparison logic, avoid using addresses to distinguish empty subobjects. Addresses might be identical, and the standard permits this reuse. - ------- - -## Run Online - -Run the EBO example online to compare the `sizeof` changes when an empty class is used as a member versus a base class: - - - -## Summary - -EBO is a micro-optimization in C++ that "delivers visible results without showing off": it prevents empty policy classes from wasting bytes. Historically, we implemented EBO using private inheritance, but modern C++ (C++20) uses `[[no_unique_address]]` to allow empty members to be compressed as well, making the code more intuitive and safer. In real-world engineering, prioritize writing clear, maintainable code: when object size becomes sensitive, then apply EBO, `[[no_unique_address]]`, or compressed-pair tricks to manually optimize, and verify the behavior on your target compiler. diff --git a/documents/en/vol8-domains/embedded/06-array-vs-raw-arrays.md b/documents/en/vol8-domains/embedded/06-array-vs-raw-arrays.md deleted file mode 100644 index f6daadb49..000000000 --- a/documents/en/vol8-domains/embedded/06-array-vs-raw-arrays.md +++ /dev/null @@ -1,94 +0,0 @@ ---- -chapter: 5 -cpp_standard: -- 11 -- 14 -- 17 -- 20 -description: Comparing `std::array` with traditional arrays -difficulty: intermediate -order: 6 -platform: stm32f1 -prerequisites: -- 'Chapter 3: 内存与对象管理' -reading_time_minutes: 5 -tags: -- cpp-modern -- intermediate -- stm32f1 -title: std::array vs C-Style Arrays -translation: - source: documents/vol8-domains/embedded/06-array-vs-raw-arrays.md - source_hash: 5ba1a0a5b190946c88d042ddd4962f04ee54aa4d95df3de90b371f337ca695ae - translated_at: '2026-05-26T12:23:23.370605+00:00' - engine: anthropic - token_count: 696 ---- -# Embedded C++ Tutorial — `std::array` vs C Arrays, Do You Know the Difference? 
- -When writing embedded code, we often hesitate between two approaches: `int buf[16];` and `std::array buf;`. If we care about both performance and elegance, we naturally want to know: which one is more "embedded-friendly"? - ------- - -## Why `std::array` Looks Like "a C Array Wearing a Coat" — But Is Actually Smarter - -On the surface, `std::array` is simply an aggregate type containing a `T elems[N]`: the elements are contiguous in memory, and the layout has no mysterious overhead. In many scenarios, `std::array` is equivalent to a raw array in terms of performance and memory footprint. In other words, we don't pay any extra runtime cost for switching to `std::array`. - -But `std::array` wraps the array in a type: it has value semantics (it can be copied and assigned), provides `.size()`, offers `.data()`, includes `begin()`/`end()`, integrates seamlessly with STL algorithms, supports `constexpr` (with modern compilers), and can be better deduced as a template parameter. Most importantly, it makes "length is part of the type" explicit, making it much harder to lose size information when calling interfaces. - -In short: `std::array` is a "safer, more modern" array. - ------- - -## The Blunt Honesty and Fatal Naivety of Raw C Arrays - -The advantage of raw arrays is "zero abstraction" — we have complete control over memory. This is crucial in startup code, driver layers, and buffers located in specific address spaces (such as those mapped to peripheral register addresses). Raw arrays don't pose challenges regarding ABI, the linker, or alignment — as long as we know what we are doing, they are highly reliable. - -However, raw arrays also bring a host of common pitfalls: they decay into pointers in function parameters (so `sizeof` yields a pointer size inside a function), cannot be directly copied or assigned (`b = a;` will fail to compile), and offer no bounds or size protection. 
In embedded code, these "missing conveniences" force us to frequently write `memcpy`, constantly double-check if `N` is correct, and make rookie mistakes like "forgetting to pass the length" during code reviews. - -A real-world scenario: we pass a raw array to a C API for DMA, but forget to tell the caller the length. As a result, DMA writes out of bounds and overwrites our most precious variables. Raw arrays don't warn us about these low-probability, high-cost errors. - ------- - -## Advantages of `std::array`: Safer, More Readable, and More Modern C++ Friendly - -The everyday advantages of `std::array` can be summarized as: clear semantics, friendly interfaces, and direct compatibility with algorithms. For example, `std::sort(a.begin(), a.end())` or `std::span(a)` are readily available benefits. `std::array` can be `=`, copied, or even safely returned as a function return value (without decaying), which makes code in mid-level logic more concise and less prone to memory manipulation bugs. - -In an embedded context, this means test code, unit test stubs, and buffer wrappers will be much cleaner: we can write functions that return `std::array` instead of a messy pile of `memcpy`. Furthermore, when the compiler supports `constexpr`, `std::array` can construct constant tables at compile time, resulting in code that is both efficient and safe. - ------- - -## So When Should We Stick with Raw C Arrays? - -`std::array` is great, but it's not invincible. In the following scenarios, raw arrays remain the more appropriate choice: - -1. **Initialization phases or early boot code (startup / crt0)**: Before `main()`, C++ global construction rules and runtime support can be troublesome. Raw arrays are more straightforward and reliable in such code, especially when we need to absolutely guarantee that no constructors or runtime code are involved. -2. 
**Placing objects in specific linker sections / at fixed addresses**: Things like interrupt vector tables, device-mapped buffers, and bootloader tables often require precise declaration of object location and byte order in the linker script. Raw arrays map more directly to the desired memory layout, reducing unnecessary abstraction. -3. **Strict ABI or interoperability with external C APIs where raw pointers are required**: Although `std::array` has `.data()`, in scenarios that are highly particular about binary compatibility, using raw arrays is more intuitive during audits (especially in legacy codebases). -4. **Extreme resource constraints where any extra compiler-generated metadata must be avoided**: Such situations are rare, but they do exist in some ultra-embedded or lowest-level kernel code. - ------- - -## The Bottom Line - -Raw arrays are simple, reliable tools suited for the layer closest to the hardware; `std::array` is a more modern, safer container that aligns better with C++ philosophy, suited for business logic, algorithm layers, and the vast majority of embedded application code. Treat them as two different knives in our toolbox: use the survival knife (raw array) to fix chip pins, and use the precision knife (`std::array`) to write protocol parsing and buffer logic. - -One final piece of advice: when we can express the array size as a template parameter of `std::array`, we should use `std::array`; when we must control every single byte precisely in a linker script or the earliest boot code, we should fall back to raw arrays without hesitation. Embedded development isn't about "staying pure" — it's about using the right tool for the actual need. `std::array` will often result in less code and fewer bugs, but occasionally we still need to roll up our sleeves and reach into raw memory to fix the lowest levels. 
- ------- - -## Run Online - -Run the `std::array` vs C array comparison example online to verify zero-overhead abstraction: - - - -## Code Examples diff --git a/documents/en/vol8-domains/embedded/core-embedded-cpp-index.md b/documents/en/vol8-domains/embedded/core-embedded-cpp-index.md index ccbf392da..24a9522e3 100644 --- a/documents/en/vol8-domains/embedded/core-embedded-cpp-index.md +++ b/documents/en/vol8-domains/embedded/core-embedded-cpp-index.md @@ -11,49 +11,49 @@ chapter: 0 order: 0 translation: source: documents/vol8-domains/embedded/core-embedded-cpp-index.md - source_hash: a1521c3ab6b42039ed39c35f8917aa800cd0f950e0fac88f013377abee709408 - translated_at: '2026-05-26T12:23:24.840238+00:00' + source_hash: 72e8650253a4f675f6d4acacf91e083e3941857d300616c6228b1b73aef31e67 + translated_at: '2026-06-13T11:53:53.543475+00:00' engine: anthropic - token_count: 886 + token_count: 858 --- # Table of Contents -This is the table of contents for *Modern C++ Tutorial for Embedded Systems*. Click any item to jump directly to the corresponding chapter. +This is the table of contents for "Modern C++ for Embedded Systems Tutorial". Click on a link to jump directly to the corresponding chapter. 
-## Chapter 0 - Preface and Prerequisites +## Chapter 0 - Preface and Fundamentals - [Preface](../../vol1-fundamentals/00-preface.md) -- [Resource and Real-Time Constraints in Embedded Systems](./01-resource-and-realtime-constraints.md) -- [Crash Course in C](../../vol1-fundamentals/02-c-language-crash-course.md) -- [Getting Started with C++98: Namespaces, References, and Scope Resolution](../../vol1-fundamentals/03A-cpp98-namespace-reference.md) +- [Resource and Real-time Constraints in Embedded Systems](./01-resource-and-realtime-constraints.md) +- [C Language Crash Course](../../vol1-fundamentals/02-c-language-crash-course.md) +- [C++98 Introduction: Namespaces, References, and Scope Resolution](../../vol1-fundamentals/03A-cpp98-namespace-reference.md) - [C++98 Function Interfaces: Overloading and Default Arguments](../../vol1-fundamentals/03B-cpp98-function-overload-default-args.md) -- [C++98 OOP: A Deep Dive into Classes and Objects](../../vol1-fundamentals/03C-cpp98-classes-and-objects.md) +- [C++98 OOP: Deep Dive into Classes and Objects](../../vol1-fundamentals/03C-cpp98-classes-and-objects.md) - [C++98 OOP: Inheritance and Polymorphism](../../vol1-fundamentals/03D-cpp98-inheritance-polymorphism.md) - [C++98 Operator Overloading](../../vol1-fundamentals/03E-cpp98-operator-overloading.md) -- [Advanced C++98: Type Conversions, Dynamic Memory, and Exception Handling](../../vol1-fundamentals/03F-cpp98-casts-memory-exceptions.md) -- [When to Use C++ and Which Features to Use (Trade-offs and Disabling Features)](../../vol1-fundamentals/04-when-to-use-cpp.md) +- [C++98 Advanced: Type Casting, Dynamic Memory, and Exception Handling](../../vol1-fundamentals/03F-cpp98-casts-memory-exceptions.md) +- [When to Use C++ and Which Features (Compromises and Disabled Features)](../../vol1-fundamentals/04-when-to-use-cpp.md) - [Language Selection Principles: The Real Trade-off Between Performance and 
Maintainability](../../vol1-fundamentals/05-language-choice-performance-vs-maintainability.md) -- [Does C++ Inevitably Lead to Code Bloat?](../../vol6-performance/06-evaluating-performance-and-size.md) +- [Does C++ Necessarily Lead to Code Bloat?](../../vol6-performance/06-evaluating-performance-and-size.md) ## Chapter 1 - Build Toolchain -- [A Casual Chat on Cross-Compilation and a Simple CMake Guide](../../vol7-engineering/01-cross-compilation-and-cmake.md) -- [Common Compiler Flags Guide](../../vol7-engineering/02-compiler-options.md) -- [Linker and Linker Scripts](../../vol7-engineering/03-linker-and-linker-scripts.md) +- [A Casual Chat on Cross-compilation and a Simple CMake Guide](../../vol7-engineering/01-cross-compilation-and-cmake.md) +- [Common Compiler Options Guide](../../vol7-engineering/02-compiler-options.md) +- [Linkers and Linker Scripts](../../vol7-engineering/03-linker-and-linker-scripts.md) ## Chapter 2 - Zero-Overhead Abstraction - [Zero-Overhead Abstraction](./01-zero-overhead-abstraction.md) - [Inlining and Compiler Optimization](../../vol6-performance/02-inline-and-compiler-optimization.md) -- [CRTP vs Runtime Polymorphism: Did You Know?](./04-crtp-vs-runtime-polymorphism.md) +- [CRTP vs Runtime Polymorphism, Did You Know?](./04-crtp-vs-runtime-polymorphism.md) ## Chapter 3 - Memory and Object Management - [Initializer Lists](../../vol3-standard-library/01-initializer-lists.md) -- [Empty Base Optimization (EBO)](./04-empty-base-optimization.md) +- [Empty Base Optimization (EBO)](../../vol4-advanced/03-empty-base-optimization.md) - [Object Size and Trivial Types](../../vol3-standard-library/05-object-size-and-trivial-types.md) -## Chapter 4 - Compile-Time Computation +## Chapter 4 - Compile-time Computation - [if constexpr](../../vol4-advanced/vol3-metaprogramming-cpp20-23/index.md) @@ -62,9 +62,8 @@ This is the table of contents for *Modern C++ Tutorial for Embedded Systems*. 
Cl - [Dynamic Allocation Issues](./01-dynamic-allocation-issues.md) - [Static Storage and Stack Allocation Strategies](./02-static-and-stack-allocation.md) - [Object Pool Pattern](./03-object-pool-pattern.md) -- [Alternative Strategies When Disabling or Limiting the Heap: Using Placement New](./04-placement-new.md) +- [Alternative Strategies When Heap is Disabled or Restricted: Using Placement New](./04-placement-new.md) - [Fixed Pool Allocation](./05-fixed-pool-allocation.md) -- [array vs Raw Arrays: Did You Know?](./06-array-vs-raw-arrays.md) ## Chapter 7 - Containers and Data Structures @@ -83,18 +82,18 @@ This is the table of contents for *Modern C++ Tutorial for Embedded Systems*. Cl - [atomic](../../vol5-concurrency/ch03-atomic-memory-model/01-atomic-operations.md) - [memory_order](../../vol5-concurrency/ch03-atomic-memory-model/02-memory-ordering.md) -- [Lock-Free Data Structure Design](../../vol5-concurrency/ch04-concurrent-data-structures/03-lock-free-basics.md) +- [Lock-free Data Structure Design](../../vol5-concurrency/ch04-concurrent-data-structures/03-lock-free-basics.md) - [mutex and RAII Guards](../../vol5-concurrency/ch02-mutex-condition-sync/01-mutex-and-raii-guards.md) - [Writing Interrupt-Safe Code](./05-interrupt-safe-coding.md) - [Critical Section Protection Techniques](./05-interrupt-safe-coding.md) -## Chapter 11 - Modern C++ Features at a Glance +## Chapter 11 - Modern C++ Feature Overview -- [Spaceship Operator](../../vol4-advanced/05-spaceship-operator.md) +- [Three-way Comparison Operator](../../vol4-advanced/05-spaceship-operator.md) -## Chapter 12 - Template Basics +## Chapter 12 - Template Fundamentals -- [Template Basics (C++11-14)](../../vol4-advanced/vol1-basics-cpp11-14/index.md) +- [Template Fundamentals (C++11-14)](../../vol4-advanced/vol1-basics-cpp11-14/index.md) - [Modern Template Techniques (C++17)](../../vol4-advanced/vol2-modern-cpp17/index.md) - [Metaprogramming Essentials 
(C++20-23)](../../vol4-advanced/vol3-metaprogramming-cpp20-23/index.md) - [Generic Design Patterns in Practice](../../vol4-advanced/vol4-generics-patterns/index.md) diff --git a/documents/en/vol8-domains/embedded/index.md b/documents/en/vol8-domains/embedded/index.md index a57a45fcd..22369b870 100644 --- a/documents/en/vol8-domains/embedded/index.md +++ b/documents/en/vol8-domains/embedded/index.md @@ -1,6 +1,6 @@ --- title: Embedded Development -description: Practical applications of modern C++ in embedded systems +description: Practical application of modern C++ in embedded systems platform: stm32f1 tags: - cpp-modern @@ -8,46 +8,41 @@ tags: - stm32f1 translation: source: documents/vol8-domains/embedded/index.md - source_hash: c9dc7bda40253cbd57d2c50d58741938ec30d505ad73a45c789c284f51f51ee1 - translated_at: '2026-05-26T12:23:33.146275+00:00' + source_hash: fdc087cb31a2b391c8496105cc738efeabcf166d5d50db41709627f7ed1d988a + translated_at: '2026-06-13T11:54:01.076003+00:00' engine: anthropic - token_count: 352 + token_count: 442 --- # Embedded Development -> Status: Planned +> What modern C++ can and cannot do in resource-constrained embedded systems—from zero-overhead abstractions and memory management to peripheral programming, interrupt concurrency, and finally STM32 practice and RTOS. -## Overview +## STM32F1 Hands-on Series -This subdomain covers the application of modern C++ in embedded systems, including resource constraints, zero-overhead abstraction, memory management, peripheral programming, interrupt concurrency, STM32 hands-on projects, RTOS, and more. +This is a complete roadmap for writing STM32 code in modern C++ from scratch. It follows the sequence "Environment → LED → Button → UART", refactoring each peripheral from C all the way to C++23: -An estimated 45 to 50 articles. +- [Development Environment Setup](00-env-setup/index.md) — Toolchain, project structure, CMake, WSL2 USB passthrough, GDB debugging. 
+- [LED Blinking: Evolution from C to C++](01-led/index.md) — From HAL registers to templates and `constexpr`. +- [Button Input: Debouncing, State Machines, and Type Safety](02-button/index.md) — From polling to interrupts and state machines. +- [UART Serial Communication](03-uart/index.md) — From protocols to interrupt-driven I/O and `std::expected` error handling. -## Chapter Navigation +## Embedded Special Topics -> Content is being written, stay tuned. - -## Existing Content (To Be Rewritten) - -This directory contains a large number of embedded-related articles and an STM32F1 hands-on series migrated from an older tutorial. These will be comprehensively rewritten in the future. - -### Article List +These are special topic articles migrated from an older tutorial, covering zero-overhead abstractions, memory management, register access, and interrupt safety. They serve as supplementary reading for the hands-on series: - 嵌入式现代 C++ 教程——零开销抽象 - 嵌入式的资源与实时约束 - 动态内存的代价:碎片化与不确定性 - 嵌入式 C++ 教程——静态存储与栈上分配策略 - 嵌入式 C++ 教程:对象池模式 - 编译期多态 vs 运行时多态 - 空基类优化(EBO):C++ 的瘦身技巧 - 嵌入式 C++ 教程:placement new - 嵌入式 C++ 教程:Slab / Arena 实现与比较 - 嵌入式 C++ 教程——ETL - 中断安全的代码编写 - 嵌入式 C++ 教程——std::array vs C 数组 - 循环缓冲区 - 侵入式容器设计 - 类型安全的寄存器访问 - 目录 + Modern C++ for Embedded—Zero-Overhead Abstraction + Resource and Real-Time Constraints in Embedded Systems + The Cost of Dynamic Memory: Fragmentation and Uncertainty + Embedded C++ Tutorial—Static Storage and Stack Allocation Strategies + Embedded C++ Tutorial: Object Pool Pattern + Compile-Time Polymorphism vs Runtime Polymorphism + Embedded C++ Tutorial: Placement New + Embedded C++ Tutorial: Slab / Arena Implementation and Comparison + Embedded C++ Tutorial—ETL + Writing Interrupt-Safe Code + Circular Buffer + Intrusive Container Design + Type-Safe Register Access + Table of Contents diff --git a/documents/en/vol9-open-source-project-learn/chrome/01_once_callback/full/01-6-once-callback-testing-and-perf.md 
b/documents/en/vol9-open-source-project-learn/chrome/01_once_callback/full/01-6-once-callback-testing-and-perf.md index 6dfa0239e..1111eefc6 100644 --- a/documents/en/vol9-open-source-project-learn/chrome/01_once_callback/full/01-6-once-callback-testing-and-perf.md +++ b/documents/en/vol9-open-source-project-learn/chrome/01_once_callback/full/01-6-once-callback-testing-and-perf.md @@ -2,7 +2,7 @@ title: 'OnceCallback in Practice (Part 6): Testing and Performance Comparison' description: We systematically design six categories of test cases to verify all core behaviors of `OnceCallback`, and compare the performance differences against the - original Chromium implementation and standard library alternatives. + original Chromium implementation and standard library solutions. chapter: 1 order: 6 tags: @@ -25,8 +25,8 @@ related: - OnceCallback 前置知识(五):std::move_only_function translation: source: documents/vol9-open-source-project-learn/chrome/01_once_callback/full/01-6-once-callback-testing-and-perf.md - source_hash: 46dfe09ed16416d6337d24255d3d4c5cf2359d9636d20273d62e69749c7039b1 - translated_at: '2026-05-26T12:26:21.506854+00:00' + source_hash: 2dfbbd169402f28789a1fe88fb11db240e59ec83f8a570b0880a99b0abb1b0bb + translated_at: '2026-06-13T11:54:42.854860+00:00' engine: anthropic token_count: 1889 --- @@ -34,229 +34,213 @@ translation: ## Introduction -At this point, the four core features of OnceCallback—the core skeleton, `bind_once`, cancellation token, and `then()` chaining—are fully implemented. In this article, we do two things: first, we systematically outline our testing strategy to ensure the implementation is correct under various boundary conditions; second, we analyze the performance differences between our implementation, the original Chromium version, and standard library approaches, clarifying what we sacrificed and what we gained. 
+At this point, the four core features of OnceCallback—core skeleton, `bind_once`, cancellation tokens, and `then()` chaining—have all been implemented. In this article, we will do two things: first, systematically review the testing strategy to ensure the implementation is correct under various boundary conditions; second, analyze the performance differences between our implementation, the original Chromium version, and standard library approaches, to understand exactly what we traded away and what we gained. > **Learning Objectives** > -> - Master the method of organizing test cases by invariants -> - Understand the design intent and key assertions of the six test categories -> - Be clear on the performance trade-offs between our OnceCallback and the original Chromium version +> - Master the method of organizing test cases by invariants. +> - Understand the design intent and key assertions of the six test categories. +> - Clarify the performance trade-offs between our OnceCallback and the Chromium original. --- -## Test Framework Setup +## Setting Up the Test Framework -We use Catch2 v3 as our testing framework, automatically fetching the dependency via CPM (CMake Package Manager). +We use Catch2 v3 as our testing framework, automatically fetching dependencies via CPM (CMake Package Manager). 
```cmake -# test/CMakeLists.txt -CPMAddPackage("gh:catchorg/Catch2@3.7.1") +# tests/CMakeLists.txt +cpmaddpackage("gh:catchorg/Catch2@3.4.0") -add_executable(test_once_callback test_once_callback.cpp) -target_link_libraries(test_once_callback PRIVATE once_callback Catch2::Catch2WithMain) -target_compile_options(test_once_callback PRIVATE -Wall -Wextra -Wpedantic) - -add_test(NAME test_once_callback COMMAND test_once_callback) +# Enable Catch2's main function generator +catch_discover_tests(sources) ``` -Catch2's `REQUIRE` macro is stronger than `assert()` because it reports the specific failing expression, file, and line number, and continues executing subsequent checks within the same `TEST_CASE`. `REQUIRE_THROWS_AS` is specifically used to verify exception types. +Catch2's `REQUIRE` macro is superior to `assert` because it reports the specific failed expression, file, and line number, and continues executing subsequent checks within the same section. `REQUIRE_THROWS_AS` is specifically used to verify exception types. -Running the tests: in the `build/` directory, run `cmake --build . && ctest`. +To run the tests: execute `ctest` in the `build` directory. --- ## Six Categories of Test Cases -We organize the tests into six categories, each focusing on an independent design invariant. Organizing tests by invariant rather than by feature makes it less likely to miss edge cases. +We organize the tests into six categories, each focusing on a specific design invariant. Organizing tests by invariants rather than by features makes it less likely to miss boundary conditions. 
### Category A: Basic Invocation and Return Values ```cpp -TEST_CASE("non-void return", "[once_callback]") { - OnceCallback cb([](int a, int b) { return a + b; }); - int result = std::move(cb).run(3, 4); - REQUIRE(result == 7); -} - -TEST_CASE("void return", "[once_callback]") { - bool called = false; - OnceCallback cb([&called] { called = true; }); - std::move(cb).run(); - REQUIRE(called); +TEST_CASE("A: Basic invocation and return values", "[once_callback]") { + SECTION("Non-void callback returns correct value") { + OnceCallback cb = [] { return 42; }; + REQUIRE(cb() == 42); + } + + SECTION("Void callback executes normally") { + bool called = false; + OnceCallback cb = [&called] { called = true; }; + cb(); + REQUIRE(called); + } } ``` -Verifies the most basic construction and invocation behavior—non-void callbacks return the correct value, and void callbacks execute normally. The void return path takes a different branch in `if constexpr (std::is_void_v)`. +This verifies the most basic construction and invocation behavior—non-void callbacks return the correct value, and void callbacks execute normally. The void return path takes a different branch in `operator()`. 
### Category B: Move Semantics ```cpp -TEST_CASE("move-only capture", "[once_callback]") { - auto ptr = std::make_unique(42); - OnceCallback cb([p = std::move(ptr)] { return *p; }); - int result = std::move(cb).run(); - REQUIRE(result == 42); -} - -TEST_CASE("move semantics: source becomes null", "[once_callback]") { - OnceCallback cb([] { return 1; }); - OnceCallback cb2 = std::move(cb); - REQUIRE(cb.is_null()); - - int result = std::move(cb2).run(); - REQUIRE(result == 1); +TEST_CASE("B: Move semantics", "[once_callback]") { + SECTION("Move-only capture works") { + MoveOnly mo(1); + OnceCallback cb = [mo = std::move(mo)] { REQUIRE(mo.value == 1); }; + std::move(cb)(); + } + + SECTION("Move construction empties source") { + OnceCallback src = [] { return 10; }; + OnceCallback dst = std::move(src); + REQUIRE(src.IsEmpty()); // src is now empty + REQUIRE(dst() == 10); + } } ``` -The move-only capture test verifies that OnceCallback truly supports move-only callables—if the underlying implementation used `std::function` instead of `std::move_only_function`, this code would fail to compile. The move semantics test verifies that after a move construction, the source object transitions to the kEmpty state. +The move-only capture test verifies that OnceCallback truly supports move-only callables—if the underlying implementation used `std::function` instead of a custom storage, this code would fail to compile. The move semantics test verifies that the source object becomes `kEmpty` after move construction. -There is an easily confused concept here—move operations transfer ownership but do not trigger consumption. Only `run()` consumes the callback. `OnceCallback cb2 = std::move(cb1)` merely transfers ownership, and the callback remains active until `cb2.run()`. +There is a conceptual point that is easily confused—move operations transfer ownership, but do not trigger consumption. Only `operator()` consumes the callback. 
`std::move` merely transfers ownership; the callback remains active until it is actually invoked. ### Category C: Single-Invocation Constraint -This constraint is implemented via deducing this + `static_assert`—`cb.run()` triggers a compile error, while only `std::move(cb).run()` can pass. No runtime testing is needed; successful compilation is itself the verification. +This constraint is implemented via deducing this + `static_assert`—attempting to call `operator()` on an lvalue triggers a compile error; only calling it on an rvalue (via `std::move`) passes. No runtime test is needed; successful compilation is itself the verification. ### Category D: Argument Binding ```cpp -TEST_CASE("bind_once basic", "[bind_once]") { - auto bound = bind_once([](int a, int b) { return a * b; }, 5); - int result = std::move(bound).run(8); - REQUIRE(result == 40); -} - -TEST_CASE("bind_once with member function", "[bind_once]") { - struct Calc { - int multiply(int a, int b) { return a * b; } - }; - Calc calc; - auto bound = bind_once(&Calc::multiply, &calc, 5); - int result = std::move(bound).run(8); - REQUIRE(result == 40); +TEST_CASE("D: Argument binding", "[once_callback]") { + SECTION("Partial binding for lambdas") { + auto add = [](int a, int b) { return a + b; }; + OnceCallback cb = OnceCallback(add).Bind(10, 20); + REQUIRE(cb() == 30); + } + + SECTION("Member function binding") { + struct Widget { + int Value() { return 99; } + }; + Widget w; + OnceCallback cb = OnceCallback::From<&Widget::Value>(&w); + REQUIRE(cb() == 99); + } } ``` -Covers partial argument binding for regular lambdas and member function binding. The lifetime trap of member function binding was discussed in previous articles—`&calc` is a raw pointer, so the caller is responsible for safety. +This covers partial argument binding for normal lambdas and member function binding. 
The lifetime trap with member function binding was discussed in previous articles—`this` is a raw pointer, so the caller is responsible for safety. ### Category E: Cancellation Mechanism ```cpp -TEST_CASE("is_cancelled respects cancel token", "[once_callback]") { - auto token = std::make_shared(); - OnceCallback cb([] {}); - cb.set_token(token); - - REQUIRE_FALSE(cb.is_cancelled()); - token->invalidate(); - REQUIRE(cb.is_cancelled()); -} - -TEST_CASE("cancelled void callback does not execute", "[once_callback]") { - auto token = std::make_shared(); - bool called = false; - OnceCallback cb([&called] { called = true; }); - cb.set_token(token); - token->invalidate(); - - std::move(cb).run(); - REQUIRE_FALSE(called); -} - -TEST_CASE("cancelled non-void callback throws", "[once_callback]") { - auto token = std::make_shared(); - OnceCallback cb([] { return 1; }); - cb.set_token(token); - token->invalidate(); - - REQUIRE_THROWS_AS(std::move(cb).run(), std::bad_function_call); +TEST_CASE("E: Cancellation mechanism", "[once_callback]") { + SECTION("Token valid: callback executes") { + auto token = std::make_shared(); + OnceCallback cb = [token](int x) { return x * 2; }; + REQUIRE(cb(5) == 10); + } + + SECTION("Token invalid: void callback skips execution") { + auto token = std::make_shared(); + OnceCallback cb = [token] { FAIL("Should not be called"); }; + token->Invalidate(); + cb(); // Should skip without failing + } + + SECTION("Token invalid: non-void callback throws CallbackCanceled") { + auto token = std::make_shared(); + OnceCallback cb = [token](int x) { return x * 2; }; + token->Invalidate(); + REQUIRE_THROWS_AS(cb(5), CallbackCanceled); + } } ``` -Three key behaviors: no cancellation when the token is valid, void callbacks do not execute after the token is invalidated, and non-void callbacks throw `std::bad_function_call` after the token is invalidated. 
+Three key behaviors: callback executes when the token is valid; void callback skips execution when the token is invalid; non-void callback throws `CallbackCanceled` when the token is invalid. ### Category F: Then Composition ```cpp -TEST_CASE("then chains two callbacks", "[then]") { - auto cb = OnceCallback([](int x) { return x * 2; }) - .then([](int x) { return x + 10; }); - int result = std::move(cb).run(5); - REQUIRE(result == 20); // 5 * 2 + 10 -} - -TEST_CASE("then multi-level pipeline", "[then]") { - auto pipeline = OnceCallback([](int x) { return x * 2; }) - .then([](int x) { return x + 10; }) - .then([](int x) { return std::to_string(x); }); - std::string result = std::move(pipeline).run(5); - REQUIRE(result == "20"); -} - -TEST_CASE("then with void first callback", "[then]") { - int value = 0; - auto cb = OnceCallback([&value](int x) { value = x; }) - .then([&value] { return value * 3; }); - int result = std::move(cb).run(7); - REQUIRE(result == 21); +TEST_CASE("F: Then composition", "[once_callback]") { + SECTION("Two-stage non-void pipeline") { + OnceCallback first = [] { return 10; }; + OnceCallback second = first.Then([](int x) { + return std::to_string(x * 2); + }); + REQUIRE(second() == "20"); + } + + SECTION("Multi-stage pipeline crossing type boundaries") { + OnceCallback a = [] { return 5; }; + OnceCallback b = a.Then([](int i) { return std::to_string(i); }); + OnceCallback c = b.Then([](std::string s) { return s.size(); }); + REQUIRE(c() == 1); + } + + SECTION("Void prefix callback") { + bool called = false; + OnceCallback first = [&called] { called = true; }; + OnceCallback second = first.Then([] { return 42; }); + REQUIRE(!called); // Not called yet + REQUIRE(second() == 42); + REQUIRE(called); + } } ``` -Covers three composition patterns: two-stage non-void pipelines, multi-stage pipelines (crossing type boundaries from int to string), and void prefix callbacks. 
+This covers three composition patterns: two-stage non-void pipelines, multi-stage pipelines (crossing type boundaries from int to string), and void prefix callbacks. --- -## Performance Comparison: With the Original Chromium Version +## Performance Comparison: vs. Chromium Original ### Object Size ```cpp -std::cout << "sizeof(std::function): " - << sizeof(std::function) << " bytes\n"; -std::cout << "sizeof(std::move_only_function): " - << sizeof(std::move_only_function) << " bytes\n"; -// Chromium OnceCallback ≈ 8 bytes - -std::cout << "sizeof(OnceCallback): " - << sizeof(OnceCallback) << " bytes\n"; -// 我们的:move_only_function (32) + status (1) + token ptr (16) + padding -// 预估 56-64 bytes +static_assert(sizeof(OnceCallback) <= 64, "OnceCallback is too large"); ``` On GCC, typical values are `std::function` at about 32 bytes, `std::move_only_function` at about 32 bytes, and our `OnceCallback` at about 56-64 bytes. Chromium's is only 8 bytes. -The root cause of this difference lies in the storage strategy. Chromium puts all state in a heap-allocated `BindState`, and the callback object holds only a single pointer. We use SBO with `std::move_only_function` to inline small objects directly, avoiding heap allocation but increasing the object size. +The root of the difference lies in the storage strategy. Chromium places all state on the heap in a `ref_ptr`, and the callback object holds only a single pointer. We use SBO (Small Buffer Optimization) to store small objects inline, avoiding heap allocation but increasing the object size. ### Allocation Behavior -The SBO threshold for `std::move_only_function` is typically two to three pointer sizes (16-24 bytes). Lambdas capturing a small number of arguments usually fit into the SBO and do not trigger heap allocation. Large lambdas, however, allocate on the heap upon construction. +The SBO threshold for `std::function` is typically 2-3 pointer sizes (16-24 bytes). 
Lambdas capturing a small number of arguments usually fit into SBO and do not trigger heap allocation. Large lambdas trigger heap allocation upon construction. -Chromium always allocates on the heap (`new BindState`), but the allocation only happens once. After that, moving a OnceCallback simply copies a pointer (8 bytes), at an extremely low cost. Our approach does not allocate for small objects (SBO), but move operations require copying 32+ bytes. +Chromium always allocates on the heap (via `new`), but allocation only happens once. Subsequent move operations of OnceCallback simply copy a pointer (8 bytes), which is extremely cheap. Our approach allocates nothing for small objects (SBO), but move operations require copying 32+ bytes. ### Indirect Invocation Overhead -The invocation overhead is the same for both approaches—one indirect function call. Both `std::move_only_function::operator()` and Chromium's `polymorphic_invoke_` dispatch through a function pointer. Under `-O2` optimization, this indirect call cannot be inlined away. +The invocation overhead is identical for both approaches—one indirect function call. Both `std::move_only_function::operator()` and Chromium's `polymorphic_invoke_` dispatch via function pointers. Under `-O2` optimization, this indirect call cannot be inlined away. ### Trade-off Summary | Metric | Our Approach | Chromium Approach | -|--------|-------------|-------------------| +|--------|--------------|-------------------| | Callback object size | 56-64 bytes | 8 bytes | | Small lambda heap allocation | No allocation (SBO) | Always allocates | | Move cost | Copy 32+ bytes | Copy 1 pointer | | Implementation code size | ~200 lines | ~2000+ lines | -We sacrificed object compactness and extreme move performance in exchange for implementation simplicity—there is no need to manually write reference counting, function pointer tables, or `TRIVIAL_ABI` annotations. Zero heap allocation for small lambdas can actually be an advantage in certain low-frequency scenarios. 
For teaching purposes and most practical scenarios, this trade-off is worthwhile. +We sacrificed object compactness and peak move performance for implementation simplicity—no need to manually write reference counting, function pointer tables, or `TRIVIAL_ABI` annotations. Zero heap allocation for small lambdas can actually be an advantage in certain low-frequency scenarios. For educational purposes and most practical scenarios, this trade-off is worth it. --- ## Summary -In this article, we did two things. On the testing side, we designed 11 Catch2 test cases around six invariants (basic invocation, move semantics, single invocation, argument binding, cancellation mechanism, and chaining composition), covering all core behaviors of OnceCallback. On the performance side, we compared the differences with Chromium's OnceCallback in terms of object size, allocation behavior, and invocation overhead—our implementation trades compactness for simplicity. +In this article, we did two things. Regarding testing, we designed 12 Catch2 test cases around six invariants (basic invocation, move semantics, single invocation, argument binding, cancellation mechanism, and chaining), covering all core behaviors of OnceCallback. Regarding performance, we compared differences with Chromium's OnceCallback in object size, allocation behavior, and invocation overhead—our implementation traded compactness for simplicity. -With this, the design, implementation, and verification of the OnceCallback component are fully complete. Across 13 articles, from prerequisite knowledge to hands-on practice, we covered the complete knowledge chain from C++11 move semantics to C++23 deducing this. We hope this series helps you understand "how to design an industrial-grade component with modern C++"—it is not just about writing code, but more importantly, understanding the reasoning behind every design decision. 
+With this, the design, implementation, and verification of the OnceCallback component are fully complete. The 13 articles cover the complete knowledge chain from C++11 move semantics to C++23 deducing this, starting from prerequisite knowledge to practical application. I hope this series helps you understand "how to design an industrial-grade component with modern C++"—not just writing code, but more importantly, understanding the reasons behind every design decision. ## References - [Chromium base/functional/ source directory](https://source.chromium.org/chromium/chromium/src/+/main:base/functional/) - [cppreference: std::move_only_function](https://en.cppreference.com/w/cpp/utility/functional/move_only_function) -- [Catch2 documentation](https://github.com/catchorg/Catch2/tree/devel/docs) +- [Catch2 Documentation](https://github.com/catchorg/Catch2/tree/devel/docs) diff --git a/documents/en/vol9-open-source-project-learn/chrome/01_once_callback/full/index.md b/documents/en/vol9-open-source-project-learn/chrome/01_once_callback/full/index.md index 76921dc45..347c90156 100644 --- a/documents/en/vol9-open-source-project-learn/chrome/01_once_callback/full/index.md +++ b/documents/en/vol9-open-source-project-learn/chrome/01_once_callback/full/index.md @@ -1,44 +1,44 @@ # Complete Beginner Tutorial -This directory contains a complete beginner tutorial for the OnceCallback component, consisting of 13 articles that cover the full learning path from a C++ fundamentals review to component implementation and testing. +This directory contains a complete beginner tutorial for the OnceCallback component, consisting of 13 articles that cover the full learning path from reviewing basic C++ features to component implementation and testing. 
## Prerequisites -First, master the core C++ features required by OnceCallback: +First, master the core C++ features required for OnceCallback: - OnceCallback Prerequisites Quick Reference: C++11/14/17 Core Features Review - OnceCallback Prerequisites (Part 1): Function Types and Template Partial Specialization - OnceCallback Prerequisites (Part 2): std::invoke and the Uniform Calling Convention - OnceCallback Prerequisites (Part 3): Advanced Lambda Features - OnceCallback Prerequisites (Part 4): Concepts and requires Constraints - OnceCallback Prerequisites (Part 5): std::move_only_function (C++23) - OnceCallback Prerequisites (Part 6): Deducing this (C++23) + OnceCallback Prerequisite Cheat Sheet: Review of C++11/14/17 Core Features + OnceCallback Prerequisites (1): Function Types and Template Partial Specialization + OnceCallback Prerequisites (2): std::invoke and Uniform Callable Protocols + OnceCallback Prerequisites (3): Advanced Lambda Features + OnceCallback Prerequisites (4): Concepts and requires Constraints + OnceCallback Prerequisites (5): std::move_only_function (C++23) + OnceCallback Prerequisites (6): Deducing this (C++23) ## Hands-on Practice -After completing the prerequisites, we start implementing OnceCallback: +After completing the prerequisites, we will start implementing OnceCallback: - OnceCallback in Practice (Part 1): Motivation and API Design - OnceCallback in Practice (Part 2): Building the Core Skeleton - OnceCallback in Practice (Part 3): Implementing bind_once - OnceCallback in Practice (Part 4): Cancellation Token Design - OnceCallback in Practice (Part 5): then Chaining Composition - OnceCallback in Practice (Part 6): Testing and Performance Comparison + OnceCallback in Action (1): Motivation and API Design + OnceCallback in Action (2): Core Skeleton Setup + OnceCallback in Action (3): bind_once Implementation + OnceCallback in Action (4): Cancellation Token Design + OnceCallback in Action (5): then Chaining Composition + 
OnceCallback in Action (6): Testing and Performance Comparison ## Companion Code -The standalone C++ example code from the prerequisite chapters has been extracted into compilable minimal projects, located at: +The standalone C++ example code covered in the prerequisite chapters has been extracted into minimal, compilable projects located at: -``` +```text code/volumn_codes/vol9/full_tutorial_codes/chrome_design/ ``` -| Example | Topic | Source Article | Minimum C++ Standard | -|---------|-------|----------------|----------------------| +| Example | Topic | Source Article | Min C++ Standard | +|------|------|----------|-------------| | `01_move_semantics.cpp` | Move semantics, perfect forwarding, variadic templates | pre-00 | C++17 | | `02_smart_pointers.cpp` | unique_ptr, shared_ptr | pre-00 | C++17 | | `03_atomic_memory_order.cpp` | atomic, memory_order, enum class | pre-00 | C++17 | @@ -48,10 +48,10 @@ code/volumn_codes/vol9/full_tutorial_codes/chrome_design/ | `07_function_type_specialization.cpp` | Function types, FuncTraits, primary template + partial specialization | pre-01 | C++17 | | `08_invoke.cpp` | std::invoke, std::invoke_result_t | pre-02 | C++17 | | `09_concepts_requires.cpp` | concept, requires, not_the_same_t, template constructor hijacking | pre-04 | C++20 | -| `10_move_only_function.cpp` | std::move_only_function construction/move/null check/SBO | pre-05 | C++23 | +| `10_move_only_function.cpp` | std::move_only_function ctor/move/empty check/SBO | pre-05 | C++23 | | `11_deducing_this.cpp` | deducing this deduction rules, lvalue interception | pre-06 | C++23 | -How to build: +Build instructions: ```bash cd code/volumn_codes/vol9/full_tutorial_codes/chrome_design diff --git a/documents/en/vol9-open-source-project-learn/chrome/01_once_callback/hands_on/03-once-callback-testing.md b/documents/en/vol9-open-source-project-learn/chrome/01_once_callback/hands_on/03-once-callback-testing.md index 2a153e0fa..03c16d04d 100644 --- 
a/documents/en/vol9-open-source-project-learn/chrome/01_once_callback/hands_on/03-once-callback-testing.md +++ b/documents/en/vol9-open-source-project-learn/chrome/01_once_callback/hands_on/03-once-callback-testing.md @@ -1,8 +1,8 @@ --- title: 'once_callback Design Guide (Part 3): Testing Strategy and Performance Comparison' -description: We design test cases for `once_callback`, compare the performance differences - against the original Chromium implementation and the standard library approach, - and summarize the design trade-offs. +description: Design system test cases for `once_callback`, compare performance differences + with the original Chromium version and the standard library approach, and summarize + the design trade-offs. chapter: 1 order: 3 tags: @@ -23,8 +23,8 @@ related: - 回调取消与组合模式 translation: source: documents/vol9-open-source-project-learn/chrome/01_once_callback/hands_on/03-once-callback-testing.md - source_hash: 6124941ec9dd1a9bbfc13ff176aef746e090ea578624b7a1bf1cc36495d8aa8f - translated_at: '2026-05-26T12:30:04.292317+00:00' + source_hash: 25b2e6cda1efd104125afd610167dda7f5b19ee3cf346fb3ccf3b83bf83c8a54 + translated_at: '2026-06-13T11:55:17.638057+00:00' engine: anthropic token_count: 2581 --- @@ -32,273 +32,258 @@ translation: ## Introduction -In the previous two parts, we completed the design and implementation of `OnceCallback`. In this part, we do two things: first, we systematically outline a testing strategy and provide a complete test case checklist to ensure our implementation is correct under various boundary conditions; second, we analyze the performance differences between our implementation, the original Chromium version, and standard library approaches, clarifying what we sacrificed and what we gained in return. +In the previous two parts, we completed the design and implementation of `once_callback`. 
In this part, we will do two things: First, we will systematically review the testing strategy and provide a complete checklist of test cases to ensure our implementation is correct under various boundary conditions. Second, we will analyze the performance differences between our implementation, the original Chromium version, and the standard library approach, to understand exactly what we sacrificed and what we gained. > **Learning Objectives** > -> - Master the six categories of test case design for `OnceCallback` -> - Understand the meaning of performance metrics such as `sizeof`, SBO threshold, and indirect call overhead -> - Clearly understand the trade-offs between our `OnceCallback` and Chromium's `OnceCallback` +> - Master the six categories of test case design for `once_callback` +> - Understand the meaning of performance metrics like `sizeof`, SBO threshold, and indirect call overhead +> - Clarify the trade-offs between our `once_callback` and Chromium's `OnceCallback` --- ## Testing Strategy -We organize our tests into six categories, each focusing on an independent design invariant. Organizing tests by invariant rather than by feature makes it less likely to miss edge cases—because each invariant is itself a correctness guarantee, and the purpose of testing is to verify that these guarantees hold under various scenarios. +We organize our tests into six categories, each focusing on a specific design invariant. Organizing tests by invariants rather than by features makes it less likely to miss edge cases—because each invariant is itself a correctness guarantee, and the goal of testing is to verify that these guarantees hold in various scenarios. -Our actual test code uses the Catch2 framework, with CMake + CPM for dependency management. The test cases listed below correspond one-to-one with the actual code in `code/volumn_codes/vol9/chrome_design/test/test_once_callback.cpp`. +Our actual test code uses the Catch2 framework, managed via CMake + CPM. 
The test cases listed below correspond one-to-one with the actual code in `test/test_once_callback.cpp`. ### Category A: Basic Invocation and Return Values -These tests verify the basic construction and invocation behavior of `OnceCallback`. +These tests verify the basic construction and invocation behavior of `once_callback`. ```cpp -TEST_CASE("non-void return", "[once_callback]") { - OnceCallback cb([](int a, int b) { return a + b; }); - int result = std::move(cb).run(3, 4); - REQUIRE(result == 7); +// A1: Basic construction and invocation +TEST_CASE("construct and invoke") { + auto cb = once_callback([]() { return 42; }); + REQUIRE(cb() == 42); } -TEST_CASE("void return", "[once_callback]") { +// A2: void return type +TEST_CASE("void return type") { bool called = false; - OnceCallback cb([&called] { called = true; }); - std::move(cb).run(); + auto cb = once_callback([&called]() { called = true; }); + cb(); REQUIRE(called); } ``` -The most basic scenario—construct a callback, invoke it, and verify the return value. The `void` return type takes a different branch in `if constexpr (std::is_void_v)`, confirming that our compile-time branching logic is correct. +The most basic scenario—construct a callback, invoke it, and verify the return value. The `void` return type exercises a different branch of `operator()`, confirming that our compile-time branching logic is correct. ### Category B: Move Semantics These tests verify the move-only constraint and the correctness of move operations. 
```cpp -TEST_CASE("move-only capture", "[once_callback]") { - auto ptr = std::make_unique(42); - OnceCallback cb([p = std::move(ptr)] { return *p; }); - int result = std::move(cb).run(); - REQUIRE(result == 42); +// B1: Move-only capture +TEST_CASE("move-only capture") { + auto uptr = std::make_unique(42); + // std::unique_ptr is move-only, so the lambda is move-only + auto cb = once_callback([up = std::move(uptr)]() { return *up; }); + REQUIRE(cb() == 42); } -TEST_CASE("move semantics: source becomes null", "[once_callback]") { - OnceCallback cb([] { return 1; }); - OnceCallback cb2 = std::move(cb); - REQUIRE(cb.is_null()); +// B2: Move construction and empty state +TEST_CASE("move construction") { + auto cb1 = once_callback([]() { return 1; }); + auto cb2 = std::move(cb1); - int result = std::move(cb2).run(); - REQUIRE(result == 1); + // cb1 is now moved-from (empty) + REQUIRE_FALSE(cb1.valid()); + // cb2 is valid + REQUIRE(cb2() == 1); } ``` -The move-only capture test (where `std::make_unique(42)` is captured into a lambda) confirms that `OnceCallback` truly supports move-only callables—if the underlying implementation used `std::function` instead of `std::move_only_function`, this code would fail to compile outright. The move semantics test verifies that after a move construction, the source object enters the `kEmpty` state (checked via `is_null()`), while the destination object remains valid and can be invoked normally. +The move-only capture test (where `std::unique_ptr` is captured into a lambda) confirms that `once_callback` truly supports move-only callables—if the underlying implementation used `std::function` instead of `std::move_only_function`, this code would fail to compile. The move semantics test verifies that after move construction, the source object enters an empty state (checked via `valid()`), and the target object remains valid and callable. 
-There is an easily confused concept here—move operations transfer ownership but do not trigger consumption. Only `run()` actually consumes the callback. This distinction is also important in Chromium: `PostTask(FROM_HERE, std::move(cb))` merely transfers ownership, and the callback remains active until the task is executed. +There is a conceptual point that is easily confused—move operations transfer ownership but do not trigger consumption. Only `operator()` consumes the callback. This distinction is important in Chromium as well: `PostTask(FROM_HERE, std::move(cb))` simply transfers ownership; the callback remains active until the task is actually executed. ### Category C: Single-Invocation Constraint -These tests verify the core semantic of "invoke once to consume." In the Category A and B tests, we already covered the normal invocation path. Category C focuses on the compile-time interception of lvalue invocations. This constraint is implemented through deducing this + `static_assert`—if you write `cb.run()` instead of `std::move(cb).run()`, the compiler will directly report an error, with a message explicitly telling the caller to use `std::move`. This part requires no runtime testing; successful compilation is itself the verification. +These tests verify the core semantic of "consume upon invocation". While Categories A and B covered the normal invocation paths, Category C focuses on compile-time interception of lvalue invocation. This constraint is implemented via deducing this + `static_assert`—if you write `cb()` instead of `std::move(cb)()`, the compiler will error out, explicitly telling the caller to use `std::move`. This part requires no runtime tests; the fact that it compiles is itself the verification. 
### Category D: Argument Binding ```cpp -TEST_CASE("bind_once basic", "[bind_once]") { - auto bound = bind_once([](int a, int b) { return a * b; }, 5); - int result = std::move(bound).run(8); - REQUIRE(result == 40); +// D1: Partial argument binding +TEST_CASE("partial binding") { + auto cb = once_callback([](int a, int b) { return a + b; }); + auto bound_cb = std::move(cb).bind(10); + REQUIRE(bound_cb(5) == 15); } -TEST_CASE("bind_once with member function", "[bind_once]") { - struct Calc { - int multiply(int a, int b) { return a * b; } +// D2: Member function binding +TEST_CASE("member function binding") { + struct Adder { + int add(int a, int b) const { return a + b; } }; - Calc calc; - auto bound = bind_once(&Calc::multiply, &calc, 5); - int result = std::move(bound).run(8); - REQUIRE(result == 40); + Adder adder; + auto cb = once_callback(adder, &Adder::add, 10); + REQUIRE(cb(5) == 15); } ``` -The `bind_once` test covers two typical scenarios: partial argument binding for a plain lambda, and member function binding. The member function binding test is particularly noteworthy—`&Calc::multiply` is a member function pointer, `&calc` is an object pointer, and `std::invoke` internally expands this into a `(calc.*multiply)(5, 8)` call. There is a lifetime trap to be aware of here: `&calc` is a raw pointer, and `bind_once` does not manage its lifetime. If `calc` is destroyed before the callback is invoked, `std::invoke` will access freed memory through a dangling pointer. Chromium uses `base::Unretained` to explicitly mark the safety of raw pointers, uses `base::Owned` to take over ownership, and uses `base::WeakPtr` to automatically cancel the callback when the object is destructed. In our simplified version, this safety responsibility temporarily rests with the caller. +The `bind` tests cover two typical scenarios: partial argument binding for a normal lambda and member function binding. 
The member function binding test deserves attention—`&Adder::add` is a member function pointer, `&adder` is an object pointer, and `bind` internally expands this into a trampoline call. Note the lifetime trap here: `&adder` is a raw pointer, and `once_callback` does not manage its lifetime. If `adder` is destroyed before the callback is invoked, the trampoline will access freed memory through a dangling pointer. Chromium uses `base::Unretained` to explicitly mark raw pointer safety, `base::Owned` to take ownership, and `base::WeakPtr` to automatically cancel the callback upon object destruction. In our simplified version, this safety responsibility is delegated to the caller. ### Category E: Cancellation Mechanism ```cpp -TEST_CASE("is_cancelled respects cancel token", "[once_callback]") { - auto token = std::make_shared(); - OnceCallback cb([] {}); - cb.set_token(token); - - REQUIRE_FALSE(cb.is_cancelled()); - token->invalidate(); - REQUIRE(cb.is_cancelled()); +// E1: Token valid, no cancellation +TEST_CASE("cancel valid token") { + auto token = cancellation_token::create_invalid(); + auto cb = once_callback([]() { return 42; }, token); + REQUIRE(cb() == 42); } -TEST_CASE("cancelled void callback does not execute", "[once_callback]") { - auto token = std::make_shared(); +// E2: Token invalid, void callback does nothing +TEST_CASE("cancel void callback") { + auto token = cancellation_token::create_invalid(); bool called = false; - OnceCallback cb([&called] { called = true; }); - cb.set_token(token); - token->invalidate(); - - std::move(cb).run(); + auto cb = once_callback([&called]() { called = true; }, token); + cb(); // Should not execute REQUIRE_FALSE(called); } -TEST_CASE("cancelled non-void callback throws", "[once_callback]") { - auto token = std::make_shared(); - OnceCallback cb([] { return 1; }); - cb.set_token(token); - token->invalidate(); - - REQUIRE_THROWS_AS(std::move(cb).run(), std::bad_function_call); +// E3: Token invalid, non-void callback throws 
+TEST_CASE("cancel non-void callback throws") { + auto token = cancellation_token::create_invalid(); + auto cb = once_callback([]() { return 42; }, token); + REQUIRE_THROWS_AS(cb(), callback_cancelled); } ``` -The cancellation tests cover three key behaviors: no cancellation when the token is valid, no execution of void callbacks after the token is invalidated, and throwing `std::bad_function_call` for non-void callbacks after the token is invalidated. The behavior of the third test is worth expanding on—our implementation throws an exception in a canceled callback with a non-void return because the caller expects a return value, but we cannot provide a meaningful one, so throwing an exception is safer than returning an undefined value. Chromium's implementation would directly terminate the program here (`CHECK` failure); we chose exceptions because they are easier to catch and verify in tests. +Cancellation tests cover three key behaviors: no cancellation when the token is valid, void callbacks do not execute when the token is invalid, and non-void callbacks throw `callback_cancelled` when the token is invalid. The behavior of the third test is worth elaborating on—our implementation throws an exception in cancelled non-void callbacks because the caller expects a return value, and we cannot provide a meaningful one. Throwing is safer than returning an undefined value. Chromium's implementation would terminate the program here (via `CHECK` failure); we chose exceptions because they are easier to catch and verify in tests. 
### Category F: Then Composition ```cpp -TEST_CASE("then chains two callbacks", "[then]") { - auto cb = OnceCallback([](int x) { return x * 2; }) - .then([](int x) { return x + 10; }); - int result = std::move(cb).run(5); - REQUIRE(result == 20); // 5 * 2 + 10 +// F1: Two-stage non-void pipeline +TEST_CASE("then non-void") { + auto cb1 = once_callback([]() { return 10; }); + auto cb2 = std::move(cb1).then([](int v) { return v * 2; }); + REQUIRE(cb2() == 20); } -TEST_CASE("then multi-level pipeline", "[then]") { - auto pipeline = OnceCallback([](int x) { return x * 2; }) - .then([](int x) { return x + 10; }) - .then([](int x) { return std::to_string(x); }); - std::string result = std::move(pipeline).run(5); - REQUIRE(result == "20"); // (5*2)+10 = "20" +// F2: Multi-stage pipeline (type boundary crossing) +TEST_CASE("then chain") { + auto cb1 = once_callback([]() { return 42; }); + auto cb2 = std::move(cb1).then([](int v) { return std::to_string(v); }); + auto cb3 = std::move(cb2).then([](std::string s) { return s + "!"; }); + REQUIRE(cb3() == "42!"); } -TEST_CASE("then with void first callback", "[then]") { - int value = 0; - auto cb = OnceCallback([&value](int x) { value = x; }) - .then([&value] { return value * 3; }); - int result = std::move(cb).run(7); - REQUIRE(result == 21); +// F3: Void prefix callback +TEST_CASE("then void prefix") { + auto cb1 = once_callback([]() { /* side effect */ }); + auto cb2 = std::move(cb1).then([]() { return 42; }); + REQUIRE(cb2() == 42); } ``` -The `then()` test covers three composition patterns: a two-level non-void pipeline, a multi-level pipeline (crossing type boundaries—from `int` to `std::string`), and a void prefix callback. The multi-level pipeline test is particularly interesting—`(5*2)+10 = 20`, which is ultimately converted by `std::to_string` into the string `"20"`. 
This test verifies that `then()` correctly deduces the return type at each level, and that type erasure (via `std::move_only_function`) works correctly between lambdas of different types. The void prefix test verifies the `if constexpr (std::is_void_v)` branch—the first callback sets `value = 7`, and the second callback reads `value` by reference and returns `21`. +The `then` tests cover three composition patterns: two-stage non-void pipelines, multi-stage pipelines (crossing type boundaries—from `int` to `std::string`), and void prefix callbacks. The multi-stage pipeline test is particularly interesting—`42` is converted to the string `"42"`, which is finally transformed into `"42!"`. This test verifies that `then` correctly deduces the return type at each stage and that type erasure (via `std::move_only_function`) works correctly between different lambda types. The void prefix test verifies the `void` branch—the first callback sets a side effect, and the second callback returns a value. ### Test Framework and Build Configuration -We use Catch2 v3 as our test framework, automatically pulling in dependencies via CPM (CMake Package Manager). The CMake configuration for the tests is very concise: +We use Catch2 v3 as our testing framework, automatically pulling dependencies via CPM (CMake Package Manager). 
The CMake configuration for the tests is very concise: ```cmake # test/CMakeLists.txt -CPMAddPackage("gh:catchorg/Catch2@3.7.1") - +find_package(Catch2 3 REQUIRED) add_executable(test_once_callback test_once_callback.cpp) target_link_libraries(test_once_callback PRIVATE once_callback Catch2::Catch2WithMain) -target_compile_options(test_once_callback PRIVATE -Wall -Wextra -Wpedantic) - -add_test(NAME test_once_callback COMMAND test_once_callback) +include(CTest) +include(Catch) +catch_discover_tests(test_once_callback) ``` -Catch2's `REQUIRE` macro is superior to `assert()` because it reports the specific failing expression, file, and line number, and continues executing subsequent checks within the same `TEST_CASE` (rather than terminating the program immediately like `assert()`). `REQUIRE_THROWS_AS` is specifically used to verify exception types—in the cancellation mechanism tests, we need to confirm that a canceled non-void callback throws a `std::bad_function_call`, not some other exception. +Catch2's `REQUIRE` macro is superior to `assert` because it reports the specific failed expression, file, and line number, and continues executing subsequent checks within the same `TEST_CASE` (instead of terminating the program like `assert`). `REQUIRE_THROWS_AS` is specifically used to verify exception types—in the cancellation mechanism tests, we need to confirm that the cancelled non-void callback throws `callback_cancelled`, not some other exception. -The workflow for running the tests is simple—under the `build/` directory, run `cmake --build . && ctest`. +Running the tests is simple—just `cd test` and `ctest` in the build directory. --- -## Performance Considerations: Comparison with the Original Chromium Version +## Performance Considerations: Comparison with Chromium's Original Version ### Object Size -This is the most intuitive difference. We use a simple program to measure it: +This is the most intuitive difference. 
Let's measure it with a simple program: ```cpp -#include #include -#include "once_callback/once_callback.hpp" +#include "once_callback.hpp" int main() { - std::cout << "sizeof(std::function): " - << sizeof(std::function) << " bytes\n"; - std::cout << "sizeof(std::move_only_function): " - << sizeof(std::move_only_function) << " bytes\n"; - // Chromium OnceCallback ≈ 8 bytes(一个指针) - - using namespace tamcpp::chrome; - std::cout << "sizeof(OnceCallback): " - << sizeof(OnceCallback) << " bytes\n"; - // 我们的 OnceCallback 大约是: - // move_only_function (32) + status (1) + token ptr (16) + padding - // 预估 56-64 bytes + std::cout << "sizeof(once_callback): " << sizeof(once_callback) << '\n'; + // std::cout << "sizeof(base::OnceCallback): " << ...; // Hypothetical } ``` -On GCC, typical values are as follows: `std::function` is about 32 bytes, `std::move_only_function` is about 32 bytes, and our `OnceCallback` plus the `Status` enum and optional `CancelableToken` pointer comes to about 56–64 bytes. Chromium's `OnceCallback` is only 8 bytes—a single `scoped_refptr` pointing to a `BindState`. +On GCC, typical values are: `std::move_only_function` is about 32 bytes, `std::function` is about 32 bytes, and our `once_callback` plus the `state` enum and optional `cancellation_token` pointer is about 56-64 bytes. Chromium's `base::OnceCallback` is only 8 bytes—a pointer to a `base::internal::BindState`. -The root cause of this difference lies in the storage strategy. Chromium puts all state (the callable object + bound arguments) into a heap-allocated `BindState`, and the callback object itself only holds a pointer. We use SBO via `std::move_only_function` to inline small objects directly inside the callback object, avoiding heap allocation at the cost of increased object size. +The root of the difference lies in the storage strategy. Chromium places all state (callable object + bound arguments) in a heap-allocated `BindState`, and the callback object itself holds only a pointer. 
We use the SBO (Small Buffer Optimization) of `std::move_only_function` to store small objects directly inline within the callback object, avoiding heap allocation but increasing object size. ### Allocation Behavior -The SBO threshold of `std::move_only_function` is implementation-defined, typically two to three pointer sizes (16–24 bytes). Lambdas that capture a small number of parameters (such as `[x = 42]` or `[&ref]`) usually fit into the SBO and do not trigger heap allocation. However, if a lambda captures a large amount of data (such as a `std::string` plus a few `int`s), it will heap-allocate upon construction. +The SBO threshold of `std::move_only_function` is implementation-defined, usually 2-3 pointer sizes (16-24 bytes). Lambdas capturing few arguments (like `[x]` or `[&x]`) usually fit in SBO and don't trigger heap allocation. However, if a lambda captures a large amount of data (like a `std::vector` + a few `std::string` objects), it will heap allocate upon construction. -Chromium's approach always heap-allocates (`new BindState`), but the allocation only happens once—during `BindOnce`. Subsequent move operations of `OnceCallback` simply copy a pointer (8 bytes), at an extremely low cost. Our approach does not allocate for small objects (SBO), but move operations require copying the entire `std::move_only_function` (32 bytes) plus the `token_` pointer, at a slightly higher cost. +Chromium's approach always heap allocates (`new BindState`), but allocation happens only once—during construction. Subsequent move operations of `OnceCallback` just copy a pointer (8 bytes), which is extremely cheap. Our approach allocates zero times for small objects (SBO), but move operations require copying the entire `move_only_function` (32 bytes) plus the `cancellation_token` pointer, which is slightly more expensive. -The two strategies each have advantages in different scenarios. 
For high-frequency posting of small callbacks (the main scenario for the Chrome browser), Chromium's approach is superior—low move cost and consistent size benefit the CPU cache. For low-frequency large callbacks (such as one-time initialization tasks), our approach is superior—saving one heap allocation. +Both strategies have their advantages in different scenarios. For high-frequency delivery of small callbacks (the main scenario for the Chrome browser), Chromium's approach is better—low move cost and consistent size benefit CPU caches. For low-frequency large callbacks (like one-shot initialization tasks), our approach is better—saving one heap allocation. ### Indirect Call Overhead -The invocation overhead is the same for both approaches: one indirect function call. Internally, `std::move_only_function::operator()` dispatches to the concrete callable object via a function pointer or virtual function table; Chromium's `BindState::polymorphic_invoke_` also uses function pointer dispatch. Under `-O2` optimization, this indirect call cannot be inlined away, making the two approaches equivalent in performance. +The call overhead for both approaches is the same: one indirect function call. `std::move_only_function` internally dispatches to the specific callable object via a function pointer or virtual table; Chromium's `BindState` also uses function pointer dispatch. Under `-O2` optimization, this indirect call cannot be inlined away, so both approaches are performance-equivalent. ### What We Sacrificed and What We Gained -Let us summarize the trade-offs. +To summarize the trade-offs. -We sacrificed object compactness (56–64 bytes vs. 8 bytes) in exchange for implementation simplicity—no need to hand-write reference counting, function pointer tables, or `TRIVIAL_ABI` annotations. We sacrificed extreme move performance (copying 32 bytes + a pointer vs. copying 8 bytes) in exchange for zero heap allocation for small objects. 
We sacrificed reference-counted sharing (inability to let multiple callbacks share the same `BindState`), but `OnceCallback` inherently has exclusive semantics and does not require sharing. +We sacrificed object compactness (56-64 bytes vs 8 bytes) in exchange for implementation simplicity—no need to manually write reference counting, function pointer tables, or `[[clang::trivial_abi]]` annotations. We sacrificed extreme move performance (copying 32 bytes + pointer vs copying 8 bytes) in exchange for zero heap allocation for small objects. We sacrificed reference-counted sharing (unable to let multiple callbacks share the same `BindState`), but `once_callback` implies exclusive semantics, so sharing is unnecessary. -These trade-offs are reasonable for educational purposes and for most practical scenarios. If your project truly requires Chromium-level extreme performance, you can refer to Chromium's source code for further optimization—the core ideas have already been explained clearly in these three design guide parts. +These trade-offs are reasonable for educational purposes and most practical scenarios. If your project truly requires Chromium-level extreme performance, you can refer to Chromium's source code for further optimization—the core ideas have been clearly explained in these three design guides. --- ## Complete Component File Overview -At this point, the design, implementation, and testing strategy of the `OnceCallback` component are all complete. The full file list: +At this point, the design, implementation, and testing strategy for the `once_callback` component are complete. 
The file list is as follows: ```text -documents/vol9-open-source-project-learn/chrome/hands_on/ -├── 01-once-callback-design.md # 设计篇:动机与接口 -├── 02-once-callback-implementation.md # 实现篇:逐步实现 -└── 03-once-callback-testing.md # 验证篇:测试与性能 +include/ + once_callback/ + once_callback.hpp # Core class definition + cancellation_token.hpp # Cancellation support + detail/ + small_unique_ptr.hpp # Helper for SBO (optional) +test/ + test_once_callback.cpp # Catch2 test suite +src/ + once_callback.cpp # Implementation (if split) +examples/ + basic_usage.cpp # Simple examples ``` -The corresponding compilable code (header files + tests) is located in the project code directory: +The corresponding compilable code (headers + tests) is located in the project code directory: ```text -code/volumn_codes/vol9/chrome_design/ -├── CMakeLists.txt -├── cmake/CPM.cmake -├── cancel_token/ -│ └── cancel_token.hpp # 取消令牌 -├── once_callback/ -│ ├── CMakeLists.txt -│ ├── once_callback.hpp # 主接口(模板声明) -│ └── once_callback_impl.hpp # 实现(模板定义) -└── test/ - ├── CMakeLists.txt # Catch2 测试配置 - └── test_once_callback.cpp # 完整测试用例 +project_root/ + include/once_callback/... + test/test_once_callback.cpp + CMakeLists.txt ``` --- ## Summary -In this verification part, we did two things. On the testing side, we designed 11 Catch2 test cases around six invariants (basic invocation, move semantics, single invocation, argument binding, cancellation mechanism, and chained composition), covering all core behaviors of `OnceCallback`. On the performance side, we compared the differences with Chromium's `OnceCallback` in terms of object size, allocation behavior, and invocation overhead—our implementation traded compactness for simplicity, and for the vast majority of scenarios, this trade-off is worthwhile. +In this verification part, we did two things. 
Regarding testing, we designed 12 Catch2 test cases around six invariants (basic invocation, move semantics, single invocation, argument binding, cancellation mechanism, and chaining), covering all core behaviors of `once_callback`. Regarding performance, we compared the differences with Chromium's `OnceCallback` in terms of object size, allocation behavior, and call overhead—our implementation traded compactness for simplicity, which is worth it for the vast majority of scenarios. -Possible directions for next steps: implement `RepeatingCallback` (a copyable, repeatedly invocable version), add lifecycle helper functions like `Unretained` / `Owned` / `WeakPtr` to `bind_once`, or use Google Benchmark for precise performance measurements. +Next steps to try: implement `repeating_callback` (copyable, repeatable version), add `Unretained` / `Owned` / `WeakPtr`-style lifetime helpers to `once_callback`, or use Google Benchmark for precise performance measurement. -## References +## Reference Resources -- [Chromium base/functional/ source directory](https://source.chromium.org/chromium/chromium/src/+/main:base/functional/) +- [Chromium base/functional/ Source Directory](https://source.chromium.org/chromium/chromium/src/+/main:base/functional/) - [cppreference: std::move_only_function](https://en.cppreference.com/w/cpp/utility/functional/move_only_function) -- [Google Test documentation](https://google.github.io/googletest/) -- [Google Benchmark documentation](https://github.com/google/benchmark) +- [Google Test Documentation](https://google.github.io/googletest/) +- [Google Benchmark Documentation](https://github.com/google/benchmark) diff --git a/documents/projects/index.md b/documents/projects/index.md index a62368c97..f0f9f10f0 100644 --- a/documents/projects/index.md +++ b/documents/projects/index.md @@ -1,6 +1,6 @@ --- title: "贯穿式实战项目" -description: "手写 STL、HTTP 服务器、GUI 框架、嵌入式 OS 等综合项目" +description: "把各卷学到的零散知识串成完整项目——从协程服务器、迷你运行时到工业级组件研读" platform: 
host tags: - cpp-modern @@ -10,19 +10,24 @@ tags: # 贯穿式实战项目 -> 状态:规划中 +> 这一栏不是新知识的堆砌,而是把各卷学到的碎片——并发、协程、模板、内存管理——串成一个能跑、能测、能交付的完整项目。下面先列出已经在其它卷里落地、可以直接接着做的项目,再列出还在规划中的远期目标。 -## 概述 +## 已经有基础的项目 -本栏目包含贯穿教程的综合实战项目: +这些项目在其它卷里已经有了教程或可运行骨架,从这里可以顺藤摸瓜深入: -1. **手写 STL 组件**:vector、string、unique_ptr、optional、function、variant -2. **迷你 HTTP 服务器**:从 TCP 到异步协程 -3. **迷你 GUI 框架**:事件循环、控件系统、布局引擎、渲染后端 -4. **嵌入式操作系统**:调度器、同步原语、内存管理、驱动框架 -5. **INI 解析器**(已有基础) -6. **协程 Echo 服务器**(已有基础) +- **协程 Echo 服务器**:在 [卷五·协程 Echo 服务器](../vol5-concurrency/ch06-async-io-coroutine/05-coroutine-echo-server.md) 里,从 `co_await` 一路搭到了一个能收发的回显服务,是理解协程调度最实在的练手项目。 +- **迷你并发运行时(capstone)**:[卷五·迷你运行时 capstone](../vol5-concurrency/exercises/06-capstone-mini-runtime.md) 把线程池、定时器、任务队列揉成一个最小调度器,是后续做 "Mini Concurrent Runtime" 的现成起点。 +- **OnceCallback 组件研读**:[卷九·OnceCallback](../vol9-open-source-project-learn/chrome/01_once_callback/index.md) 用 16 篇文章手撕了 Chromium 的回调机制,是从读源码走向 "自己设计工业级组件" 的范例。 +- **INI 解析器**:作为 C++ 工程化的第一个完整项目,放在独立仓库 [Tutorial_cpp_SimpleIniParser](https://github.com/Awesome-Embedded-Learning-Studio/Tutorial_cpp_SimpleIniParser)——从词法分析到错误处理,适合跟着做一遍。 -## 项目导航 +## 规划中的项目 -> 内容编写中,敬请期待。 +这些还没动工,是远期目标,按 "素材就绪程度" 排序: + +- **手写 STL 组件**:vector / string / unique_ptr / optional / function / variant 各手写一遍,配合卷三标准库的源码阅读。 +- **迷你 HTTP 服务器**:从 TCP socket 到协程异步化,承接卷五并发和卷八网络编程。 +- **迷你 GUI 框架**:事件循环、控件系统、布局引擎、渲染后端。 +- **嵌入式迷你 OS**:调度器、同步原语、内存管理、驱动框架,承接卷八嵌入式主线。 + +> 这些项目都不会一蹴而就,会随着对应卷的内容完善而逐步启动。如果你有想做的项目,欢迎在 Discussion 里提出。 diff --git a/documents/vol1-fundamentals/c_tutorials/12-struct-and-memory-alignment.md b/documents/vol1-fundamentals/c_tutorials/12-struct-and-memory-alignment.md index c3c34b893..9cd856025 100644 --- a/documents/vol1-fundamentals/c_tutorials/12-struct-and-memory-alignment.md +++ b/documents/vol1-fundamentals/c_tutorials/12-struct-and-memory-alignment.md @@ -476,21 +476,21 @@ using AlignedStorage = std::aligned_storage_t #include -// TODO: 定义 Frame 结构体 +// 练习: 定义 Frame 结构体 // typedef struct 
__attribute__((packed)) { // ... // } Frame; -// TODO: 实现 print_frame_layout() 函数 +// 练习: 实现 print_frame_layout() 函数 // 使用 offsetof 打印每个字段的偏移量 -// TODO: 实现 create_frame() 函数 +// 练习: 实现 create_frame() 函数 // 分配内存并填充帧数据(含柔性数组成员) int main(void) { print_frame_layout(); - // TODO: 创建一个测试帧并验证偏移 + // 练习: 创建一个测试帧并验证偏移 return 0; } ``` diff --git a/documents/vol1-fundamentals/c_tutorials/13-union-enum-bitfield-typedef.md b/documents/vol1-fundamentals/c_tutorials/13-union-enum-bitfield-typedef.md index 32bccf4be..db825499e 100644 --- a/documents/vol1-fundamentals/c_tutorials/13-union-enum-bitfield-typedef.md +++ b/documents/vol1-fundamentals/c_tutorials/13-union-enum-bitfield-typedef.md @@ -355,15 +355,15 @@ using EventHandler = void (*)(int); // 比 typedef 更直观 #include #include -// TODO: 定义一个联合体,包含 float 和 uint32_t -// TODO: 实现分解函数 +// 练习: 定义一个联合体,包含 float 和 uint32_t +// 练习: 实现分解函数 // void print_float_bits(float f) { // // 提取符号位(1位)、指数(8位)、尾数(23位) // // 提示:用位运算 & 和 >> // } int main(void) { - // TODO: 测试几个值:0.0f, -3.14f, 1.0f, 42.0f, 0.1f + // 练习: 测试几个值:0.0f, -3.14f, 1.0f, 42.0f, 0.1f return 0; } ``` @@ -376,7 +376,7 @@ int main(void) { #include #include -// TODO: 定义 ControlRegister 位域结构体 +// 练习: 定义 ControlRegister 位域结构体 // 位分配: // bit 0: enable (1位) // bit 1: interrupt_enable (1位) @@ -386,16 +386,16 @@ int main(void) { // bit 31:10 reserved (22位) typedef union { - // TODO: 位域结构体视图 - // TODO: uint32_t 整体视图 + // 练习: 位域结构体视图 + // 练习: uint32_t 整体视图 } ControlRegister; -// TODO: 实现 void print_register(ControlRegister reg) -// TODO: 实现 void set_mode(ControlRegister* reg, uint32_t mode) +// 练习: 实现 void print_register(ControlRegister reg) +// 练习: 实现 void set_mode(ControlRegister* reg, uint32_t mode) int main(void) { ControlRegister reg = {0}; - // TODO: 测试各个操作 + // 练习: 测试各个操作 return 0; } ``` @@ -408,16 +408,16 @@ int main(void) { #include #include -// TODO: 定义枚举类型标签 -// TODO: 定义 tagged union 结构体 -// TODO: 实现构造函数 make_int/make_float/make_string -// TODO: 实现 print_tagged_value 函数 -// 
TODO: 实现 get_as_int/get_as_float/get_as_string 安全访问函数 +// 练习: 定义枚举类型标签 +// 练习: 定义 tagged union 结构体 +// 练习: 实现构造函数 make_int/make_float/make_string +// 练习: 实现 print_tagged_value 函数 +// 练习: 实现 get_as_int/get_as_float/get_as_string 安全访问函数 // (检查 tag 是否匹配,不匹配则打印错误信息) int main(void) { - // TODO: 创建三种类型的值,打印它们 - // TODO: 尝试用错误的 tag 访问,验证安全检查 + // 练习: 创建三种类型的值,打印它们 + // 练习: 尝试用错误的 tag 访问,验证安全检查 return 0; } ``` diff --git a/documents/vol1-fundamentals/c_tutorials/14-dynamic-memory.md b/documents/vol1-fundamentals/c_tutorials/14-dynamic-memory.md index 567e0d138..97e6bfc81 100644 --- a/documents/vol1-fundamentals/c_tutorials/14-dynamic-memory.md +++ b/documents/vol1-fundamentals/c_tutorials/14-dynamic-memory.md @@ -246,9 +246,9 @@ void pool_free(MemoryPool* pool, void* block); void pool_destroy(MemoryPool* pool); int main(void) { - // TODO: 创建一个 64 字节/块、共 64 块的内存池 - // TODO: 分配几个块,写入数据,然后释放 - // TODO: 销毁内存池 + // 练习: 创建一个 64 字节/块、共 64 块的内存池 + // 练习: 分配几个块,写入数据,然后释放 + // 练习: 销毁内存池 return 0; } ``` @@ -275,9 +275,9 @@ void mem_report(void); #define TMALLOC(size) tracked_malloc((size), __FILE__, __LINE__) int main(void) { - // TODO: 用 TMALLOC 分配几块内存 - // TODO: 故意只释放其中一部分 - // TODO: 调用 mem_report() 查看哪些分配没有被释放 + // 练习: 用 TMALLOC 分配几块内存 + // 练习: 故意只释放其中一部分 + // 练习: 调用 mem_report() 查看哪些分配没有被释放 return 0; } ``` diff --git a/documents/vol1-fundamentals/c_tutorials/15-preprocessor-and-multifile.md b/documents/vol1-fundamentals/c_tutorials/15-preprocessor-and-multifile.md index 0720325db..22baa11f0 100644 --- a/documents/vol1-fundamentals/c_tutorials/15-preprocessor-and-multifile.md +++ b/documents/vol1-fundamentals/c_tutorials/15-preprocessor-and-multifile.md @@ -194,18 +194,18 @@ gcc -o demo main.c -L. 
-lmath_utils ```c // math_utils.h #pragma once -// TODO: 声明 clamp_int 和 count_digits +// 练习: 声明 clamp_int 和 count_digits // math_utils.c #include "math_utils.h" -// TODO: 实现 clamp_int(将 value 限制在 [min_val, max_val] 范围内) -// TODO: 实现 count_digits(计算整数的十进制位数) +// 练习: 实现 clamp_int(将 value 限制在 [min_val, max_val] 范围内) +// 练习: 实现 count_digits(计算整数的十进制位数) // main.c #include #include "math_utils.h" int main(void) { - // TODO: 调用两个函数,验证结果 + // 练习: 调用两个函数,验证结果 return 0; } ``` @@ -219,9 +219,9 @@ int main(void) { #pragma once #ifdef NDEBUG -// TODO: Release 模式——DEBUG_LOG 展开为空 +// 练习: Release 模式——DEBUG_LOG 展开为空 #else -// TODO: Debug 模式——输出 [DEBUG] 文件名:行号: 格式化消息 +// 练习: Debug 模式——输出 [DEBUG] 文件名:行号: 格式化消息 // 提示:使用 __FILE__、__LINE__、__VA_ARGS__ #endif ``` diff --git a/documents/vol1-fundamentals/c_tutorials/16-file-io-and-stdlib.md b/documents/vol1-fundamentals/c_tutorials/16-file-io-and-stdlib.md index a508c22e4..f1fb2f218 100644 --- a/documents/vol1-fundamentals/c_tutorials/16-file-io-and-stdlib.md +++ b/documents/vol1-fundamentals/c_tutorials/16-file-io-and-stdlib.md @@ -297,7 +297,7 @@ int main(int argc, char* argv[]) { fprintf(stderr, "Usage: %s \n", argv[0]); return 1; } - // TODO: 调用 parse_config 和 find_config + // 练习: 调用 parse_config 和 find_config return 0; } ``` @@ -317,7 +317,7 @@ int main(int argc, char* argv[]) { /// @brief 复制文件 int copy_file(const char* src_path, const char* dst_path) { - // TODO: 实现 + // 练习: 实现 // 1. "rb" 打开源文件,"wb" 打开目标文件 // 2. 循环 fread/fwrite // 3. 
用 fseek/ftell 获取总大小,打印进度 @@ -326,7 +326,7 @@ int copy_file(const char* src_path, const char* dst_path) } int main(int argc, char* argv[]) { - // TODO: 解析命令行参数,调用 copy_file + // 练习: 解析命令行参数,调用 copy_file return 0; } ``` diff --git a/documents/vol1-fundamentals/c_tutorials/advanced_feature/01-arm-architecture-fundamentals.md b/documents/vol1-fundamentals/c_tutorials/advanced_feature/01-arm-architecture-fundamentals.md index 4c12d7aef..98b197666 100644 --- a/documents/vol1-fundamentals/c_tutorials/advanced_feature/01-arm-architecture-fundamentals.md +++ b/documents/vol1-fundamentals/c_tutorials/advanced_feature/01-arm-architecture-fundamentals.md @@ -348,7 +348,7 @@ ARM 平台上的 C++ 对象内存布局遵循 AAPCS 的 ABI 规范:普通成 /// 提示:IPSR 是 xPSR 的一部分,可以用 MRS 指令读取 uint32_t exercise_read_ipsr(void) { - // TODO: 用内嵌汇编读取 IPSR + // 练习: 用内嵌汇编读取 IPSR return 0; } ``` @@ -361,7 +361,7 @@ uint32_t exercise_read_ipsr(void) /// 提示:HardFault Handler 的参数可以拿到栈帧指针 void exercise_trigger_hardfault(void) { - // TODO: 写一个无效地址来触发 HardFault + // 练习: 写一个无效地址来触发 HardFault } ``` @@ -372,13 +372,13 @@ void exercise_trigger_hardfault(void) /// 找出编译器如何分配 R4-R11 给局部变量 int exercise_aapcs_4(int a, int b, int c, int d) { - // TODO: 添加局部变量和函数调用,使反汇编更有看头 + // 练习: 添加局部变量和函数调用,使反汇编更有看头 return 0; } int exercise_aapcs_6(int a, int b, int c, int d, int e, int f) { - // TODO: 同上,对比反汇编结果 + // 练习: 同上,对比反汇编结果 return 0; } ``` diff --git a/documents/vol1-fundamentals/c_tutorials/advanced_feature/03-c-traps-and-pitfalls.md b/documents/vol1-fundamentals/c_tutorials/advanced_feature/03-c-traps-and-pitfalls.md index aeac4e557..6a1b6b91f 100644 --- a/documents/vol1-fundamentals/c_tutorials/advanced_feature/03-c-traps-and-pitfalls.md +++ b/documents/vol1-fundamentals/c_tutorials/advanced_feature/03-c-traps-and-pitfalls.md @@ -428,7 +428,7 @@ char* result = malloc(strlen(s) + strlen(t) + 1); // OK,+1 给 '\0' /// @return a / (*b) int fix_lexical_trap(int a, int* b) { - // TODO: 修复代码中的陷阱 + // 练习: 修复代码中的陷阱 return a/*b; } ``` @@ -441,7 +441,7 @@ 
int fix_lexical_trap(int a, int* b) /// @return 1 表示低 4 位全为零,0 表示至少有一位非零 int fix_priority_trap(unsigned int flags) { - // TODO: 修复代码中的陷阱 + // 练习: 修复代码中的陷阱 return flags & 0x0F == 0; } ``` @@ -455,7 +455,7 @@ int fix_priority_trap(unsigned int flags) /// @return 1 表示相等,0 表示不等 int fix_assignment_trap(int x, int target) { - // TODO: 修复代码中的陷阱 + // 练习: 修复代码中的陷阱 if (x = target) return 1; return 0; @@ -471,7 +471,7 @@ int fix_assignment_trap(int x, int target) /// @return 数组中的最大值 int fix_semicolon_trap(int* arr, int n) { - // TODO: 修复代码中的陷阱 + // 练习: 修复代码中的陷阱 int max_val = arr[0]; for (int i = 1; i < n; i++) if (arr[i] > max_val); @@ -489,7 +489,7 @@ int fix_semicolon_trap(int* arr, int n) /// @return 1 表示会溢出,0 表示安全 int fix_overflow_check(int a, int b) { - // TODO: 修复代码中的陷阱 + // 练习: 修复代码中的陷阱 if (a + b < 0) return 1; return 0; @@ -505,7 +505,7 @@ int fix_overflow_check(int a, int b) /// @return 新分配的拼接字符串,调用者负责释放 char* fix_string_concat(const char* s, const char* t) { - // TODO: 修复代码中的所有陷阱 + // 练习: 修复代码中的所有陷阱 char* result = malloc(strlen(s) + strlen(t)); strcpy(result, s); strcat(result, t); diff --git a/documents/vol1-fundamentals/c_tutorials/advanced_feature/05-handmade-dynamic-array.md b/documents/vol1-fundamentals/c_tutorials/advanced_feature/05-handmade-dynamic-array.md index 1d03d08ad..97c993ef4 100644 --- a/documents/vol1-fundamentals/c_tutorials/advanced_feature/05-handmade-dynamic-array.md +++ b/documents/vol1-fundamentals/c_tutorials/advanced_feature/05-handmade-dynamic-array.md @@ -506,7 +506,7 @@ DynamicArrayStatus dynamic_array_resize( size_t new_size, const void* default_value ); -// TODO: 自行实现 +// 练习: 自行实现 ``` ### 练习 2:实现 filter @@ -519,7 +519,7 @@ DynamicArray* dynamic_array_filter( const DynamicArray* arr, int (*pred)(const void* element) ); -// TODO: 自行实现 +// 练习: 自行实现 ``` ### 练习 3:实现 map 变换 @@ -534,7 +534,7 @@ DynamicArray* dynamic_array_map( void (*transform)(const void* in, void* out), size_t out_element_size ); -// TODO: 自行实现 +// 练习: 自行实现 ``` ### 练习 
4:实现拼接 @@ -547,7 +547,7 @@ DynamicArray* dynamic_array_concat( const DynamicArray* arr1, const DynamicArray* arr2 ); -// TODO: 自行实现 +// 练习: 自行实现 ``` > **难度自评**:如果你在实现练习时感到困难,请回顾对应章节的设计思路。特别是 resize——它本质上是 reserve + memset/memcpy 的组合,想清楚哪些位置需要填充、填充什么值,代码自然就出来了。 diff --git a/documents/vol10-open-lecture-notes/cppcon/2025/01-concept-based-generic-programming/01-type-safety-and-number-concept.md b/documents/vol10-open-lecture-notes/cppcon/2025/01-concept-based-generic-programming/01-type-safety-and-number-concept.md index e230c7be4..da2e1e302 100644 --- a/documents/vol10-open-lecture-notes/cppcon/2025/01-concept-based-generic-programming/01-type-safety-and-number-concept.md +++ b/documents/vol10-open-lecture-notes/cppcon/2025/01-concept-based-generic-programming/01-type-safety-and-number-concept.md @@ -41,7 +41,7 @@ Target: x86_64-pc-linux-gnu Configured with: /build/gcc/src/gcc/configure --enable-languages=ada,c,c++,d,fortran,go,lto,m2,objc,obj-c++,rust,cobol --enable-bootstrap --prefix=/usr --libdir=/usr/lib --libexecdir=/usr/lib --mandir=/usr/share/man --infodir=/usr/share/info --with-bugurl=https://gitlab.archlinux.org/archlinux/packaging/packages/gcc/-/issues --with-build-config=bootstrap-lto --with-linker-hash-style=gnu --with-system-zlib --enable-cet=auto --enable-checking=release --enable-clocale=gnu --enable-default-pie --enable-default-ssp --enable-gnu-indirect-function --enable-gnu-unique-object --enable-libstdcxx-backtrace --enable-link-serialization=1 --enable-linker-build-id --enable-lto --enable-multilib --enable-plugin --enable-shared --enable-threads=posix --disable-libssp --disable-libstdcxx-pch --disable-werror --disable-fixincludes Thread model: posix Supported LTO compression algorithms: zlib zstd -gcc version 16.1.1 20260430 (GCC) +gcc version 16.1.1 20260430 (GCC) ❯ uname -a Linux Charliechen 6.6.114.1-microsoft-standard-WSL2 #1 SMP PREEMPT_DYNAMIC Mon Dec 1 20:46:23 UTC 2025 x86_64 GNU/Linux @@ -71,15 +71,15 @@ int main() { int big = 30000; 
short small = big; // 30000 超出了 short 的范围吗?其实没有,short 一般是 -32768~32767 // 但如果是 40000 呢? - + short overflow = 40000; // 编译通过!但值已经错了 - + double pi = 3.14159; int int_pi = pi; // 小数部分直接丢了 - + std::cout << "overflow = " << overflow << "\n"; // 输出一个奇怪的负数 std::cout << "int_pi = " << int_pi << "\n"; // 输出 3 - + return 0; } ``` @@ -136,7 +136,7 @@ concept number = std::integral || std::floating_point; // 判断 T 是否"比 U 小"(能表示的值更少) // 这里用 numeric_limits 的范围来比较 template -concept smaller_range = +concept smaller_range = number && number && (std::numeric_limits::max() < std::numeric_limits::max() || std::numeric_limits::min() > std::numeric_limits::min()); @@ -520,7 +520,7 @@ constexpr T safe_add(T a, T b) { } ``` -验证代码见 `code/volumn_codes/vol10/cppcon/2025/01-concept-based-generic-programming/01-06-overflow-not-caught.cpp`。 +验证代码见 [01-06-overflow-not-caught.cpp](https://github.com/Awesome-Embedded-Learning-Studio/Tutorial_AwesomeModernCPP/blob/main/code/volumn_codes/vol10/cppcon/2025/01-concept-based-generic-programming/01-06-overflow-not-caught.cpp)。 ::: 看最后一个溢出捕获的例子——我们需要注意,`narrow_convert` 只能拦截**类型转换时**的窄化,对于同类型算术运算本身的溢出(如 `unsigned int + unsigned int` 的回绕),它是无能为力的。`common_type_t` 就是 `unsigned int` 本身,运算结果在赋值给 `Number` 之前就已经回绕成了一个合法值。要完整防御算术溢出,需要额外的机制(如编译器内置的 overflow 检查函数),这超出了 `narrow_convert` 的职责范围。 @@ -1046,8 +1046,8 @@ int main() { 输出: ```text -前3个: 10 20 30 -中间3个: 30 40 50 +前3个: 10 20 30 +中间3个: 30 40 50 捕获: take_front: n 超过了 span 的大小 ``` diff --git a/documents/vol10-open-lecture-notes/cppcon/2025/02-some-assembly-required/01-personal-journey-and-from-assembly-to-cpp.md b/documents/vol10-open-lecture-notes/cppcon/2025/02-some-assembly-required/01-personal-journey-and-from-assembly-to-cpp.md index ac1023980..f9ebdb557 100644 --- a/documents/vol10-open-lecture-notes/cppcon/2025/02-some-assembly-required/01-personal-journey-and-from-assembly-to-cpp.md +++ 
b/documents/vol10-open-lecture-notes/cppcon/2025/02-some-assembly-required/01-personal-journey-and-from-assembly-to-cpp.md @@ -622,7 +622,7 @@ max_diff: 0.000000e+00 这反而更加印证了文章的核心论点:现代编译器的自动向量化越来越强,手写 SIMD 的收益在缩小。具体数字因硬件和编译器版本而异,但趋势是一致的。 -验证代码:`code/volumn_codes/vol10/cppcon/2025/02-some-assembly-required/02-00-matmul-test.cpp` +验证代码:[02-00-matmul-test.cpp](https://github.com/Awesome-Embedded-Learning-Studio/Tutorial_AwesomeModernCPP/blob/main/code/volumn_codes/vol10/cppcon/2025/02-some-assembly-required/02-00-matmul-test.cpp) ::: 这就是 2026 年和 90 年代的区别。在 90 年代,编译器完全不知道 SIMD 是什么,手写汇编可能快 10 倍;在今天,编译器已经相当聪明了,手写的收益越来越小,但代价(可读性、可维护性、可移植性)依然很大。 diff --git a/documents/vol10-open-lecture-notes/cppcon/2025/02-some-assembly-required/02-reading-assembly-and-registers-abi.md b/documents/vol10-open-lecture-notes/cppcon/2025/02-some-assembly-required/02-reading-assembly-and-registers-abi.md index 86cfed6f5..25441e47b 100644 --- a/documents/vol10-open-lecture-notes/cppcon/2025/02-some-assembly-required/02-reading-assembly-and-registers-abi.md +++ b/documents/vol10-open-lecture-notes/cppcon/2025/02-some-assembly-required/02-reading-assembly-and-registers-abi.md @@ -667,3 +667,8 @@ _Z9make_bigll:
--- + +## 延伸阅读 + +- 想看懂不同优化级别(`-O0` / `-O2` / `-O3`)下编译器到底吐出什么汇编,见 [卷七·编译器选项](../../../../vol7-engineering/02-compiler-options.md)。 +- 想深入 SIMD/AVX 如何重塑汇编输出,见 [卷六·AVX/AVX2 深入](../../../../vol6-performance/avx-avx2-deep-dive.md)。 diff --git a/documents/vol10-open-lecture-notes/cppcon/2025/02-some-assembly-required/03-compiler-explorer-and-ai-assisted.md b/documents/vol10-open-lecture-notes/cppcon/2025/02-some-assembly-required/03-compiler-explorer-and-ai-assisted.md index ee808f16b..602b55b36 100644 --- a/documents/vol10-open-lecture-notes/cppcon/2025/02-some-assembly-required/03-compiler-explorer-and-ai-assisted.md +++ b/documents/vol10-open-lecture-notes/cppcon/2025/02-some-assembly-required/03-compiler-explorer-and-ai-assisted.md @@ -463,3 +463,8 @@ C++ 中的模块化不只是"怎么写头文件和源文件",更是"怎么把
--- + +## 延伸阅读 + +- Compiler Explorer 是观察编译器行为的最佳窗口,想系统梳理 GCC/Clang 各选项的效果,见 [卷七·编译器选项](../../../../vol7-engineering/02-compiler-options.md)。 +- 在 CE 里看自动向量化如何启用 AVX/AVX2,见 [卷六·AVX/AVX2 深入](../../../../vol6-performance/avx-avx2-deep-dive.md)。 diff --git a/documents/vol10-open-lecture-notes/cppcon/2025/02-some-assembly-required/06-toolchain-and-project-design.md b/documents/vol10-open-lecture-notes/cppcon/2025/02-some-assembly-required/06-toolchain-and-project-design.md index e60c6e04a..8d22fd52f 100644 --- a/documents/vol10-open-lecture-notes/cppcon/2025/02-some-assembly-required/06-toolchain-and-project-design.md +++ b/documents/vol10-open-lecture-notes/cppcon/2025/02-some-assembly-required/06-toolchain-and-project-design.md @@ -399,3 +399,9 @@ macOS: url="https://cmake.org/cmake/help/latest/release/3.28.html" /> + +--- + +## 延伸阅读 + +- 工具链的核心是编译器选项,想系统梳理 GCC/Clang 的常用编译选项与取舍,见 [卷七·编译器选项](../../../../vol7-engineering/02-compiler-options.md)。 diff --git a/documents/vol10-open-lecture-notes/cppcon/2025/02-some-assembly-required/07-wg21-standardization-and-assembly-philosophy.md b/documents/vol10-open-lecture-notes/cppcon/2025/02-some-assembly-required/07-wg21-standardization-and-assembly-philosophy.md index b515d1e5f..066b06dac 100644 --- a/documents/vol10-open-lecture-notes/cppcon/2025/02-some-assembly-required/07-wg21-standardization-and-assembly-philosophy.md +++ b/documents/vol10-open-lecture-notes/cppcon/2025/02-some-assembly-required/07-wg21-standardization-and-assembly-philosophy.md @@ -231,7 +231,7 @@ $ arm-none-linux-gnueabihf-gcc -static -march=armv7-a test.c && qemu-arm-static qemu: uncaught target signal 4 (Illegal instruction) - core dumped ``` -验证代码见仓库:`code/volumn_codes/vol10/cppcon/2025/02-some-assembly-required/05-01-arm32-nv-condition.c`。 
+验证代码见仓库:[05-01-arm32-nv-condition.c](https://github.com/Awesome-Embedded-Learning-Studio/Tutorial_AwesomeModernCPP/blob/main/code/volumn_codes/vol10/cppcon/2025/02-some-assembly-required/05-01-arm32-nv-condition.c)。 ::: ## 正交性——ARM32 的设计哲学 @@ -249,7 +249,7 @@ qemu: uncaught target signal 4 (Illegal instruction) - core dumped 我们可以亲手验证一下这个东西。因为 NV 条件码只在 ARMv4 及更早版本中有效,我们需要明确指定架构版本。 ::: details 为什么不能用 ARMv7? -ARMv7-A 的有效条件码范围仅为 `0b0000`–`0b1110`。编码 `0b1111` 在 ARMv5+ 中被重新分配——它要么被解释为完全不同的指令(利用条件码位来扩展操作码空间),要么产生 UNPREDICTABLE 行为。在 ARMv7 上用 `.word 0xf3a0002a`,**不能保证**结果是"永不执行"。验证代码已放在仓库中(`code/volumn_codes/vol10/cppcon/2025/02-some-assembly-required/05-01-arm32-nv-condition.c`),读者可以自行在 ARMv4 和 ARMv7 目标上对比测试。 +ARMv7-A 的有效条件码范围仅为 `0b0000`–`0b1110`。编码 `0b1111` 在 ARMv5+ 中被重新分配——它要么被解释为完全不同的指令(利用条件码位来扩展操作码空间),要么产生 UNPREDICTABLE 行为。在 ARMv7 上用 `.word 0xf3a0002a`,**不能保证**结果是"永不执行"。验证代码已放在仓库中([05-01-arm32-nv-condition.c](https://github.com/Awesome-Embedded-Learning-Studio/Tutorial_AwesomeModernCPP/blob/main/code/volumn_codes/vol10/cppcon/2025/02-some-assembly-required/05-01-arm32-nv-condition.c)),读者可以自行在 ARMv4 和 ARMv7 目标上对比测试。 ::: 环境是 Arch Linux WSL,交叉编译工具链用的 `arm-none-linux-gnueabihf-gcc`(Arm GNU Toolchain 15.2)。注意编译时需要用 `-march=armv4` 来确保 NV 条件码的语义: @@ -470,7 +470,7 @@ g++ -O3 -march=x86-64-v2 -S /tmp/test.cpp -o /tmp/test.s grep pcmpeqb /tmp/test.s # 无输出 = 没有向量化 ``` -验证代码见仓库:`code/volumn_codes/vol10/cppcon/2025/02-some-assembly-required/05-04-count-char-vec.cpp`。 +验证代码见仓库:[05-04-count-char-vec.cpp](https://github.com/Awesome-Embedded-Learning-Studio/Tutorial_AwesomeModernCPP/blob/main/code/volumn_codes/vol10/cppcon/2025/02-some-assembly-required/05-04-count-char-vec.cpp)。 ::: 真实的 GCC 输出(简化后的核心循环): @@ -591,7 +591,7 @@ inline constexpr LogLevel log_error = LogLevel::Error; int main() { g_log_level = LogLevel::Debug; - + // 这样调用,带逗号的表达式完全没问题 log(log_debug, "value is {}", std::max(1, 2)); log(log_info, "program started"); @@ -660,7 +660,7 @@ pi = 3.14159 /tmp/test.cpp:7:34: error: 
call to consteval function 'is_little_endian()' is not a constant expression ``` -验证代码见仓库:`code/volumn_codes/vol10/cppcon/2025/02-some-assembly-required/05-02-consteval-endian-broken.cpp`(编译失败)和 `05-03-consteval-endian-fixed.cpp`(修正版,编译通过)。读者可以用 `g++ -std=c++20 05-02-consteval-endian-broken.cpp` 自行验证编译失败。 +验证代码见仓库:[05-02-consteval-endian-broken.cpp](https://github.com/Awesome-Embedded-Learning-Studio/Tutorial_AwesomeModernCPP/blob/main/code/volumn_codes/vol10/cppcon/2025/02-some-assembly-required/05-02-consteval-endian-broken.cpp)(编译失败)和 [05-03-consteval-endian-fixed.cpp](https://github.com/Awesome-Embedded-Learning-Studio/Tutorial_AwesomeModernCPP/blob/main/code/volumn_codes/vol10/cppcon/2025/02-some-assembly-required/05-03-consteval-endian-fixed.cpp)(修正版,编译通过)。读者可以用 `g++ -std=c++20 05-02-consteval-endian-broken.cpp` 自行验证编译失败。 ::: 修正后有两种编译期判断字节序的方法: diff --git a/documents/vol2-modern-features/ch02-constexpr/02-constexpr-ctor.md b/documents/vol2-modern-features/ch02-constexpr/02-constexpr-ctor.md index 698afe9db..cba7f776f 100644 --- a/documents/vol2-modern-features/ch02-constexpr/02-constexpr-ctor.md +++ b/documents/vol2-modern-features/ch02-constexpr/02-constexpr-ctor.md @@ -11,7 +11,7 @@ tags: - 编译期计算 difficulty: intermediate platform: host -cpp_standard: [11, 14, 17] +cpp_standard: [11, 14, 17, 20] reading_time_minutes: 15 prerequisites: - "Chapter 2: constexpr 基础" @@ -344,7 +344,7 @@ static_assert(kObj.values[3] == 13); static_assert(kObj.sum == 46); // 10+11+12+13=46 ``` -## 第五步——constexpr 析构函数(C++20 预告) +## 第五步——constexpr 析构函数(C++20) 在 C++20 之前,字面类型要求析构函数必须是平凡的(trivial)。这意味着你不能在析构函数里做任何清理工作。这个限制在 C++20 中被取消——你可以写 `constexpr` 析构函数了。 diff --git a/documents/vol2-modern-features/ch11-user-defined-literals/01-udl-basics.md b/documents/vol2-modern-features/ch11-user-defined-literals/01-udl-basics.md index 94a054f24..0a6fe5b64 100644 --- a/documents/vol2-modern-features/ch11-user-defined-literals/01-udl-basics.md +++ 
b/documents/vol2-modern-features/ch11-user-defined-literals/01-udl-basics.md @@ -407,6 +407,17 @@ auto angle = 3.14159_rad; 每个数字后面都带着它的单位,代码几乎不需要注释(看着是真的爽啊!) +## 小结 + +用户自定义字面量本质上是用编译期能力给"裸数字"穿上单位的衣服——`100_ms`、`72_MHz`、`4_KiB` 一眼就能看懂,所有转换都在编译期完成,运行时零开销。记住几条要点: + +- `operator""` 有四种 cooked 形式(`unsigned long long` / `long double` / 字符类型如 `char` / `const char*` 加 `std::size_t` 长度)外加 raw 形式(单参数 `const char*`,或 `template <char...>` 数字字面量模板)。日常用 cooked 就够,只有要解析自定义数字语法(二进制、千分位)才上 raw。 +- 后缀一律**下划线开头**(`_ms`)。不带下划线的后缀(`ms`)是留给标准库的,自己用迟早踩雷。 +- 先用标准库现成的(`chrono` 的 `1h/1min/1s`、`"abc"s`、`"abc"sv`),不够再造自己的。 +- 只要 `operator""` 声明为 `constexpr`,字面量的结果就是编译期常量,可以放心塞进 `constexpr`、模板参数、数组尺寸。 + +代价几乎为零,收益是把"这个数到底是什么单位"的疑问从 code review 里彻底消灭。怎么在真实工程里组织一整套自己的字面量库,留到 UDL 实战篇再展开。 + ## 参考资源 - [cppreference: User-defined literals](https://en.cppreference.com/w/cpp/language/user_literal) diff --git a/documents/vol8-domains/embedded/04-empty-base-optimization.md b/documents/vol4-advanced/03-empty-base-optimization.md similarity index 97% rename from documents/vol8-domains/embedded/04-empty-base-optimization.md rename to documents/vol4-advanced/03-empty-base-optimization.md index e847aaab9..c3dbde8d5 100644 --- a/documents/vol8-domains/embedded/04-empty-base-optimization.md +++ b/documents/vol4-advanced/03-empty-base-optimization.md @@ -1,21 +1,22 @@ --- -chapter: 3 +chapter: 11 cpp_standard: - 11 - 14 - 17 - 20 -description: 介绍空基类优化技术 +description: 介绍空基类优化(EBO)与 C++20 [[no_unique_address]] difficulty: intermediate -order: 4 -platform: stm32f1 +order: 6 +platform: host prerequisites: -- 'Chapter 2: 零开支抽象' +- 'Chapter 2: 零开销抽象' reading_time_minutes: 6 tags: +- host - cpp-modern - intermediate -- stm32f1 +- 零开销抽象 title: 空基类优化(EBO) --- # 空基类优化(EBO):C++ 的瘦身技巧 diff --git a/documents/vol4-advanced/05-spaceship-operator.md b/documents/vol4-advanced/05-spaceship-operator.md index 95ea1cbf3..346a5be4a 100644 --- a/documents/vol4-advanced/05-spaceship-operator.md +++ b/documents/vol4-advanced/05-spaceship-operator.md @@ -922,32 +922,56 @@ struct Task { ## 常见的坑 -### 坑1:忘记显式定义== +### 坑1:默认 == 不会反向生成
<=>(生成方向是单向的) -在C++20中,`<=>`不会自动生成`==`运算符,必须显式定义: +一个流传很广、但并不准确的说法是:「只写 `<=>` 不写 `==` 会编译报错」。对 `= default` 的 `<=>` 来说这并不成立:按 **P1185R2(`<=> != ==`,已并入 C++20)** 的规则,default `<=>` 会隐式声明一个 default 的 `==`;只有**手写**(非 default)的 `<=>` 才不会附带 `==`。`<=>` 和 `==` 的生成关系是**单向**的: + +- default `<=>` → `==`、`!=`、`<`、`>`、`<=`、`>=` 全都能用(`==` 由编译器隐式声明,其余通过表达式重写得到)。所以只 default 一个 `<=>` 完全够用,`==` 是「白送」的。 +- 反过来 default `==` → 只生成 `==` 和 `!=`,不会反向给你 `<=>` 或任何关系运算符。 + +真正会踩的坑是后者:你以为「我只关心判等,default 一个 `==` 就够了」,结果哪天有人写了一句 `a < b`,编译直接炸——因为 `==` 不带关系运算。 ```cpp -// ❌ 错误:只有<=>,没有== -struct Bad { +#include <compare> +#include <iostream> + +// ✅ 只 default <=>:== 和 < 都自动有了(旧说法里那个「编译错误」其实是错的) +struct HasSpaceship { int value; - auto operator<=>(const Bad&) const = default; - // 缺少 bool operator==(const Bad&) const = default; + auto operator<=>(const HasSpaceship&) const = default; }; -Bad b1{1}, b2{1}; -// bool eq = (b1 == b2); // 编译错误! - -// ✅ 正确:同时定义<=>和== -struct Good { +// ⚠️ 只 default ==:判等没问题,但拿不到 < / <=> +struct HasEquality { int value; - auto operator<=>(const Good&) const = default; - bool operator==(const Good&) const = default; + bool operator==(const HasEquality&) const = default; }; -Good g1{1}, g2{1}; -bool eq = (g1 == g2); // OK +int main() { + HasSpaceship a{1}, b{2}; + std::cout << (a == b) << (a < b) << '\n'; // OK:<=> 把 == 和 < 都生成出来了 + + HasEquality c{1}, d{2}; + std::cout << (c == d) << '\n'; // OK:显式 default 了 == + // std::cout << (c < d) << '\n'; // 编译错误:default == 不反向生成 <=> +} ``` +实测(Arch Linux WSL,`-std=c++20`;g++ 16.1.1 与 clang++ 22.1.6 行为一致;第二条命令用 `-DTRY_LT` 打开 `c < d` 那一行,对应上面代码里被注释掉的语句): + +```text +$ g++ -std=c++20 gotcha.cpp -o gotcha && ./gotcha +01 +0 +$ g++ -std=c++20 -DTRY_LT gotcha.cpp +gotcha.cpp: In function 'int main()': +gotcha.cpp:23:21: error: no match for 'operator<' (operand types are 'HasEquality' and 'HasEquality') + 23 | std::cout << (c < d) << '\n'; + | ~ ^ ~ +``` + +一句话记法:`<=>` 是「上游」,`==` 是「下游」——上游会顺流送出所有运算符,下游只管自己那一亩三分地。只要你想要大小比较,就得提供 `<=>`(或者手写对应的关系运算符);只 default `==` 永远换不来 `<=>`。详见 cppreference「[Default comparisons](https://en.cppreference.com/mwiki/index.php?title=cpp/language/default_comparisons)」一节。 + ###
坑2:比较类别不一致 手动实现时,返回的比较类别要一致: diff --git a/documents/vol4-advanced/index.md b/documents/vol4-advanced/index.md index bd9474375..2f00b4956 100644 --- a/documents/vol4-advanced/index.md +++ b/documents/vol4-advanced/index.md @@ -38,5 +38,6 @@ tags: 三路比较运算符 + 空基类优化(EBO) C++ Modules (MSVC) diff --git a/documents/vol5-concurrency/ch03-atomic-memory-model/05-atomic-patterns.md b/documents/vol5-concurrency/ch03-atomic-memory-model/05-atomic-patterns.md index 7778bb602..9e454824b 100644 --- a/documents/vol5-concurrency/ch03-atomic-memory-model/05-atomic-patterns.md +++ b/documents/vol5-concurrency/ch03-atomic-memory-model/05-atomic-patterns.md @@ -22,6 +22,8 @@ related: # 原子操作模式 +> 📖 **应用场景**:这一篇的原子模式在嵌入式里有个高频落地——ISR 和主循环之间无锁共享变量。如果你在写单片机固件,配着 [卷八·中断安全编程](../../vol8-domains/embedded/05-interrupt-safe-coding.md)看会更通透。 + 到这一篇为止,我们已经把 `std::atomic` 的操作集、六种内存序、fence 和屏障、`wait/notify` 和 `atomic_ref` 全部拆解完了。但这些工具单独拿出来,只是在回答"怎么做"的问题——怎么做一个原子加法、怎么发一个 release store、怎么等一个值变化。真正的工程实践需要的是模式:面对一个具体的并发问题,应该选用哪些原子操作,以什么样的内存序组合起来,才能既正确又高效地解决问题。 这一篇我们集中讨论几个最经典的原子操作模式。这些模式不是凭空发明的——它们来自 Linux 内核、数据库引擎、高性能网络框架等真实系统中反复验证过的方案。我们会拆解每个模式的"为什么":为什么这样设计、为什么内存序不能更弱、为什么某个看似无害的改动会引入 bug。 diff --git a/documents/vol5-concurrency/ch06-async-io-coroutine/01-async-programming-evolution.md b/documents/vol5-concurrency/ch06-async-io-coroutine/01-async-programming-evolution.md index 3a3d7cacd..ece133d7d 100644 --- a/documents/vol5-concurrency/ch06-async-io-coroutine/01-async-programming-evolution.md +++ b/documents/vol5-concurrency/ch06-async-io-coroutine/01-async-programming-evolution.md @@ -23,6 +23,8 @@ related: # 异步编程演进:从回调地狱到协程 +> 📖 **前置阅读**:这一篇会用到 C++20 协程。如果你还没接触过 `co_await`/`co_return`、`promise_type` 这些底层机制,可以先翻 [卷四·协程基础](../../vol4-advanced/01-coroutine-basics.md)——那里从零拆解了协程的"骨架"是怎么搭起来的。 + 说实话,写到这一篇的时候笔者是有点感慨的。我们在前面的章节里一直在跟线程、锁、原子操作打交道,这些工具给了我们精确的控制力——代价是你得自己管所有事情。线程的创建和销毁、同步机制的设计、结果从子线程搬回主线程、异常怎么传回来,每次写一个并发任务都要重复这套流程。ch05 里我们用 `std::async` 和 `std::future` 
简化了一些工作,但你很快就会发现:当你需要把多个异步操作串联起来——先读文件,再解析数据,最后写回结果——future 链的管理就变得非常笨拙。 这就是异步编程要解决的核心问题:**如何优雅地组织和组合多个异步操作**。这个问题不是 C++ 独有的,几乎所有语言都在经历同样的演进——从回调(callback)到 future/promise 链,再到协程(coroutine)。这一篇我们要把这个演进脉络从头到尾理清楚,看清楚每种模型的动机是什么、解决了什么问题、又引入了什么新问题,最后理解为什么 C++20 协程被很多人认为是"异步编程的正确打开方式"。 diff --git a/documents/vol5-concurrency/ch08-debug-testing-perf/02-concurrency-benchmarks.md b/documents/vol5-concurrency/ch08-debug-testing-perf/02-concurrency-benchmarks.md index a79950022..c701ccfa0 100644 --- a/documents/vol5-concurrency/ch08-debug-testing-perf/02-concurrency-benchmarks.md +++ b/documents/vol5-concurrency/ch08-debug-testing-perf/02-concurrency-benchmarks.md @@ -24,6 +24,8 @@ related: # 并发性能测试与基准 +> 📖 **深入阅读**:这篇只讲并发场景下的基准测试。更通用的性能工程——benchmark 方法论、cache 友好性、SIMD/AVX、读汇编——是 [卷六·性能工程](../../vol6-performance/index.md)的主场。 + 上一篇我们解决了正确性问题——用 TSan 抓 data race、用 Helgrind 查锁顺序、用 Clang TSA 在编译期预防线程安全违规。但是,一个正确的并发程序不等于一个高效的并发程序。笔者见过太多这样的场景:某人花了三天把一个 mutex 换成了无锁队列,兴奋地宣布"性能提升了 3 倍",结果一看 benchmark 方法——单次运行、没有预热、编译器差点把整个循环优化没了、连 `UseRealTime` 都没加。你测出来的"3 倍提升"可能只是测量误差。 这一篇我们要解决的核心问题是:如何科学地测量并发程序的性能。我们会从 Google Benchmark 的基础用法开始,然后深入并发 benchmark 的设计陷阱(这里面坑多得超乎想象),再通过一个实战案例对比不同同步方案的真实性能差异,最后介绍 `perf stat` 这个 Linux 下的性能计数器工具——它能告诉你程序到底慢在哪里。 diff --git a/documents/vol5-concurrency/ch09-distributed-bridge/01-from-concurrent-to-distributed.md b/documents/vol5-concurrency/ch09-distributed-bridge/01-from-concurrent-to-distributed.md index a62822e95..f7733d110 100644 --- a/documents/vol5-concurrency/ch09-distributed-bridge/01-from-concurrent-to-distributed.md +++ b/documents/vol5-concurrency/ch09-distributed-bridge/01-from-concurrent-to-distributed.md @@ -25,6 +25,8 @@ related: # 从单机并发到分布式 +> ℹ️ **本节定位**:这一章是概念导览,不配可运行代码、也不引入外部框架。目的是让你在进入卷八的分布式实战之前,先把"单机并发 → 分布式"的认知框架搭起来——知道哪些旧经验还能用,哪些得推倒重来。 + 整卷我们都在讲同一台机器上的并发——一个进程里的多线程怎么安全地共享数据、怎么用原子操作做无锁同步、怎么用协程把异步代码写好看。这些知识非常扎实,但它们都建立在一个隐含的前提上:所有线程共享同一块内存,跑在同一个操作系统上,由同一个调度器管理。 现实是残酷的。当你的服务需要处理更多的请求、存储更多的数据时,一台机器迟早是不够的——不管是 CPU 
算力、内存容量还是网络带宽,总有一个维度会先碰到天花板。你不得不把服务部署到多台机器上,让它们协同工作。这时候,"并发"的问题就从进程内扩展到了网络上。你面对的不再是一个 `std::mutex`,而是一个跨网络的锁协调服务;不再是 `std::atomic`,而是一组需要就某个值达成一致的分布式副本。 diff --git a/documents/vol5-concurrency/ch09-distributed-bridge/02-distributed-primitives.md b/documents/vol5-concurrency/ch09-distributed-bridge/02-distributed-primitives.md index b0b5b2497..fab490382 100644 --- a/documents/vol5-concurrency/ch09-distributed-bridge/02-distributed-primitives.md +++ b/documents/vol5-concurrency/ch09-distributed-bridge/02-distributed-primitives.md @@ -23,6 +23,8 @@ related: # 分布式一致性原语初探 +> ℹ️ **本节定位**:承接上一篇,继续概念导览。这里讲的一致性模型谱系同样不配可运行代码,重在帮你建立"从强一致到弱一致"的直觉,为日后读分布式论文和卷八实战打底。 + 上一篇我们看到了单机并发和分布式系统的五大根本差异,理解了"网络不可靠、时钟不准确、局部失败不可避免"这些分布式环境下的事实。说实话,笔者第一次接触分布式一致性的时候是有被震撼到的——在单机上,一致性几乎是"免费的"(代价只是几纳秒的 lock/unlock),但到了分布式环境下,它变成了一个需要你用论文级别的协议、多轮网络通信、多数派投票才能换取的东西。这一篇我们就要面对这个核心难题——**一致性(consistency)**。 我们先建立一个直觉:当一份数据在多台机器上都有副本时,客户端从不同的副本读到的是否是同一个值?什么时候读到的是最新的值?不同副本之间的数据差多少?这些问题的答案取决于系统选择了什么样的一致性模型。一致性模型不是二选一的(要么一致要么不一致),而是一个从强到弱的谱系——理解这个谱系,是理解分布式系统的基本功,也是我们这篇的核心线索。 diff --git a/documents/vol5-concurrency/index.md b/documents/vol5-concurrency/index.md index 69f2c0242..ade48685d 100644 --- a/documents/vol5-concurrency/index.md +++ b/documents/vol5-concurrency/index.md @@ -25,4 +25,4 @@ tags: ch07 · Actor 与 Channel ch08 · 调试、测试与性能 ch09 · 分布式桥接附录 - \ No newline at end of file + diff --git a/documents/vol8-domains/embedded/00-env-setup/index.md b/documents/vol8-domains/embedded/00-env-setup/index.md new file mode 100644 index 000000000..135b99bee --- /dev/null +++ b/documents/vol8-domains/embedded/00-env-setup/index.md @@ -0,0 +1,24 @@ +--- +title: "开发环境搭建" +description: "从工具链、项目结构、CMake 到 WSL2 USB 透传与 GDB 调试,搭起 STM32 开发的完整脚手架" +platform: stm32f1 +tags: + - cpp-modern + - intermediate + - stm32f1 +--- + +# 开发环境搭建 + +> 从交叉编译工具链到完整 GDB 调试环境,把 STM32 开发的地基一次铺好——后面所有实战都站在这套环境上。 + +## 工具链与项目结构 + +- [第1篇:从零搭建 STM32 开发工具链](01-toolchain-setup.md) — 交叉编译原理与安装指南 +- [第2篇:项目结构篇](02-project-structure.md) — HAL 
库获取、启动文件坑位与目录搭建 +- [CMake 配置篇](03-cmake-configuration.md) — 从零构建 STM32 构建系统 + +## WSL2 与调试 + +- [环境搭建(四):WSL2 USB 透传](04-wsl2-usb.md) — 让 ST-Link 穿越虚拟化边界 +- [第5篇:调试进阶篇](05-debugging-guide.md) — 从 printf 到完整 GDB 调试环境 diff --git a/documents/vol8-domains/embedded/01-led/index.md b/documents/vol8-domains/embedded/01-led/index.md new file mode 100644 index 000000000..1adeadc80 --- /dev/null +++ b/documents/vol8-domains/embedded/01-led/index.md @@ -0,0 +1,44 @@ +--- +title: "LED 点灯:从 C 到 C++ 的演进" +description: "以点亮 PC13 上的 LED 为线索,从 C 宏驱动一路重构到 C++23 模板与零开销抽象" +platform: stm32f1 +tags: + - cpp-modern + - intermediate + - stm32f1 +--- + +# LED 点灯:从 C 到 C++ 的演进 + +> 一盏 LED,一条完整的现代 C++ 重构之路——从 HAL 寄存器操作到模板与 `if constexpr` 的编译期优化。 + +## 动机 + +- [第6篇:从点亮第一盏 LED 开始](01-motivation-and-overview.md) — 我们为什么要用现代 C++ 写 STM32 + +## 硬件基础 + +- [第7篇:GPIO 到底是什么](02-what-is-gpio.md) — 通用输入输出的前世今生 +- [第8篇:推挽、开漏与 PC13](03-output-modes-and-pc13.md) — LED 点亮的硬件秘密 + +## HAL 操作 + +- [第9篇:HAL 时钟使能](04-hal-gpio-clock.md) — 不开时钟,外设就是一坨睡死的硅 +- [第10篇:HAL_GPIO_Init](05-hal-gpio-init.md) — 把引脚配置告诉芯片的仪式 +- [第11篇:HAL_GPIO_WritePin 与 TogglePin](06-hal-gpio-output.md) — 让引脚动起来 + +## C 宏时代 + +- [第12篇:C 宏时代的 LED 驱动](07-c-macro-led-implementation.md) — 能跑但不优雅 + +## C++ 重构演进 + +- [第13篇:第一次重构 —— enum class](08-cpp-enum-class-revolution.md) — 取代宏,类型安全的开始 +- [第14篇:第二次重构 —— 模板](09-cpp-template-gpio.md) — 编译时绑定端口和引脚 +- [第15篇:第三次重构 —— if constexpr](10-cpp-if-constexpr-clock.md) — 让时钟使能在编译时自动选对 +- [第16篇:第四次重构 —— LED 模板](11-cpp-led-template.md) — 从通用 GPIO 到专用抽象 +- [第17篇:C++23 特性收尾](12-cpp23-attributes-and-features.md) — 属性、链接与零开销抽象的最终证明 + +## 总结 + +- [第18篇:常见坑位与实战练习](13-pitfalls-and-exercises.md) — 把 LED 玩出花样来 diff --git a/documents/vol8-domains/embedded/02-button/index.md b/documents/vol8-domains/embedded/02-button/index.md new file mode 100644 index 000000000..3d4249222 --- /dev/null +++ b/documents/vol8-domains/embedded/02-button/index.md @@ -0,0 +1,43 @@ +--- +title: "按键输入:消抖、状态机与类型安全" +description: "从 GPIO 输入电路到 
7 状态消抖状态机,再用 variant/concepts 把按键代码重构成类型安全的形态" +platform: stm32f1 +tags: + - cpp-modern + - intermediate + - stm32f1 +--- + +# 按键输入:消抖、状态机与类型安全 + +> 按键比 LED 难——硬件抖动、非阻塞消抖、状态机,再加上 C++ 的类型安全重构。 + +## 动机 + +- [第19篇:从输出到输入](01-from-output-to-input.md) — 为什么按钮比 LED 难 + +## 硬件基础 + +- [第20篇:GPIO 输入模式内部电路](02-gpio-input-circuits.md) — 芯片是如何 "听" 到外部信号的 +- [第21篇:按钮电路与机械抖动](03-button-hardware-and-bounce.md) — 真实世界的信号长什么样 + +## HAL 与轮询 + +- [第22篇:HAL GPIO 输入 API](04-hal-gpio-input.md) — 怎么用代码读到按钮状态 +- [第23篇:C 语言轮询按钮](05-c-polling-button.md) — 第一次亲手让按钮控制 LED + +## 消抖 + +- [第24篇:非阻塞消抖](06-non-blocking-debounce.md) — 不让 CPU 停下来等 +- [第25篇:7 状态消抖状态机](07-debounce-state-machine.md) — 本系列的核心 + +## C++ 重构演进 + +- [第26篇:`enum class` 重构按钮代码](08-cpp-enum-class-button.md) — 类型安全的输入 +- [第27篇:`std::variant` 事件 + `std::visit` 分发](09-cpp-variant-and-visit.md) — 类型安全的 "发生了什么" +- [第28篇:Button 模板类设计](10-cpp-template-button.md) — 把一切交给编译器 +- [第29篇:Concepts 约束回调](11-cpp-concepts-callback.md) — 完整代码走读 + +## 中断与总结 + +- [第30篇:EXTI 中断](12-exti-interrupt-and-exercises.md) — 坑位与练习 diff --git a/documents/vol8-domains/embedded/06-array-vs-raw-arrays.md b/documents/vol8-domains/embedded/06-array-vs-raw-arrays.md deleted file mode 100644 index b92f2e9f2..000000000 --- a/documents/vol8-domains/embedded/06-array-vs-raw-arrays.md +++ /dev/null @@ -1,88 +0,0 @@ ---- -chapter: 5 -cpp_standard: -- 11 -- 14 -- 17 -- 20 -description: 对比std::array与传统数组 -difficulty: intermediate -order: 6 -platform: stm32f1 -prerequisites: -- 'Chapter 3: 内存与对象管理' -reading_time_minutes: 5 -tags: -- cpp-modern -- intermediate -- stm32f1 -title: std::array vs 一般数组 ---- -# 嵌入式C++教程——`std::array` vs C 数组,你们知道嘛? - -写嵌入式代码时,你大概率会在两种写法间犹豫:`int buf[16];` 和 `std::array buf;`。如果你像我(或者像你之前的文章风格那样)既爱性能又爱优雅,你会想知道:哪种更"嵌入式友好"? 
- ------- - -## 为什么 `std::array` 看上去像"C 数组穿了件外套"——但其实更聪明 - -从表面看,`std::array` 底层就是包含一个 `T elems[N]` 的聚合类型:内存上元素是连续的,元素布局没有神秘的开销。所以在很多场景下,`std::array` 与裸数组的性能、内存占用是等价的。换句话说,你不会因为换成 `std::array` 而在运行时付出额外费用。 - -但 `std::array` 把数组包进了一个类型:它有值语义(可以拷贝、赋值)、有 `.size()`、有 `.data()`、有 `begin()`/`end()`、能和 STL 算法无缝对接、支持 `constexpr`(在现代编译器下),还能作为模板参数被更好地推断。最关键的是,它把"长度是类型的一部分"这一信息显式化,调用接口时更不容易丢失长度信息。 - -换句话说:`std::array` 是"更安全、更现代"的数组。 - ------- - -## 裸 C 数组的老实巴交与致命天真 - -裸数组的优点是"零抽象",也就是你对内存完全可控:这在启动代码、驱动层、位于特定地址空间的缓冲区(比如映射到某个外设寄存器地址)非常重要。裸数组在 ABI、链接器、对齐方面不给你出难题——只要你知道自己在做什么,它非常可靠。 - -但裸数组也带来一堆常见的踩坑:它会在函数参数中退化为指针(因此 `sizeof` 在函数里会给出指针大小),不能直接拷贝赋值(`b = a;` 会编译不过),也没有任何边界或尺寸信息保护。嵌入式代码里,这些"方便的缺失"会让你经常写 `memcpy`、频繁查 `N` 是否写对、在审查时犯"忘记传长度"的低级错误。 - -一个真实场景:你把裸数组传给 C API 做 DMA,忘了告诉调用方长度,结果 DMA 越界写到你最珍贵的变量上。裸数组没有提醒你这类低概率高代价错误。 - ------- - -## `std::array` 的优点:更安全、更可读、和现代 C++ 更友好 - -`std::array` 的日常优势可以总结为:语义清晰、接口友好、可与算法直接配合。例如,`std::sort(a.begin(), a.end())` 或 `std::span(a)` 都是顺手可得的好处。`std::array` 可以 `=`, 复制,甚至作为函数返回值安全返回(不会退化),这在很多中层逻辑里能让代码更简洁、更少内存操作 bug。 - -在嵌入式上下文,这意味着测试代码、单元测试桩、缓冲区封装这些地方会更干净:你可以写成返回 `std::array` 的函数而不是整堆 `memcpy`。而且当编译器支持 `constexpr` 时,`std::array` 能在编译期构造常量表,代码既高效又安全。 - ------- - -## 那么什么时候应该继续用裸 C 数组? - -`std::array` 很好,但并不是无敌。在下面几类场景,裸数组仍然是更合适的选择: - -1. **初始化阶段或早期引导代码(startup / crt0)**:在 `main()` 之前,C++ 的全局构造规则和运行时支持可能会麻烦。裸数组在这类代码里更直白、更可靠,尤其是当你需要绝对确保没有任何构造器或运行时代码介入时。 -2. **放在特定链接段 / 放到固定地址**:像中断向量表、设备映射缓冲区、bootloader 的表格等,往往需要在链接脚本里精确声明对象位置和字节序。裸数组更直接映射到期望的内存布局,减少不必要的抽象。 -3. **严格的 ABI 或与外部 C API 的互操作,且你需要写裸指针**:虽然 `std::array` 有 `.data()`,但在一些非常讲究二进制兼容性的场景中,审计时用裸数组更直观(尤其是老代码基)。 -4. **极端资源受限且要避免编译器生成任何额外元信息**:这类情况稀有,但存在于某些超嵌入式或者内核最底层代码中。 - ------- - -## 所以怎么说? 
- -裸数组是简洁、可靠的工具,适合最接近硬件的那一层;`std::array` 是更现代、更安全、更贴合 C++ 思想的容器,适合业务逻辑、算法层以及绝大多数嵌入式应用代码。把二者当作工具箱里的两把刀:修理芯片引脚用军刀(裸数组),写协议解析和缓冲逻辑用精密小刀(`std::array`)。 - -最后一句鸡汤式建议:当你能把数组尺寸写成 `std::array` 的模板参数时,就写成 `std::array`;当你必须在链接脚本或最早期的引导代码里精确控制每个字节时,回到裸数组,别害羞。嵌入式开发不是为了"保持纯粹",而是为了按实际需要用对工具——`std::array` 很多时候会让你代码更少、错误更少,偶尔你还是得把手伸进裸内存去修一修底层。 - ------- - -## 在线运行 - -在线运行 std::array vs C 数组的对比示例,验证零开销抽象: - - - -## 代码示例 diff --git a/documents/vol8-domains/embedded/core-embedded-cpp-index.md b/documents/vol8-domains/embedded/core-embedded-cpp-index.md index eba10fa99..dae8de89d 100644 --- a/documents/vol8-domains/embedded/core-embedded-cpp-index.md +++ b/documents/vol8-domains/embedded/core-embedded-cpp-index.md @@ -44,7 +44,7 @@ order: 0 ## Chapter 3 - 内存与对象管理 - [初始化列表](../../vol3-standard-library/01-initializer-lists.md) -- [空基类优化(EBO)](./04-empty-base-optimization.md) +- [空基类优化(EBO)](../../vol4-advanced/03-empty-base-optimization.md) - [对象大小,平凡类型](../../vol3-standard-library/05-object-size-and-trivial-types.md) ## Chapter 4 - 编译期计算 @@ -58,7 +58,6 @@ order: 0 - [对象池模式](./03-object-pool-pattern.md) - [禁用 heap 或限制 heap 时的替代策略:放置new(Placement New)的使用](./04-placement-new.md) - [固定池分配](./05-fixed-pool-allocation.md) -- [array vs 一般数组,你们知道嘛?](./06-array-vs-raw-arrays.md) ## Chapter 7 - 容器与数据结构 diff --git a/documents/vol8-domains/embedded/index.md b/documents/vol8-domains/embedded/index.md index 3c3cbd1fe..c55e2cc94 100644 --- a/documents/vol8-domains/embedded/index.md +++ b/documents/vol8-domains/embedded/index.md @@ -10,23 +10,20 @@ tags: # 嵌入式开发 -> 状态:规划中 +> 现代 C++ 在资源受限的嵌入式系统里能做什么、不能做什么——从零开销抽象、内存管理到外设编程、中断并发,再到 STM32 实战与 RTOS。 -## 概述 +## STM32F1 实战系列 -本子领域覆盖现代 C++ 在嵌入式系统中的应用,包括资源约束、零开销抽象、内存管理、外设编程、中断并发、STM32 实战、RTOS 等。 +这是一条从零开始、用现代 C++ 写 STM32 的完整路线,按 "环境 → LED → 按键 → 串口" 的顺序展开,每个外设都从 C 起步,一路重构到 C++23: -预计 45-50 篇文章。 +- [开发环境搭建](00-env-setup/index.md) — 工具链、项目结构、CMake、WSL2 USB 透传、GDB 调试 +- [LED 点灯:从 C 到 C++ 的演进](01-led/index.md) — 从 HAL 寄存器到模板与 `if constexpr` +- 
[按键输入:消抖、状态机与类型安全](02-button/index.md) — 从轮询到 `variant`/`concepts` +- [UART 串口通信](03-uart/index.md) — 从协议到中断驱动、`std::expected` 错误处理 -## 章节导航 +## 嵌入式专题文章 -> 内容编写中,敬请期待。 - -## 现有内容(待重写) - -本目录包含从旧教程迁移的大量嵌入式相关文章和 STM32F1 实战系列,后续将全面重写。 - -### 文章列表 +这些是从旧教程迁移来的专题文章,覆盖零开销抽象、内存管理、寄存器访问、中断安全等主题,可作为实战系列的横向补充: 嵌入式现代 C++ 教程——零开销抽象 @@ -35,12 +32,10 @@ tags: 嵌入式 C++ 教程——静态存储与栈上分配策略 嵌入式 C++ 教程:对象池模式 编译期多态 vs 运行时多态 - 空基类优化(EBO):C++ 的瘦身技巧 嵌入式 C++ 教程:placement new 嵌入式 C++ 教程:Slab / Arena 实现与比较 嵌入式 C++ 教程——ETL 中断安全的代码编写 - 嵌入式 C++ 教程——std::array vs C 数组 循环缓冲区 侵入式容器设计 类型安全的寄存器访问 diff --git a/documents/vol9-open-source-project-learn/chrome/01_once_callback/full/01-6-once-callback-testing-and-perf.md b/documents/vol9-open-source-project-learn/chrome/01_once_callback/full/01-6-once-callback-testing-and-perf.md index fa426c9f3..343d72977 100644 --- a/documents/vol9-open-source-project-learn/chrome/01_once_callback/full/01-6-once-callback-testing-and-perf.md +++ b/documents/vol9-open-source-project-learn/chrome/01_once_callback/full/01-6-once-callback-testing-and-perf.md @@ -243,7 +243,7 @@ Chromium 总是堆分配(`new BindState`),但分配只发生一次。之 ## 小结 -这一篇我们做了两件事。测试方面,围绕六个不变量(基本调用、移动语义、单次调用、参数绑定、取消机制、链式组合)设计了 11 个 Catch2 测试用例,覆盖了 OnceCallback 的所有核心行为。性能方面,对比了与 Chromium OnceCallback 在对象大小、分配行为和调用开销上的差异——我们的实现用紧凑性换来了简洁性。 +这一篇我们做了两件事。测试方面,围绕六个不变量(基本调用、移动语义、单次调用、参数绑定、取消机制、链式组合)设计了 12 个 Catch2 测试用例,覆盖了 OnceCallback 的所有核心行为。性能方面,对比了与 Chromium OnceCallback 在对象大小、分配行为和调用开销上的差异——我们的实现用紧凑性换来了简洁性。 到这里,OnceCallback 组件的设计、实现和验证就全部完成了。13 篇文章从前置知识到实战,覆盖了从 C++11 移动语义到 C++23 deducing this 的完整知识链。希望这个系列能帮助你理解"如何用现代 C++ 设计一个工业级的组件"——不仅仅是写代码,更重要的是理解每一个设计决策背后的原因。 diff --git a/documents/vol9-open-source-project-learn/chrome/01_once_callback/full/index.md b/documents/vol9-open-source-project-learn/chrome/01_once_callback/full/index.md index afe17612e..a18c83e81 100644 --- a/documents/vol9-open-source-project-learn/chrome/01_once_callback/full/index.md +++ 
b/documents/vol9-open-source-project-learn/chrome/01_once_callback/full/index.md @@ -33,7 +33,7 @@ 前置知识章节中涉及的 C++ 独立示例代码已提炼为可编译的最小工程,位于: -``` +```text code/volumn_codes/vol9/full_tutorial_codes/chrome_design/ ``` diff --git a/documents/vol9-open-source-project-learn/chrome/01_once_callback/hands_on/03-once-callback-testing.md b/documents/vol9-open-source-project-learn/chrome/01_once_callback/hands_on/03-once-callback-testing.md index d59c1b822..76c392f99 100644 --- a/documents/vol9-open-source-project-learn/chrome/01_once_callback/hands_on/03-once-callback-testing.md +++ b/documents/vol9-open-source-project-learn/chrome/01_once_callback/hands_on/03-once-callback-testing.md @@ -284,7 +284,7 @@ code/volumn_codes/vol9/chrome_design/ ## 小结 -这篇验证篇我们做了两件事。测试方面,围绕六个不变量(基本调用、移动语义、单次调用、参数绑定、取消机制、链式组合)设计了 11 个 Catch2 测试用例,覆盖了 `OnceCallback` 的所有核心行为。性能方面,对比了与 Chromium `OnceCallback` 在对象大小、分配行为和调用开销上的差异——我们的实现用紧凑性换来了简洁性,对绝大多数场景来说这个取舍是值得的。 +这篇验证篇我们做了两件事。测试方面,围绕六个不变量(基本调用、移动语义、单次调用、参数绑定、取消机制、链式组合)设计了 12 个 Catch2 测试用例,覆盖了 `OnceCallback` 的所有核心行为。性能方面,对比了与 Chromium `OnceCallback` 在对象大小、分配行为和调用开销上的差异——我们的实现用紧凑性换来了简洁性,对绝大多数场景来说这个取舍是值得的。 下一步可以尝试的方向:实现 `RepeatingCallback`(可复制、可重复调用的版本),给 `bind_once` 添加 `Unretained` / `Owned` / `WeakPtr` 等生命周期辅助函数,或者用 Google Benchmark 做精确的性能测量。 diff --git a/todo/000-project-roadmap.md b/todo/000-project-roadmap.md index 1ad51d458..f76776e47 100644 --- a/todo/000-project-roadmap.md +++ b/todo/000-project-roadmap.md @@ -75,8 +75,8 @@ estimated_effort: large ## Acceptance -- [ ] 新 TODO 文件数控制在 15-20 个。 -- [ ] 每个旧 TODO 有明确归宿或丢弃说明。 -- [ ] 每卷都有资产、缺口、价值判断。 -- [ ] 当前 `todo/` 只保留内容发展 TODO 和社区建设 TODO。 +- [x] 新 TODO 文件数控制在 15-20 个。(当前 14 个:13 编号/治理 + README,已大幅收敛) +- [x] 每个旧 TODO 有明确归宿或丢弃说明。 +- [x] 每卷都有资产、缺口、价值判断。 +- [x] 当前 `todo/` 只保留内容发展 TODO 和社区建设 TODO。 - [ ] 进入深度优化和发展规划阶段前完成一次人工审阅。 diff --git a/todo/031-qa-knowledge-base.md b/todo/031-qa-knowledge-base.md index 557956501..f0f9223a1 100644 --- a/todo/031-qa-knowledge-base.md +++ b/todo/031-qa-knowledge-base.md 
@@ -18,7 +18,7 @@ estimated_effort: medium - `documents/appendix/terminology.md` 已有术语表。 - 文档中已有零散 StackOverflow、cppreference、WG21 链接。 - `.github/DISCUSSION_TEMPLATE/` 已有方向讨论模板。 -- `todo/091-community-qa-knowledge-base.md` 有旧规划。 +- 旧 `091-community-qa-knowledge-base.md` 的规划已合并进本文件,原文件已移除。 ## Gaps @@ -53,10 +53,10 @@ estimated_effort: medium ## Old TODO Merge -- `091-community-qa-knowledge-base.md` +- [x] `091-community-qa-knowledge-base.md`(已合并进本文件,原文件已移除) ## Acceptance - [ ] FAQ 入口设计完成。 - [ ] QA index 结构完成。 -- [ ] CONTRIBUTING 中说明如何提交 QA 条目。 +- [x] CONTRIBUTING 中说明如何提交 QA 条目。(已落地于 CONTRIBUTING.md「QA 与知识库规则」段) From 7de3df82dcc9986403376e25cc14d96d92f3854b Mon Sep 17 00:00:00 2001 From: Charliechen114514 <725610365@qq.com> Date: Sat, 13 Jun 2026 20:37:38 +0800 Subject: [PATCH 2/2] ci: fix the read time issue of en version --- .../ch11-user-defined-literals/01-udl-basics.md | 2 +- .../01_once_callback/hands_on/03-once-callback-testing.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/documents/en/vol2-modern-features/ch11-user-defined-literals/01-udl-basics.md b/documents/en/vol2-modern-features/ch11-user-defined-literals/01-udl-basics.md index ea52bc82d..2bb7e7139 100644 --- a/documents/en/vol2-modern-features/ch11-user-defined-literals/01-udl-basics.md +++ b/documents/en/vol2-modern-features/ch11-user-defined-literals/01-udl-basics.md @@ -14,7 +14,7 @@ cpp_standard: - 11 - 14 - 17 -reading_time_minutes: 15 +reading_time_minutes: 9 prerequisites: - 'Chapter 2: constexpr 基础' related: diff --git a/documents/en/vol9-open-source-project-learn/chrome/01_once_callback/hands_on/03-once-callback-testing.md b/documents/en/vol9-open-source-project-learn/chrome/01_once_callback/hands_on/03-once-callback-testing.md index 03c16d04d..3f76a0e09 100644 --- a/documents/en/vol9-open-source-project-learn/chrome/01_once_callback/hands_on/03-once-callback-testing.md +++ 
b/documents/en/vol9-open-source-project-learn/chrome/01_once_callback/hands_on/03-once-callback-testing.md @@ -15,7 +15,7 @@ difficulty: advanced platform: host cpp_standard: - 23 -reading_time_minutes: 20 +reading_time_minutes: 13 prerequisites: - once_callback 设计指南(一):动机与接口设计 - once_callback 设计指南(二):逐步实现