diff --git a/book/i18n/ko/src/puzzle_27/puzzle_27.md b/book/i18n/ko/src/puzzle_27/puzzle_27.md index d6bface1..2e0b1430 100644 --- a/book/i18n/ko/src/puzzle_27/puzzle_27.md +++ b/book/i18n/ko/src/puzzle_27/puzzle_27.md @@ -48,10 +48,12 @@ GPU 스레드 블록 (128 스레드, 4개 또는 2개 워프, 하드웨어 조 # 복잡한 블록 전체 리덕션 (기존 방식 - Puzzle 12에서): shared_memory[local_i] = my_value barrier() -for stride in range(64, 0, -1): +stride = 64 +while stride > 0: if local_i < stride: shared_memory[local_i] += shared_memory[local_i + stride] barrier() + stride //= 2 if local_i == 0: output[block_idx.x] = shared_memory[0] @@ -81,10 +83,12 @@ if local_i == 0: shared_memory[local_i] = my_value barrier() # 스트라이드 기반 인덱싱을 사용한 트리 리덕션... -for stride in range(64, 0, -1): +stride = 64 +while stride > 0: if local_i < stride: shared_memory[local_i] += shared_memory[local_i + stride] barrier() + stride //= 2 ``` ### **중간 단계: 워프 프로그래밍 (Puzzle 24)** diff --git a/book/src/puzzle_27/puzzle_27.md b/book/src/puzzle_27/puzzle_27.md index 87a8e6e3..4bc6a885 100644 --- a/book/src/puzzle_27/puzzle_27.md +++ b/book/src/puzzle_27/puzzle_27.md @@ -46,10 +46,12 @@ Learn the complete parallel programming toolkit from `gpu.primitives.block`: # Complex block-wide reduction (traditional approach - from Puzzle 12): shared_memory[local_i] = my_value barrier() -for stride in range(64, 0, -1): +stride = 64 +while stride > 0: if local_i < stride: shared_memory[local_i] += shared_memory[local_i + stride] barrier() + stride //= 2 if local_i == 0: output[block_idx.x] = shared_memory[0] @@ -79,10 +81,12 @@ Complex but educational - explicit shared memory, barriers, and tree reduction: shared_memory[local_i] = my_value barrier() # Tree reduction with stride-based indexing... -for stride in range(64, 0, -1): +stride = 64 +while stride > 0: if local_i < stride: shared_memory[local_i] += shared_memory[local_i + stride] barrier() + stride //= 2 ``` ### **The intermediate step: Warp programming (Puzzle 24)**