The universal chunking config: chunking_enabled and chunking_rules

Fallen-Breath · Fallen-Breath · commit e995ba4ef8f0 · 2026-04-23T03:04:38.000+08:00
diff --git a/docs/config.md b/docs/config.md
@@ -225,10 +225,15 @@ Configs on how the backup is made
     ],
     "mutating_file_patterns": [],
 
-    "cdc_enabled": false,
-    "cdc_file_size_threshold": 104857600,
-    "cdc_patterns": [
-       "**/*.db"
+    "chunking_enabled": false,
+    "chunking_rules": [
+        {
+            "algorithm": "cdc",
+            "file_size_threshold": 104857600,
+            "patterns": [
+                "**/*.db"
+            ]
+        }
     ],
 
     "hash_method": "blake3",
@@ -441,52 +446,53 @@ and can speed up the processing of such files during backup creation
 - Type: `List[str]`
 - Default: `[]`
 
-#### cdc_enabled
+#### chunking_enabled
 
-Whether to enable content-defined chunking (CDC) for large files during backup creation
+Whether to enable file chunking during backup creation
 
-CDC stands for `Content-Defined Chunking`.
-Unlike fixed-size chunking, CDC determines chunk boundaries from the file content itself,
-so when data is inserted, deleted, or modified locally, many unchanged regions can still be cut into the same chunks and be reused across backups
+When enabled, Prime Backup iterates through [chunking_rules](#chunking_rules) in order for each file.
+The first rule whose `patterns` match the file path and whose `file_size_threshold` is met will be applied.
+If no rule matches, the file is stored as a regular direct blob without chunking
 
 Changing this option only affects files newly stored in future backups.
 Existing direct blobs or chunked blobs will not be converted automatically
 
-!!! note
-
-    CDC chunking requires the optional `pyfastcdc` dependency.
-    You can install all optional dependencies with `pip3 install -r requirements.optional.txt`,
-    or install `pyfastcdc` manually
-
 - Type: `bool`
 - Default: `false`
 
-#### cdc_file_size_threshold
+#### chunking_rules
 
-The minimum file size in bytes for a file to be considered for CDC chunking
+A list of chunking rules evaluated in order when [chunking_enabled](#chunking_enabled) is `true`
 
-Files smaller than this threshold will continue to use the regular direct blob storage flow,
-even if [cdc_enabled](#cdc_enabled) is enabled and the path matches [cdc_patterns](#cdc_patterns)
+For each file, Prime Backup walks through this list and applies the first rule whose `patterns` match the file path and whose `file_size_threshold` is met.
+If no rule matches, the file is stored as a regular direct blob
 
-Changing this option only affects files newly stored in future backups.
-Existing stored data will not be repartitioned automatically
+Each rule contains the following fields:
 
-- Type: `int`
-- Default: `104857600` (`100 MiB`)
+- `algorithm`: The chunking algorithm to use. Currently only `"cdc"` is available
 
-#### cdc_patterns
+    CDC stands for Content-Defined Chunking. Unlike fixed-size chunking, CDC determines chunk boundaries from the file content itself,
+    so when data is inserted, deleted, or modified locally, many unchanged regions can still be cut into the same chunks and be reused across backups
 
-A list of [gitignore flavor](http://git-scm.com/docs/gitignore) pattern strings,
-matched against file paths relative to [source_root](#source_root)
+    !!! note
 
-CDC chunking will only be applied when the file path matches one of these patterns,
-the file size reaches [cdc_file_size_threshold](#cdc_file_size_threshold),
-and [cdc_enabled](#cdc_enabled) is enabled
+        CDC chunking requires the optional `pyfastcdc` dependency.
+        You can install all optional dependencies with `pip3 install -r requirements.optional.txt`,
+        or install `pyfastcdc` manually
 
-The default value is `["**/*.db"]`.
-It is recommended to keep this list narrow and only include large files that are often modified locally and really need to be backed up
+- `file_size_threshold`: The minimum file size in bytes for a file to be eligible for this rule.
+  Files smaller than this value will not match this rule, even if their path matches `patterns`
 
-- Type: `List[str]`
+- `patterns`: A list of [gitignore flavor](http://git-scm.com/docs/gitignore) pattern strings,
+  matched against file paths relative to [source_root](#source_root)
+
+The default value contains one rule that applies CDC chunking to `.db` files of at least 100 MiB in size.
+It is recommended to keep the rules narrow and only cover large files that are often modified locally and really need to be backed up
+
+Changing this option only affects files newly stored in future backups.
+Existing stored data will not be repartitioned automatically
+
+- Type: `List[ChunkingRule]`
 
 #### hash_method
 
diff --git a/docs/config.zh.md b/docs/config.zh.md
@@ -225,10 +225,15 @@ Prime Backup 在创建备份时的操作时序如下：
     ],
     "mutating_file_patterns": [],
 
-    "cdc_enabled": false,
-    "cdc_file_size_threshold": 104857600,
-    "cdc_patterns": [
-       "**/*.db"
+    "chunking_enabled": false,
+    "chunking_rules": [
+        {
+            "algorithm": "cdc",
+            "file_size_threshold": 104857600,
+            "patterns": [
+                "**/*.db"
+            ]
+        }
     ],
 
     "hash_method": "blake3",
@@ -441,50 +446,55 @@ Prime Backup 会检查文件的如下这些信息。下述这些信息完全一
 - 类型：`List[str]`
 - 默认值：`[]`
 
-#### cdc_enabled
+#### chunking_enabled
 
-是否在创建备份时，对大文件启用内容定义分块（CDC）
+是否在创建备份时，对文件启用分块存储
 
-CDC 是 `Content-Defined Chunking` 的缩写，即“按内容划分边界”的切块方式。
-它与固定大小切块不同，数据块边界由文件内容决定，因此当文件仅在局部发生增删改时，许多未变化的内容仍能被切成相同的数据块，从而复用已有数据块
+启用时，Prime Backup 会对每个文件依次遍历 [chunking_rules](#chunking_rules) 中的规则，
+并将第一条匹配的规则所指定的算法应用于该文件。
+若没有任何规则匹配，该文件将以常规直存数据对象（direct blob）的方式储存，不进行分块
 
 修改此选项只会影响后续备份中新写入的文件。
 已存在的直存数据对象（direct blob）或分块数据对象（chunked blob）不会被自动转换
 
-!!! note
-
-    CDC 分块需要可选依赖 `pyfastcdc`。
-    你可以通过 `pip3 install -r requirements.optional.txt` 安装全部可选依赖，
-    或者单独安装 `pyfastcdc`
-
 - 类型：`bool`
 - 默认值：`false`
 
-#### cdc_file_size_threshold
+#### chunking_rules
 
-文件参与 CDC 分块所需达到的最小大小，单位为字节。
+分块规则列表，在 [chunking_enabled](#chunking_enabled) 为 `true` 时，按顺序逐条匹配
 
-小于该阈值的文件，即使 [cdc_enabled](#cdc_enabled) 已启用、路径也匹配了 [cdc_patterns](#cdc_patterns)，
-仍会继续使用常规的直存数据对象（direct blob）存储流程
+对于每个文件，Prime Backup 会依次遍历该列表，将第一条 `patterns` 匹配文件路径、
+且文件大小达到 `file_size_threshold` 的规则应用于该文件。
+若无规则匹配，文件将以常规直存数据对象（direct blob）的方式储存
 
-修改此选项只会影响后续备份中新写入的文件。
-已入库的数据不会被自动重新切分
+每条规则包含以下字段：
 
-- 类型：`int`
-- 默认值：`104857600`（`100 MiB`）
+- `algorithm`：分块时使用的算法。目前仅支持 `"cdc"`
 
-#### cdc_patterns
+    CDC 是 Content-Defined Chunking（按内容划分边界的切块方式）的缩写。
+    它与固定大小切块不同，数据块边界由文件内容决定，因此当文件仅在局部发生增删改时，
+    许多未变化的内容仍能被切成相同的数据块，从而复用已有数据块
 
-一个 [gitignore 风格](http://git-scm.com/docs/gitignore) 的模板串列表，
-匹配对象是相对于 [source_root](#source_root) 的文件路径
+    !!! note
 
-只有当文件路径匹配这些模式、文件大小达到 [cdc_file_size_threshold](#cdc_file_size_threshold)、
-且 [cdc_enabled](#cdc_enabled) 已启用时，才会使用 CDC 分块
+        CDC 分块需要可选依赖 `pyfastcdc`。
+        你可以通过 `pip3 install -r requirements.optional.txt` 安装全部可选依赖，
+        或者单独安装 `pyfastcdc`
 
-默认值为 `["**/*.db"]`。
-建议将其控制得尽量精确，只包含那些体积大、经常发生局部修改、且确实需要备份的文件
+- `file_size_threshold`：文件参与本规则所需达到的最小大小，单位为字节。
+  小于此值的文件不会匹配本规则，即使其路径匹配了 `patterns`
 
-- 类型：`List[str]`
+- `patterns`：一个 [gitignore 风格](http://git-scm.com/docs/gitignore) 的模板串列表，
+  匹配对象是相对于 [source_root](#source_root) 的文件路径
+
+默认值中包含一条规则，对不小于 100 MiB 的 `.db` 文件启用 CDC 分块。
+建议将规则控制得尽量精确，只包含那些体积大、经常发生局部修改、且确实需要备份的文件
+
+修改此选项只会影响后续备份中新写入的文件。
+已入库的数据不会被自动重新切分
+
+- 类型：`List[ChunkingRule]`
 
 #### hash_method
 
diff --git a/prime_backup/config/backup_config.py b/prime_backup/config/backup_config.py
@@ -3,12 +3,24 @@
 from mcdreforged.api.utils import Serializable
 
 from prime_backup.compressors import CompressMethod
+from prime_backup.types.chunk_method import ChunkMethod
 from prime_backup.types.hash_method import HashMethod
 
 if TYPE_CHECKING:
 	import pathspec
 
 
+class ChunkingRule(Serializable):
+	algorithm: ChunkMethod
+	file_size_threshold: int
+	patterns: List[str] = []
+
+	@property
+	def patterns_spec(self) -> 'pathspec.GitIgnoreSpec':
+		from prime_backup.utils import pathspec_utils
+		return pathspec_utils.compile_gitignore_spec(self.patterns)
+
+
 class BackupConfig(Serializable):
 	# Source
 	source_root: str = './server'
@@ -31,20 +43,18 @@ class BackupConfig(Serializable):
 	]
 	mutating_file_patterns: List[str] = []
 
-	# Content-Define-Chunking for Large files
-	cdc_enabled: bool = False
-	cdc_file_size_threshold: int = 100 * 1048576  # 100MiB
-	cdc_patterns: List[str] = [
-		'**/*.db',
+	# Chunking
+	chunking_enabled: bool = False
+	chunking_rules: List[ChunkingRule] = [
+		ChunkingRule(
+			algorithm=ChunkMethod.cdc,
+			file_size_threshold=100 * 1048576,
+			patterns=[
+				'**/*.db'
+			],
+		),
 	]
 
-	# Fixed 4K chunking for .mca region files
-	# f4c_enabled: bool = False
-	# f4c_file_size_threshold: int = 128 * 1024  # 128KiB
-	# f4c_patterns: List[str] = [
-	# 	'**/*.mca',
-	# ]
-
 	# Storage
 	hash_method: HashMethod = HashMethod.blake3
 	compress_method: CompressMethod = CompressMethod.zstd
@@ -101,16 +111,6 @@ def creation_skip_missing_file_patterns_spec(self) -> 'pathspec.GitIgnoreSpec':
 		from prime_backup.utils import pathspec_utils
 		return pathspec_utils.compile_gitignore_spec(self.creation_skip_missing_file_patterns)
 
-	@property
-	def cdc_patterns_spec(self) -> 'pathspec.GitIgnoreSpec':
-		from prime_backup.utils import pathspec_utils
-		return pathspec_utils.compile_gitignore_spec(self.cdc_patterns)
-
-	# @property
-	# def f4c_patterns_spec(self) -> 'pathspec.GitIgnoreSpec':
-	# 	from prime_backup.utils import pathspec_utils
-	# 	return pathspec_utils.compile_gitignore_spec(self.f4c_patterns)
-
 	@property
 	def mutating_file_patterns_spec(self) -> 'pathspec.GitIgnoreSpec':
 		from prime_backup.utils import pathspec_utils
diff --git a/prime_backup/types/chunk_method.py b/prime_backup/types/chunk_method.py
@@ -19,13 +19,11 @@ def get_for_file(cls, file_path: PathLike, file_size: int) -> Optional['ChunkMet
 
 		if file_size <= 0:
 			return None
+		if not backup_config.chunking_enabled:
+			return None
 
-		# if backup_config.f4c_enabled and file_size >= backup_config.f4c_file_size_threshold:
-		# 	if backup_config.f4c_patterns_spec.match_file(file_path):
-		# 		return ChunkMethod.fixed_4k
-
-		if backup_config.cdc_enabled and file_size >= backup_config.cdc_file_size_threshold:
-			if backup_config.cdc_patterns_spec.match_file(file_path):
-				return ChunkMethod.cdc
+		for cfg in backup_config.chunking_rules:
+			if file_size >= cfg.file_size_threshold and cfg.patterns_spec.match_file(file_path):
+				return cfg.algorithm
 
 		return None
diff --git a/tests/test_fuzzy_run.py b/tests/test_fuzzy_run.py
@@ -36,8 +36,10 @@
 from prime_backup.action.validate_files_action import ValidateFilesAction
 from prime_backup.action.validate_filesets_action import ValidateFilesetsAction
 from prime_backup.compressors import CompressMethod
+from prime_backup.config.backup_config import ChunkingRule
 from prime_backup.config.config import Config
 from prime_backup.db.access import DbAccess
+from prime_backup.types.chunk_method import ChunkMethod
 from prime_backup.types.hash_method import HashMethod
 from prime_backup.types.operator import Operator
 from prime_backup.types.tar_format import TarFormat
@@ -495,9 +497,12 @@ def rm_test_files_dirs():
 		Config.get().backup.targets = [env_dir.name]
 		Config.get().backup.hash_method = HashMethod.xxh128
 		Config.get().backup.compress_method = CompressMethod.plain
-		Config.get().backup.cdc_enabled = True
-		Config.get().backup.cdc_file_size_threshold = 1 * 1048756  # 1MiB
-		Config.get().backup.cdc_patterns = ['**']
+		Config.get().backup.chunking_enabled = True
+		Config.get().backup.chunking_rules = [ChunkingRule(
+			algorithm=ChunkMethod.cdc,
+			file_size_threshold=1 * 1048576,  # 1MiB
+			patterns=['**'],
+		)]
 		DbAccess.init(create=True, migrate=False)
 
 		with contextlib.ExitStack() as es: