Add more ChunkMethod variants for different chunk sizes

Fallen-Breath · Fallen-Breath · commit 9da909b4b35d · 2026-05-02T01:54:13.000+08:00
diff --git a/prime_backup/constants/chunk_constants.py b/prime_backup/constants/chunk_constants.py
@@ -1,9 +1,5 @@
 HASH_METHOD = 'blake3'
 
-CDC_AVG_SIZE = 32 * 1024          # 32KiB
-CDC_MIN_SIZE = CDC_AVG_SIZE // 4  # 8KiB
-CDC_MAX_SIZE = CDC_AVG_SIZE * 8   # 256KiB
-
 # 256chunk/group * (32KiB/chunk * 1.2) == ~10MiB/group  -->  ~200 groups for a 2GB file
 CHUNK_GROUP_AVG_SIZE = 256
 CHUNK_GROUP_MIN_SIZE = 64
diff --git a/prime_backup/types/chunk_method.py b/prime_backup/types/chunk_method.py
@@ -2,14 +2,21 @@
 from pathlib import Path
 from typing import Optional, IO
 
-from prime_backup.types.chunker_factory import CDCChunkerFactory, Fixed4KChunkerFactory
+from prime_backup.types.chunker_factory import CDCChunkerFactory, FixedSizeChunkerFactory
 from prime_backup.utils.chunker import Chunker
 from prime_backup.utils.path_like import PathLike
 
 
 class ChunkMethod(enum.Enum):
-	cdc_32k = CDCChunkerFactory()
-	fixed_4k = Fixed4KChunkerFactory()
+	# Content-Defined Chunking (CDC)
+	cdc_32k = CDCChunkerFactory(avg_size=32 * 1024, min_size=8 * 1024, max_size=256 * 1024)
+	cdc_128k = CDCChunkerFactory(avg_size=128 * 1024, min_size=64 * 1024, max_size=1024 * 1024)
+	cdc = cdc_32k
+
+	# Fixed-Size Chunking
+	fixed_4k = FixedSizeChunkerFactory(4 * 1024)
+	fixed_32k = FixedSizeChunkerFactory(32 * 1024)
+	fixed_128k = FixedSizeChunkerFactory(128 * 1024)
 
 	@classmethod
 	def get_for_file(cls, file_path: PathLike, file_size: int) -> Optional['ChunkMethod']:
diff --git a/prime_backup/types/chunker_factory.py b/prime_backup/types/chunker_factory.py
@@ -1,10 +1,11 @@
+import dataclasses
 from abc import abstractmethod, ABC
 from pathlib import Path
 from typing import IO
 
 from typing_extensions import override
 
-from prime_backup.utils.chunker import Chunker, CDCFileChunker, CDCStreamChunker, Fixed4KFileChunker, Fixed4KStreamChunker
+from prime_backup.utils.chunker import Chunker, CDCFileChunker, CDCStreamChunker, FixedSizeFileChunker, FixedSizeStreamChunker, CDCChunkerConfig
 
 
 class ChunkerFactory(ABC):
@@ -17,21 +18,33 @@ def create_stream_chunker(self, stream, need_entire_file_hash: bool) -> Chunker:
 		...
 
 
+@dataclasses.dataclass(frozen=True)
 class CDCChunkerFactory(ChunkerFactory):
+	avg_size: int
+	min_size: int
+	max_size: int
+	_config: CDCChunkerConfig = dataclasses.field(init=False, repr=False, compare=False)
+
+	def __post_init__(self):
+		object.__setattr__(self, '_config', CDCChunkerConfig(self.avg_size, self.min_size, self.max_size))
+
 	@override
 	def create_file_chunker(self, file_path: Path, need_entire_file_hash: bool) -> Chunker:
-		return CDCFileChunker(file_path, need_entire_file_hash)
+		return CDCFileChunker(self._config, file_path, need_entire_file_hash)
 
 	@override
 	def create_stream_chunker(self, stream: IO[bytes], need_entire_file_hash: bool) -> Chunker:
-		return CDCStreamChunker(stream, need_entire_file_hash)
+		return CDCStreamChunker(self._config, stream, need_entire_file_hash)
+
 
+@dataclasses.dataclass(frozen=True)
+class FixedSizeChunkerFactory(ChunkerFactory):
+	chunk_size: int
 
-class Fixed4KChunkerFactory(ChunkerFactory):
 	@override
 	def create_file_chunker(self, file_path: Path, need_entire_file_hash: bool) -> Chunker:
-		return Fixed4KFileChunker(file_path, need_entire_file_hash)
+		return FixedSizeFileChunker(self.chunk_size, file_path, need_entire_file_hash)
 
 	@override
 	def create_stream_chunker(self, stream, need_entire_file_hash: bool) -> Chunker:
-		return Fixed4KStreamChunker(stream, need_entire_file_hash)
+		return FixedSizeStreamChunker(self.chunk_size, stream, need_entire_file_hash)
diff --git a/prime_backup/utils/chunker.py b/prime_backup/utils/chunker.py
@@ -6,7 +6,6 @@
 
 from typing_extensions import override
 
-from prime_backup.constants import chunk_constants
 from prime_backup.utils import misc_utils, hash_utils, chunk_utils
 
 if TYPE_CHECKING:
@@ -85,71 +84,87 @@ def get_read_file_size(self) -> int:
 
 # ======================== CDC Chunker ========================
 
-def _create_cdc_engine() -> 'pyfastcdc.FastCDC':
-	from pyfastcdc import FastCDC
-	return FastCDC(
-		avg_size=chunk_constants.CDC_AVG_SIZE,
-		min_size=chunk_constants.CDC_MIN_SIZE,
-		max_size=chunk_constants.CDC_MAX_SIZE,
-		normalized_chunking=1,
-		seed=0,
-	)
+@dataclasses.dataclass(frozen=True)
+class CDCChunkerConfig:
+	avg_size: int
+	min_size: int
+	max_size: int
 
 
-class CDCFileChunker(Chunker):
-	def __init__(self, file_path: Path, need_entire_file_hash: bool = False):
+class _CDCChunker(Chunker, ABC):
+	def __init__(self, cfg: CDCChunkerConfig, need_entire_file_hash: bool):
 		super().__init__(need_entire_file_hash)
+		self.cfg = cfg
+
+	def _create_cdc_engine(self) -> 'pyfastcdc.FastCDC':
+		from pyfastcdc import FastCDC
+		return FastCDC(
+			avg_size=self.cfg.avg_size,
+			min_size=self.cfg.min_size,
+			max_size=self.cfg.max_size,
+			normalized_chunking=1,
+			seed=0,
+		)
+
+
+class CDCFileChunker(_CDCChunker):
+	def __init__(self, cfg: CDCChunkerConfig, file_path: Path, need_entire_file_hash: bool = False):
+		super().__init__(cfg, need_entire_file_hash)
 		self.file_path = file_path
 
 	@override
 	def _iter_raw_chunks(self) -> Iterable[_RawChunk]:
-		for c in _create_cdc_engine().cut_file(self.file_path):
-			misc_utils.assert_true(c.length <= chunk_constants.CDC_MAX_SIZE, f'cdc cut chunk size too large: {c.length}')
+		for c in self._create_cdc_engine().cut_file(self.file_path):
+			misc_utils.assert_true(c.length <= self.cfg.max_size, f'cdc cut chunk size too large: {c.length}')
 			yield _RawChunk(offset=c.offset, length=c.length, data=c.data)
 
 
-class CDCStreamChunker(Chunker):
-	def __init__(self, stream: 'pyfastcdc.BinaryStreamReader', need_entire_file_hash: bool = False):
-		super().__init__(need_entire_file_hash)
+class CDCStreamChunker(_CDCChunker):
+	def __init__(self, cfg: CDCChunkerConfig, stream: 'pyfastcdc.BinaryStreamReader', need_entire_file_hash: bool = False):
+		super().__init__(cfg, need_entire_file_hash)
 		self.stream = stream
 
 	@override
 	def _iter_raw_chunks(self) -> Iterable[_RawChunk]:
-		for c in _create_cdc_engine().cut_stream(self.stream):
-			misc_utils.assert_true(c.length <= chunk_constants.CDC_MAX_SIZE, f'cdc cut chunk size too large: {c.length}')
+		for c in self._create_cdc_engine().cut_stream(self.stream):
+			misc_utils.assert_true(c.length <= self.cfg.max_size, f'cdc cut chunk size too large: {c.length}')
 			yield _RawChunk(offset=c.offset, length=c.length, data=c.data)
 
 
-# ======================== Fixed 4K Chunker ========================
+# ======================== Fixed Size Chunker ========================
+
 
-_FIXED_4K_SIZE = 4 * 1024  # 4KiB
+class _FixedSizeChunker(Chunker, ABC):
+	def __init__(self, chunk_size: int, need_entire_file_hash: bool):
+		super().__init__(need_entire_file_hash)
+		self.chunk_size = chunk_size
 
-def _cut_stream_by_fixed_4k(stream: IO[bytes]) -> Generator[_RawChunk, None, None]:
-	offset = 0
-	while True:
-		buf = stream.read(_FIXED_4K_SIZE)
-		if not buf:
-			break
-		yield _RawChunk(offset=offset, length=len(buf), data=memoryview(buf))
-		offset += len(buf)
+	def _cut_stream_by_fixed_size(self, stream: IO[bytes]) -> Generator[_RawChunk, None, None]:
+		offset = 0
+		while True:
+			buf = stream.read(self.chunk_size)
+			if not buf:
+				break
+			yield _RawChunk(offset=offset, length=len(buf), data=memoryview(buf))
+			offset += len(buf)
 
 
-class Fixed4KFileChunker(Chunker):
-	def __init__(self, file_path: Path, need_entire_file_hash: bool = False):
-		super().__init__(need_entire_file_hash)
+class FixedSizeFileChunker(_FixedSizeChunker):
+	def __init__(self, chunk_size: int, file_path: Path, need_entire_file_hash: bool = False):
+		super().__init__(chunk_size, need_entire_file_hash)
 		self.file_path = file_path
 
 	@override
 	def _iter_raw_chunks(self) -> Iterable[_RawChunk]:
 		with open(self.file_path, 'rb') as f:
-			yield from _cut_stream_by_fixed_4k(f)
+			yield from self._cut_stream_by_fixed_size(f)
 
 
-class Fixed4KStreamChunker(Chunker):
-	def __init__(self, stream: IO[bytes], need_entire_file_hash: bool = False):
-		super().__init__(need_entire_file_hash)
+class FixedSizeStreamChunker(_FixedSizeChunker):
+	def __init__(self, chunk_size: int, stream: IO[bytes], need_entire_file_hash: bool = False):
+		super().__init__(chunk_size, need_entire_file_hash)
 		self.stream = stream
 
 	@override
 	def _iter_raw_chunks(self) -> Iterable[_RawChunk]:
-		yield from _cut_stream_by_fixed_4k(self.stream)
+		yield from self._cut_stream_by_fixed_size(self.stream)