Skip to content

Commit 9da909b

Browse files
committed
more ChunkMethod for different chunk sizes
1 parent 70e02e3 commit 9da909b

4 files changed

Lines changed: 81 additions & 50 deletions

File tree

prime_backup/constants/chunk_constants.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,5 @@
11
HASH_METHOD = 'blake3'
22

3-
CDC_AVG_SIZE = 32 * 1024 # 32KiB
4-
CDC_MIN_SIZE = CDC_AVG_SIZE // 4 # 8KiB
5-
CDC_MAX_SIZE = CDC_AVG_SIZE * 8 # 256KiB
6-
73
# 256 chunks/group * (32KiB/chunk * 1.2) == ~10MiB/group --> ~200 groups for a 2GB file (assuming a 32KiB average chunk size)
84
CHUNK_GROUP_AVG_SIZE = 256
95
CHUNK_GROUP_MIN_SIZE = 64

prime_backup/types/chunk_method.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,21 @@
22
from pathlib import Path
33
from typing import Optional, IO
44

5-
from prime_backup.types.chunker_factory import CDCChunkerFactory, Fixed4KChunkerFactory
5+
from prime_backup.types.chunker_factory import CDCChunkerFactory, FixedSizeChunkerFactory
66
from prime_backup.utils.chunker import Chunker
77
from prime_backup.utils.path_like import PathLike
88

99

1010
class ChunkMethod(enum.Enum):
11-
cdc_32k = CDCChunkerFactory()
12-
fixed_4k = Fixed4KChunkerFactory()
11+
# Content-Defined Chunking (CDC)
12+
cdc_32k = CDCChunkerFactory(avg_size=32 * 1024, min_size=8 * 1024, max_size=256 * 1024)
13+
cdc_128k = CDCChunkerFactory(avg_size=128 * 1024, min_size=64 * 1024, max_size=1024 * 1024)
14+
cdc = cdc_32k
15+
16+
# Fixed-Size Chunking
17+
fixed_4k = FixedSizeChunkerFactory(4 * 1024)
18+
fixed_32k = FixedSizeChunkerFactory(32 * 1024)
19+
fixed_128k = FixedSizeChunkerFactory(128 * 1024)
1320

1421
@classmethod
1522
def get_for_file(cls, file_path: PathLike, file_size: int) -> Optional['ChunkMethod']:
prime_backup/types/chunker_factory.py

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
1+
import dataclasses
12
from abc import abstractmethod, ABC
23
from pathlib import Path
34
from typing import IO
45

56
from typing_extensions import override
67

7-
from prime_backup.utils.chunker import Chunker, CDCFileChunker, CDCStreamChunker, Fixed4KFileChunker, Fixed4KStreamChunker
8+
from prime_backup.utils.chunker import Chunker, CDCFileChunker, CDCStreamChunker, FixedSizeFileChunker, FixedSizeStreamChunker, CDCChunkerConfig
89

910

1011
class ChunkerFactory(ABC):
@@ -17,21 +18,33 @@ def create_stream_chunker(self, stream, need_entire_file_hash: bool) -> Chunker:
1718
...
1819

1920

21+
@dataclasses.dataclass(frozen=True)
2022
class CDCChunkerFactory(ChunkerFactory):
23+
avg_size: int
24+
min_size: int
25+
max_size: int
26+
_config: CDCChunkerConfig = dataclasses.field(init=False, repr=False, compare=False)
27+
28+
def __post_init__(self):
29+
object.__setattr__(self, '_config', CDCChunkerConfig(self.avg_size, self.min_size, self.max_size))
30+
2131
@override
2232
def create_file_chunker(self, file_path: Path, need_entire_file_hash: bool) -> Chunker:
23-
return CDCFileChunker(file_path, need_entire_file_hash)
33+
return CDCFileChunker(self._config, file_path, need_entire_file_hash)
2434

2535
@override
2636
def create_stream_chunker(self, stream: IO[bytes], need_entire_file_hash: bool) -> Chunker:
27-
return CDCStreamChunker(stream, need_entire_file_hash)
37+
return CDCStreamChunker(self._config, stream, need_entire_file_hash)
38+
2839

40+
@dataclasses.dataclass(frozen=True)
41+
class FixedSizeChunkerFactory(ChunkerFactory):
42+
chunk_size: int
2943

30-
class Fixed4KChunkerFactory(ChunkerFactory):
3144
@override
3245
def create_file_chunker(self, file_path: Path, need_entire_file_hash: bool) -> Chunker:
33-
return Fixed4KFileChunker(file_path, need_entire_file_hash)
46+
return FixedSizeFileChunker(self.chunk_size, file_path, need_entire_file_hash)
3447

3548
@override
3649
def create_stream_chunker(self, stream, need_entire_file_hash: bool) -> Chunker:
37-
return Fixed4KStreamChunker(stream, need_entire_file_hash)
50+
return FixedSizeStreamChunker(self.chunk_size, stream, need_entire_file_hash)

prime_backup/utils/chunker.py

Lines changed: 52 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66

77
from typing_extensions import override
88

9-
from prime_backup.constants import chunk_constants
109
from prime_backup.utils import misc_utils, hash_utils, chunk_utils
1110

1211
if TYPE_CHECKING:
@@ -85,71 +84,87 @@ def get_read_file_size(self) -> int:
8584

8685
# ======================== CDC Chunker ========================
8786

88-
def _create_cdc_engine() -> 'pyfastcdc.FastCDC':
89-
from pyfastcdc import FastCDC
90-
return FastCDC(
91-
avg_size=chunk_constants.CDC_AVG_SIZE,
92-
min_size=chunk_constants.CDC_MIN_SIZE,
93-
max_size=chunk_constants.CDC_MAX_SIZE,
94-
normalized_chunking=1,
95-
seed=0,
96-
)
87+
@dataclasses.dataclass(frozen=True)
88+
class CDCChunkerConfig:
89+
avg_size: int
90+
min_size: int
91+
max_size: int
9792

9893

99-
class CDCFileChunker(Chunker):
100-
def __init__(self, file_path: Path, need_entire_file_hash: bool = False):
94+
class _CDCChunker(Chunker, ABC):
95+
def __init__(self, cfg: CDCChunkerConfig, need_entire_file_hash: bool):
10196
super().__init__(need_entire_file_hash)
97+
self.cfg = cfg
98+
99+
def _create_cdc_engine(self) -> 'pyfastcdc.FastCDC':
100+
from pyfastcdc import FastCDC
101+
return FastCDC(
102+
avg_size=self.cfg.avg_size,
103+
min_size=self.cfg.min_size,
104+
max_size=self.cfg.max_size,
105+
normalized_chunking=1,
106+
seed=0,
107+
)
108+
109+
110+
class CDCFileChunker(_CDCChunker):
111+
def __init__(self, cfg: CDCChunkerConfig, file_path: Path, need_entire_file_hash: bool = False):
112+
super().__init__(cfg, need_entire_file_hash)
102113
self.file_path = file_path
103114

104115
@override
105116
def _iter_raw_chunks(self) -> Iterable[_RawChunk]:
106-
for c in _create_cdc_engine().cut_file(self.file_path):
107-
misc_utils.assert_true(c.length <= chunk_constants.CDC_MAX_SIZE, f'cdc cut chunk size too large: {c.length}')
117+
for c in self._create_cdc_engine().cut_file(self.file_path):
118+
misc_utils.assert_true(c.length <= self.cfg.max_size, f'cdc cut chunk size too large: {c.length}')
108119
yield _RawChunk(offset=c.offset, length=c.length, data=c.data)
109120

110121

111-
class CDCStreamChunker(Chunker):
112-
def __init__(self, stream: 'pyfastcdc.BinaryStreamReader', need_entire_file_hash: bool = False):
113-
super().__init__(need_entire_file_hash)
122+
class CDCStreamChunker(_CDCChunker):
123+
def __init__(self, cfg: CDCChunkerConfig, stream: 'pyfastcdc.BinaryStreamReader', need_entire_file_hash: bool = False):
124+
super().__init__(cfg, need_entire_file_hash)
114125
self.stream = stream
115126

116127
@override
117128
def _iter_raw_chunks(self) -> Iterable[_RawChunk]:
118-
for c in _create_cdc_engine().cut_stream(self.stream):
119-
misc_utils.assert_true(c.length <= chunk_constants.CDC_MAX_SIZE, f'cdc cut chunk size too large: {c.length}')
129+
for c in self._create_cdc_engine().cut_stream(self.stream):
130+
misc_utils.assert_true(c.length <= self.cfg.max_size, f'cdc cut chunk size too large: {c.length}')
120131
yield _RawChunk(offset=c.offset, length=c.length, data=c.data)
121132

122133

123-
# ======================== Fixed 4K Chunker ========================
134+
# ======================== Fixed Size Chunker ========================
135+
124136

125-
_FIXED_4K_SIZE = 4 * 1024 # 4KiB
137+
class _FixedSizeChunker(Chunker, ABC):
138+
def __init__(self, chunk_size: int, need_entire_file_hash: bool):
139+
super().__init__(need_entire_file_hash)
140+
self.chunk_size = chunk_size
126141

127-
def _cut_stream_by_fixed_4k(stream: IO[bytes]) -> Generator[_RawChunk, None, None]:
128-
offset = 0
129-
while True:
130-
buf = stream.read(_FIXED_4K_SIZE)
131-
if not buf:
132-
break
133-
yield _RawChunk(offset=offset, length=len(buf), data=memoryview(buf))
134-
offset += len(buf)
142+
def _cut_stream_by_fixed_size(self, stream: IO[bytes]) -> Generator[_RawChunk, None, None]:
143+
offset = 0
144+
while True:
145+
buf = stream.read(self.chunk_size)
146+
if not buf:
147+
break
148+
yield _RawChunk(offset=offset, length=len(buf), data=memoryview(buf))
149+
offset += len(buf)
135150

136151

137-
class Fixed4KFileChunker(Chunker):
138-
def __init__(self, file_path: Path, need_entire_file_hash: bool = False):
139-
super().__init__(need_entire_file_hash)
152+
class FixedSizeFileChunker(_FixedSizeChunker):
153+
def __init__(self, chunk_size: int, file_path: Path, need_entire_file_hash: bool = False):
154+
super().__init__(chunk_size, need_entire_file_hash)
140155
self.file_path = file_path
141156

142157
@override
143158
def _iter_raw_chunks(self) -> Iterable[_RawChunk]:
144159
with open(self.file_path, 'rb') as f:
145-
yield from _cut_stream_by_fixed_4k(f)
160+
yield from self._cut_stream_by_fixed_size(f)
146161

147162

148-
class Fixed4KStreamChunker(Chunker):
149-
def __init__(self, stream: IO[bytes], need_entire_file_hash: bool = False):
150-
super().__init__(need_entire_file_hash)
163+
class FixedSizeStreamChunker(_FixedSizeChunker):
164+
def __init__(self, chunk_size: int, stream: IO[bytes], need_entire_file_hash: bool = False):
165+
super().__init__(chunk_size, need_entire_file_hash)
151166
self.stream = stream
152167

153168
@override
154169
def _iter_raw_chunks(self) -> Iterable[_RawChunk]:
155-
yield from _cut_stream_by_fixed_4k(self.stream)
170+
yield from self._cut_stream_by_fixed_size(self.stream)

0 commit comments

Comments
 (0)