|
6 | 6 |
|
7 | 7 | from typing_extensions import override |
8 | 8 |
|
9 | | -from prime_backup.constants import chunk_constants |
10 | 9 | from prime_backup.utils import misc_utils, hash_utils, chunk_utils |
11 | 10 |
|
12 | 11 | if TYPE_CHECKING: |
@@ -85,71 +84,87 @@ def get_read_file_size(self) -> int: |
85 | 84 |
|
86 | 85 | # ======================== CDC Chunker ======================== |
87 | 86 |
|
88 | | -def _create_cdc_engine() -> 'pyfastcdc.FastCDC': |
89 | | - from pyfastcdc import FastCDC |
90 | | - return FastCDC( |
91 | | - avg_size=chunk_constants.CDC_AVG_SIZE, |
92 | | - min_size=chunk_constants.CDC_MIN_SIZE, |
93 | | - max_size=chunk_constants.CDC_MAX_SIZE, |
94 | | - normalized_chunking=1, |
95 | | - seed=0, |
96 | | - ) |
| 87 | +@dataclasses.dataclass(frozen=True) |
| 88 | +class CDCChunkerConfig: |
| 89 | + avg_size: int |
| 90 | + min_size: int |
| 91 | + max_size: int |
97 | 92 |
|
98 | 93 |
|
99 | | -class CDCFileChunker(Chunker): |
100 | | - def __init__(self, file_path: Path, need_entire_file_hash: bool = False): |
| 94 | +class _CDCChunker(Chunker, ABC): |
| 95 | + def __init__(self, cfg: CDCChunkerConfig, need_entire_file_hash: bool): |
101 | 96 | super().__init__(need_entire_file_hash) |
| 97 | + self.cfg = cfg |
| 98 | + |
| 99 | + def _create_cdc_engine(self) -> 'pyfastcdc.FastCDC': |
| 100 | + from pyfastcdc import FastCDC |
| 101 | + return FastCDC( |
| 102 | + avg_size=self.cfg.avg_size, |
| 103 | + min_size=self.cfg.min_size, |
| 104 | + max_size=self.cfg.max_size, |
| 105 | + normalized_chunking=1, |
| 106 | + seed=0, |
| 107 | + ) |
| 108 | + |
| 109 | + |
| 110 | +class CDCFileChunker(_CDCChunker): |
| 111 | + def __init__(self, cfg: CDCChunkerConfig, file_path: Path, need_entire_file_hash: bool = False): |
| 112 | + super().__init__(cfg, need_entire_file_hash) |
102 | 113 | self.file_path = file_path |
103 | 114 |
|
104 | 115 | @override |
105 | 116 | def _iter_raw_chunks(self) -> Iterable[_RawChunk]: |
106 | | - for c in _create_cdc_engine().cut_file(self.file_path): |
107 | | - misc_utils.assert_true(c.length <= chunk_constants.CDC_MAX_SIZE, f'cdc cut chunk size too large: {c.length}') |
| 117 | + for c in self._create_cdc_engine().cut_file(self.file_path): |
| 118 | + misc_utils.assert_true(c.length <= self.cfg.max_size, f'cdc cut chunk size too large: {c.length}') |
108 | 119 | yield _RawChunk(offset=c.offset, length=c.length, data=c.data) |
109 | 120 |
|
110 | 121 |
|
111 | | -class CDCStreamChunker(Chunker): |
112 | | - def __init__(self, stream: 'pyfastcdc.BinaryStreamReader', need_entire_file_hash: bool = False): |
113 | | - super().__init__(need_entire_file_hash) |
| 122 | +class CDCStreamChunker(_CDCChunker): |
| 123 | + def __init__(self, cfg: CDCChunkerConfig, stream: 'pyfastcdc.BinaryStreamReader', need_entire_file_hash: bool = False): |
| 124 | + super().__init__(cfg, need_entire_file_hash) |
114 | 125 | self.stream = stream |
115 | 126 |
|
116 | 127 | @override |
117 | 128 | def _iter_raw_chunks(self) -> Iterable[_RawChunk]: |
118 | | - for c in _create_cdc_engine().cut_stream(self.stream): |
119 | | - misc_utils.assert_true(c.length <= chunk_constants.CDC_MAX_SIZE, f'cdc cut chunk size too large: {c.length}') |
| 129 | + for c in self._create_cdc_engine().cut_stream(self.stream): |
| 130 | + misc_utils.assert_true(c.length <= self.cfg.max_size, f'cdc cut chunk size too large: {c.length}') |
120 | 131 | yield _RawChunk(offset=c.offset, length=c.length, data=c.data) |
121 | 132 |
|
122 | 133 |
|
123 | | -# ======================== Fixed 4K Chunker ======================== |
| 134 | +# ======================== Fixed Size Chunker ======================== |
| 135 | + |
124 | 136 |
|
125 | | -_FIXED_4K_SIZE = 4 * 1024 # 4KiB |
| 137 | +class _FixedSizeChunker(Chunker, ABC): |
| 138 | + def __init__(self, chunk_size: int, need_entire_file_hash: bool): |
| 139 | + super().__init__(need_entire_file_hash) |
| 140 | + self.chunk_size = chunk_size |
126 | 141 |
|
127 | | -def _cut_stream_by_fixed_4k(stream: IO[bytes]) -> Generator[_RawChunk, None, None]: |
128 | | - offset = 0 |
129 | | - while True: |
130 | | - buf = stream.read(_FIXED_4K_SIZE) |
131 | | - if not buf: |
132 | | - break |
133 | | - yield _RawChunk(offset=offset, length=len(buf), data=memoryview(buf)) |
134 | | - offset += len(buf) |
| 142 | + def _cut_stream_by_fixed_size(self, stream: IO[bytes]) -> Generator[_RawChunk, None, None]: |
| 143 | + offset = 0 |
| 144 | + while True: |
| 145 | + buf = stream.read(self.chunk_size) |
| 146 | + if not buf: |
| 147 | + break |
| 148 | + yield _RawChunk(offset=offset, length=len(buf), data=memoryview(buf)) |
| 149 | + offset += len(buf) |
135 | 150 |
|
136 | 151 |
|
137 | | -class Fixed4KFileChunker(Chunker): |
138 | | - def __init__(self, file_path: Path, need_entire_file_hash: bool = False): |
139 | | - super().__init__(need_entire_file_hash) |
| 152 | +class FixedSizeFileChunker(_FixedSizeChunker): |
| 153 | + def __init__(self, chunk_size: int, file_path: Path, need_entire_file_hash: bool = False): |
| 154 | + super().__init__(chunk_size, need_entire_file_hash) |
140 | 155 | self.file_path = file_path |
141 | 156 |
|
142 | 157 | @override |
143 | 158 | def _iter_raw_chunks(self) -> Iterable[_RawChunk]: |
144 | 159 | with open(self.file_path, 'rb') as f: |
145 | | - yield from _cut_stream_by_fixed_4k(f) |
| 160 | + yield from self._cut_stream_by_fixed_size(f) |
146 | 161 |
|
147 | 162 |
|
148 | | -class Fixed4KStreamChunker(Chunker): |
149 | | - def __init__(self, stream: IO[bytes], need_entire_file_hash: bool = False): |
150 | | - super().__init__(need_entire_file_hash) |
| 163 | +class FixedSizeStreamChunker(_FixedSizeChunker): |
| 164 | + def __init__(self, chunk_size: int, stream: IO[bytes], need_entire_file_hash: bool = False): |
| 165 | + super().__init__(chunk_size, need_entire_file_hash) |
151 | 166 | self.stream = stream |
152 | 167 |
|
153 | 168 | @override |
154 | 169 | def _iter_raw_chunks(self) -> Iterable[_RawChunk]: |
155 | | - yield from _cut_stream_by_fixed_4k(self.stream) |
| 170 | + yield from self._cut_stream_by_fixed_size(self.stream) |
0 commit comments