TISUnion
diff --git a/‎docs/chunking/index.md‎
Lines changed: 2 additions & 2 deletions b/‎docs/chunking/index.md‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎docs/chunking/index.zh.md‎
Lines changed: 2 additions & 2 deletions b/‎docs/chunking/index.zh.md‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎prime_backup/action/migrate_hash_method_action.py‎
Lines changed: 232 additions & 52 deletions b/‎prime_backup/action/migrate_hash_method_action.py‎
Lines changed: 232 additions & 52 deletions
diff --git a/‎prime_backup/constants/chunk_constants.py‎
Lines changed: 1 addition & 1 deletion b/‎prime_backup/constants/chunk_constants.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎prime_backup/db/schema.py‎
Lines changed: 2 additions & 2 deletions b/‎prime_backup/db/schema.py‎
Lines changed: 2 additions & 2 deletions
@@ -54,7 +54,7 @@ Prime Backup still creates one blob record for the whole file, but the blob uses
 The current implementation works in the following order:
 
 1. Cut the file into chunks using the selected algorithm
-2. Calculate a BLAKE3 hash for each chunk
+2. Calculate a hash for each chunk using `backup.hash_method`
 3. Reuse chunks that already exist in storage
 4. Compress and write only the new chunks
 5. Bind the ordered chunk list back to the whole-file blob (by offset)
@@ -80,7 +80,7 @@ so the implementation groups consecutive chunks into chunk groups and stores two
 
 This reduces metadata overhead without changing the logical model
 
-Chunk hashes and chunk group hashes always use `blake3`, while the whole-file blob hash still follows `backup.hash_method`
+Chunk hashes follow `backup.hash_method`, the same as the whole-file blob hash. Chunk group hashes always use `sha256` regardless of `backup.hash_method`
 
 ## Compression and Performance
 
 
@@ -54,7 +54,7 @@ Prime Backup 仍会为整个文件创建一条数据对象（blob）记录，但
 当前实现的大致流程如下：
 
 1. 使用所选算法对文件进行切块
-2. 为每个数据块计算 BLAKE3 哈希值
+2. 为每个数据块计算哈希值（遵循 `backup.hash_method` 设置）
 3. 检查这些数据块是否已存在，已存在的直接复用
 4. 仅压缩并写入那些新的数据块
 5. 将数据块按顺序绑定回整个文件对应的数据对象（按偏移量）
@@ -80,7 +80,7 @@ Prime Backup 仍会为整个文件创建一条数据对象（blob）记录，但
 
 这在不改变逻辑模型的前提下减少了元数据开销
 
-数据块哈希与数据块组哈希始终使用 `blake3`，不过整个文件对应的数据对象哈希仍遵循 `backup.hash_method` 设置
+数据块哈希遵循 `backup.hash_method` 设置，与整个文件对应的数据对象哈希相同。数据块组哈希始终使用 `sha256`，不受 `backup.hash_method` 影响
 
 ## 压缩与性能
 
 
@@ -1,16 +1,24 @@
+import dataclasses
 import shutil
 import time
-from typing import List, Dict, Set
+from pathlib import Path
+from typing import Callable, Dict, Generic, List, Protocol, TypeVar, cast, Tuple
 
 from typing_extensions import override
 
 from prime_backup.action import Action
 from prime_backup.compressors import Compressor
+from prime_backup.db import schema
 from prime_backup.db.access import DbAccess
 from prime_backup.db.session import DbSession
+from prime_backup.db.values import BlobStorageMethod
 from prime_backup.exceptions import PrimeBackupError
 from prime_backup.types.hash_method import HashMethod
-from prime_backup.utils import blob_utils, hash_utils, collection_utils
+from prime_backup.utils import blob_utils, chunk_utils, hash_utils
+
+_READ_BUF_SIZE = 128 * 1024
+_FILE_BLOB_HASH_BATCH_SIZE = 200
+_T = TypeVar('_T')
 
 
 class HashCollisionError(PrimeBackupError):
@@ -20,84 +28,256 @@ class HashCollisionError(PrimeBackupError):
 	pass
 
 
+class _HashObject(Protocol):
+	id: int
+	hash: str
+
+
+@dataclasses.dataclass(frozen=True)
+class _HashMove(Generic[_T]):
+	object: _T
+	old_hash: str
+	new_hash: str
+	has_file_to_move: bool = False
+
+	@property
+	def id(self) -> int:
+		return cast(_HashObject, self.object).id
+
+	@property
+	def changed(self) -> bool:
+		return self.old_hash != self.new_hash
+
+
+class _MoveJournal:
+	def __init__(self):
+		self.__moves: List[Tuple[Path, Path]] = []
+
+	def move(self, src: Path, dst: Path):
+		if dst.exists():
+			raise FileExistsError(dst)
+		shutil.move(src, dst)
+		self.__moves.append((src, dst))
+
+	def rollback(self):
+		for src, dst in reversed(self.__moves):
+			shutil.move(dst, src)
+
+	def clear(self):
+		self.__moves.clear()
+
+
+def _changed_moves(moves: List[_HashMove[_T]]) -> List[_HashMove[_T]]:
+	return [move for move in moves if move.changed]
+
+
 class MigrateHashMethodAction(Action[None]):
 	def __init__(self, new_hash_method: HashMethod):
 		super().__init__()
 		self.new_hash_method = new_hash_method
+		self.__move_journal = _MoveJournal()
 
-	def __migrate_blobs(self, session: DbSession, blob_hashes: List[str], old_hashes: Set[str], processed_hash_mapping: Dict[str, str]):
-		hash_mapping: Dict[str, str] = {}
-		blobs = list(session.get_blobs_by_hashes(blob_hashes).values())
-
-		# calc blob hashes
-		for blob in blobs:
-			blob_path = blob_utils.get_blob_path(blob.hash)
-			with Compressor.create(blob.compress).open_decompressed(blob_path) as f:
-				sah = hash_utils.calc_reader_size_and_hash(f, hash_method=self.new_hash_method)
-			hash_mapping[blob.hash] = sah.hash
-			if sah.hash in old_hashes:
-				raise HashCollisionError(sah.hash)
-
-		# update the objects
-		for blob in blobs:
-			old_hash, new_hash = blob.hash, hash_mapping[blob.hash]
-			old_path = blob_utils.get_blob_path(old_hash)
-			new_path = blob_utils.get_blob_path(new_hash)
-			try:
-				shutil.move(old_path, new_path)
-			except Exception as e:
-				self.logger.error('Move blob ({} -> {}) from {!r} to {!r} failed: {}'.format(old_hash, new_hash, old_path, new_path, e))
-				raise
+	# ==================== Checks ====================
+
+	@classmethod
+	def __ensure_hashes_can_migrate(cls, moves: List[_HashMove], object_name: str):
+		old_hash_ids: Dict[str, int] = {}
+		new_hash_ids: Dict[str, int] = {}
+		for move in moves:
+			old_hash_ids[move.old_hash] = move.id
+			if (other_id := new_hash_ids.get(move.new_hash)) is not None:
+				raise HashCollisionError('{} hash collision: {}, object ids {} and {}'.format(object_name, move.new_hash, other_id, move.id))
+			new_hash_ids[move.new_hash] = move.id
+
+		for move in moves:
+			if not move.changed:
+				continue
+			if (old_owner_id := old_hash_ids.get(move.new_hash)) is not None and old_owner_id != move.id:
+				raise HashCollisionError('{} hash conflicts with existing old hash: {}, object ids {} and {}'.format(object_name, move.new_hash, old_owner_id, move.id))
+
+	@classmethod
+	def __ensure_paths_can_migrate(cls, moves: List[_HashMove], get_path: Callable[[str], Path]):
+		for move in moves:
+			if move.changed and move.has_file_to_move and (new_hash_path := get_path(move.new_hash)).exists():
+				raise FileExistsError(new_hash_path)
+
+	# ==================== Hash Calculation ====================
+
+	def __calc_direct_blob_new_hash(self, blob: schema.Blob) -> str:
+		with Compressor.create(blob.compress).open_decompressed(blob_utils.get_blob_path(blob.hash)) as f:
+			sah = hash_utils.calc_reader_size_and_hash(f, hash_method=self.new_hash_method)
+		if sah.size != blob.raw_size:
+			raise ValueError('raw size mismatch for blob {}, expect {}, found {}'.format(blob.hash, blob.raw_size, sah.size))
+		return sah.hash
+
+	def __calc_chunked_blob_new_hash(self, session: DbSession, blob: schema.Blob) -> str:
+		hasher = self.new_hash_method.value.create_hasher()
+		size = 0
+		for offset_chunk in session.get_blob_chunks(blob.id):
+			chunk = offset_chunk.chunk
+			chunk_size = 0
+			with Compressor.create(chunk.compress).open_decompressed(chunk_utils.get_chunk_path(chunk.hash)) as f:
+				while True:
+					buf = f.read(_READ_BUF_SIZE)
+					if len(buf) == 0:
+						break
+					hasher.update(buf)
+					chunk_size += len(buf)
+			if chunk_size != chunk.raw_size:
+				raise ValueError('raw size mismatch for chunk {}, expect {}, found {}'.format(chunk.hash, chunk.raw_size, chunk_size))
+			size += chunk_size
+		if size != blob.raw_size:
+			raise ValueError('raw size mismatch for chunked blob {}, expect {}, found {}'.format(blob.hash, blob.raw_size, size))
+		return hasher.hexdigest()
 
-			processed_hash_mapping[old_hash] = new_hash
-			blob.hash = new_hash
+	def __calc_chunk_new_hash(self, chunk: schema.Chunk) -> str:
+		with Compressor.create(chunk.compress).open_decompressed_bypassed(chunk_utils.get_chunk_path(chunk.hash)) as (reader, f):
+			sah = hash_utils.calc_reader_size_and_hash(f, hash_method=self.new_hash_method)
+		if reader.get_read_len() != chunk.stored_size:
+			raise ValueError('stored size mismatch for chunk {}, expect {}, found {}'.format(chunk.hash, chunk.stored_size, reader.get_read_len()))
+		if sah.size != chunk.raw_size:
+			raise ValueError('raw size mismatch for chunk {}, expect {}, found {}'.format(chunk.hash, chunk.raw_size, sah.size))
+		return sah.hash
 
+	# ==================== Move Collection ====================
+
+	def __collect_blob_moves(self, session: DbSession) -> List[_HashMove[schema.Blob]]:
+		moves: List[_HashMove[schema.Blob]] = []
+		blobs = session.list_blobs()
+		total = len(blobs)
+		for i, blob in enumerate(blobs):
+			if blob.storage_method == BlobStorageMethod.direct.value:
+				new_hash = self.__calc_direct_blob_new_hash(blob)
+				has_file_to_move = True
+			elif blob.storage_method == BlobStorageMethod.chunked.value:
+				new_hash = self.__calc_chunked_blob_new_hash(session, blob)
+				has_file_to_move = False
+			else:
+				raise ValueError('unsupported blob storage method {}'.format(blob.storage_method))
+
+			moves.append(_HashMove(object=blob, old_hash=blob.hash, new_hash=new_hash, has_file_to_move=has_file_to_move))
+			if (i + 1) % 1000 == 0 or i + 1 == total:
+				self.logger.info('Calculated blob hashes {} / {}'.format(i + 1, total))
+
+		self.__ensure_hashes_can_migrate(moves, 'blob')
+		self.__ensure_paths_can_migrate(moves, blob_utils.get_blob_path)
+		return _changed_moves(moves)
+
+	def __collect_chunk_moves(self, session: DbSession) -> List[_HashMove[schema.Chunk]]:
+		moves: List[_HashMove[schema.Chunk]] = []
+		chunks = session.list_chunks()
+		total = len(chunks)
+		for i, chunk in enumerate(chunks):
+			new_hash = self.__calc_chunk_new_hash(chunk)
+			moves.append(_HashMove(object=chunk, old_hash=chunk.hash, new_hash=new_hash, has_file_to_move=True))
+			if (i + 1) % 2000 == 0 or i + 1 == total:
+				self.logger.info('Calculated chunk hashes {} / {}'.format(i + 1, total))
+
+		self.__ensure_hashes_can_migrate(moves, 'chunk')
+		self.__ensure_paths_can_migrate(moves, chunk_utils.get_chunk_path)
+		return _changed_moves(moves)
+
+	def __collect_chunk_group_moves(self, session: DbSession) -> List[_HashMove[schema.ChunkGroup]]:
+		moves: List[_HashMove[schema.ChunkGroup]] = []
+		chunk_groups = session.list_chunk_groups()
+		chunks_by_group_id = session.get_chunk_group_chunks_batch([chunk_group.id for chunk_group in chunk_groups])
+		total = len(chunk_groups)
+		for i, chunk_group in enumerate(chunk_groups):
+			new_hash = chunk_utils.create_chunk_group_hash(
+				offset_chunk.chunk.hash
+				for offset_chunk in chunks_by_group_id[chunk_group.id]
+			)
+			moves.append(_HashMove(object=chunk_group, old_hash=chunk_group.hash, new_hash=new_hash))
+			if (i + 1) % 5000 == 0 or i + 1 == total:
+				self.logger.info('Calculated chunk group hashes {} / {}'.format(i + 1, total))
+
+		self.__ensure_hashes_can_migrate(moves, 'chunk group')
+		return _changed_moves(moves)
+
+	# ==================== DB Updates ====================
+
+	def __update_file_blob_hashes(self, session: DbSession, moves: List[_HashMove[schema.Blob]]):
+		hash_mapping = {move.old_hash: move.new_hash for move in moves}
+		if len(hash_mapping) == 0:
+			return
 		for file in session.get_file_by_blob_hashes(list(hash_mapping.keys())):
 			if file.blob_hash is None:
 				raise AssertionError('file {!r} has no blob_hash'.format(file))
 			file.blob_hash = hash_mapping[file.blob_hash]
 
+	# ==================== Migration Steps ====================
+
+	def __move_files_to_new_hashes(self, object_name: str, moves: List[_HashMove], get_path: Callable[[str], Path]):
+		total = sum(1 for move in moves if move.has_file_to_move)
+		done = 0
+		for move in moves:
+			if move.has_file_to_move:
+				self.__move_journal.move(get_path(move.old_hash), get_path(move.new_hash))
+				done += 1
+				if done % 2000 == 0 or done == total:
+					self.logger.info('Moved {} files {} / {}'.format(object_name, done, total))
+
+	def __migrate_blob_hashes(self, session: DbSession, moves: List[_HashMove[schema.Blob]]):
+		self.__move_files_to_new_hashes('blob', moves, blob_utils.get_blob_path)
+		for offset in range(0, len(moves), _FILE_BLOB_HASH_BATCH_SIZE):
+			batch = moves[offset:offset + _FILE_BLOB_HASH_BATCH_SIZE]
+			with session.no_auto_flush():
+				for move in batch:
+					move.object.hash = move.new_hash
+				self.__update_file_blob_hashes(session, batch)
+			session.flush()
+
+	def __migrate_chunk_hashes(self, session: DbSession, moves: List[_HashMove[schema.Chunk]]):
+		self.__move_files_to_new_hashes('chunk', moves, chunk_utils.get_chunk_path)
+		for move in moves:
+			move.object.hash = move.new_hash
+		session.flush()
+
+	def __migrate_chunk_group_hashes(self, session: DbSession, moves: List[_HashMove[schema.ChunkGroup]]):
+		for move in moves:
+			move.object.hash = move.new_hash
+		session.flush()
+
+	def __rollback_files(self):
+		self.__move_journal.rollback()
+
+	# ==================== Entry Point ====================
+
 	@override
 	def run(self) -> None:
-		processed_hash_mapping: Dict[str, str] = {}  # old -> new
+		t = time.time()
+		db_committed = False
 		try:
-			t = time.time()
 			with DbAccess.open_session() as session:
 				meta = session.get_db_meta()
 				if meta.hash_method == self.new_hash_method.name:
 					self.logger.info('Hash method of the database is already {}, no need to migrate'.format(self.new_hash_method.name))
 					return
 
 				self.logger.info('Migrating hash method from {} to {}'.format(meta.hash_method, self.new_hash_method.name))
+				blob_utils.prepare_blob_directories()
+				chunk_utils.prepare_chunk_directories()
 
-				# XXX: don't load all blobs into memory?
-				total_blob_count = session.get_blob_count()
-				all_hashes = session.get_all_blob_hashes()
-				all_hash_set = set(all_hashes)
-				cnt = 0
-				for blob_hashes in collection_utils.slicing_iterate(all_hashes, 1000):
-					cnt += len(blob_hashes)
-					self.logger.info('Migrating blobs {} / {}'.format(cnt, total_blob_count))
+				self.__migrate_blob_hashes(session, self.__collect_blob_moves(session))
+				self.__migrate_chunk_hashes(session, self.__collect_chunk_moves(session))
+				self.__migrate_chunk_group_hashes(session, self.__collect_chunk_group_moves(session))
 
-					self.__migrate_blobs(session, blob_hashes, all_hash_set, processed_hash_mapping)
-					session.flush_and_expunge_all()
-
-				meta = session.get_db_meta()  # get the meta again, cuz expunge_all() was called
+				meta = session.get_db_meta()
 				meta.hash_method = self.new_hash_method.name
 
-			self.logger.info('Syncing config and variables')
-			DbAccess.sync_hash_method()
-			self.config.backup.hash_method = self.new_hash_method
-
+			db_committed = True
+			try:
+				self.logger.info('Syncing config and variables')
+				DbAccess.sync_hash_method()
+				self.config.backup.hash_method = self.new_hash_method
+				self.__move_journal.clear()
+			except Exception:
+				self.logger.fatal('DB committed but in-memory sync failed, plugin restart required')
+				raise
 			self.logger.info('Hash method migration done, cost {}s'.format(round(time.time() - t, 2)))
 
 		except Exception:
 			self.logger.warning('Error occurs during migration, applying rollback')
-			for old_hash, new_hash in processed_hash_mapping.items():
-				old_path = blob_utils.get_blob_path(old_hash)
-				new_path = blob_utils.get_blob_path(new_hash)
-				try:
-					shutil.move(new_path, old_path)
-				except Exception as e_rb:
-					self.logger.error('Rollback failed for blob {} -> {}, suppressed: {}'.format(old_path, old_path, e_rb))
+			if not db_committed:
+				self.__rollback_files()
 			raise
@@ -1,4 +1,4 @@
-HASH_METHOD = 'blake3'
+CHUNK_GROUP_HASH_METHOD = 'sha256'
 
 # 256chunk/group * (32KiB/chunk * 1.2) == ~10MiB/group  -->  ~200 groups for a 2GB file
 CHUNK_GROUP_AVG_SIZE = 256
 
@@ -53,7 +53,7 @@ class Chunk(Base):
 	__table_args__ = {'sqlite_autoincrement': True}
 
 	id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
-	hash: Mapped[str] = mapped_column(HashHex, unique=True)  # blake3
+	hash: Mapped[str] = mapped_column(HashHex, unique=True)  # configured db hash method
 	compress: Mapped[str] = mapped_column(String)
 	raw_size: Mapped[int] = mapped_column(BigInteger)
 	stored_size: Mapped[int] = mapped_column(BigInteger)
@@ -66,7 +66,7 @@ class ChunkGroup(Base):
 	__table_args__ = {'sqlite_autoincrement': True}
 
 	id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
-	hash: Mapped[str] = mapped_column(HashHex, unique=True)  # blake3 of b'\0'.join(chunk hashes)
+	hash: Mapped[str] = mapped_column(HashHex, unique=True)  # sha256 of b'\0'.join(chunk hashes)
 	chunk_count: Mapped[int] = mapped_column(Integer)
 	chunk_raw_size_sum: Mapped[int] = mapped_column(BigInteger)
 	chunk_stored_size_sum: Mapped[int] = mapped_column(BigInteger)  # sum of unique chunk stored sizes
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-HASH_METHOD = 'blake3'`
	`1`	`+CHUNK_GROUP_HASH_METHOD = 'sha256'`
`2`	`2`
`3`	`3`	`# 256chunk/group * (32KiB/chunk * 1.2) == ~10MiB/group --> ~200 groups for a 2GB file`
`4`	`4`	`CHUNK_GROUP_AVG_SIZE = 256`