|
3 | 3 | import uuid |
4 | 4 | import shutil |
5 | 5 | import os |
| 6 | +import asyncio |
6 | 7 | from typing import Optional |
7 | 8 |
|
8 | 9 | from fastapi import APIRouter, Depends, Query |
|
11 | 12 |
|
12 | 13 | from app.core.exception import ErrorCodes, BusinessError, SuccessResponse, transaction |
13 | 14 | from app.core.logging import get_logger |
14 | | -from app.db.models import Dataset |
| 15 | +from app.db.models import Dataset, DatasetFiles |
15 | 16 | from app.db.models.data_collection import CollectionTask, TaskExecution, CollectionTemplate |
16 | 17 | from app.db.session import get_db |
17 | 18 | from app.module.collection.client.datax_client import DataxClient |
|
28 | 29 | logger = get_logger(__name__) |
29 | 30 |
|
30 | 31 |
|
| 32 | +async def is_hard_link(file_path: str) -> bool: |
| 33 | + """检查文件是否是硬链接""" |
| 34 | + try: |
| 35 | + stat_info = await asyncio.to_thread(os.stat, file_path) |
| 36 | + # 如果链接数大于1,说明是硬链接 |
| 37 | + return stat_info.st_nlink > 1 |
| 38 | + except OSError: |
| 39 | + return False |
| 40 | + |
| 41 | + |
| 42 | +async def convert_hardlink_to_real_file(file_path: str) -> bool: |
| 43 | + """ |
| 44 | + 将硬链接转换为实体文件 |
| 45 | + 通过读取并重新写入文件内容,创建一个独立的副本 |
| 46 | + """ |
| 47 | + try: |
| 48 | + # 创建临时文件 |
| 49 | + temp_path = f"{file_path}.tmp" |
| 50 | + # 使用 shutil.copy2 创建副本(保留元数据) |
| 51 | + await asyncio.to_thread(shutil.copy2, file_path, temp_path) |
| 52 | + # 删除原文件(硬链接) |
| 53 | + await asyncio.to_thread(os.unlink, file_path) |
| 54 | + # 重命名临时文件为原文件名 |
| 55 | + await asyncio.to_thread(os.replace, temp_path, file_path) |
| 56 | + return True |
| 57 | + except OSError as e: |
| 58 | + logger.warning(f"Failed to convert hard link to real file {file_path}: {e}") |
| 59 | + # 清理临时文件(如果存在) |
| 60 | + if os.path.exists(f"{file_path}.tmp"): |
| 61 | + try: |
| 62 | + await asyncio.to_thread(os.remove, f"{file_path}.tmp") |
| 63 | + except OSError: |
| 64 | + pass |
| 65 | + return False |
| 66 | + |
| 67 | + |
| 68 | +async def convert_dataset_hardlinks_before_delete(task_id: str, db: AsyncSession) -> int: |
| 69 | + """ |
| 70 | + 删除归集任务前,将数据集中的硬链接文件转换为实体文件 |
| 71 | +
|
| 72 | + Args: |
| 73 | + task_id: 归集任务ID |
| 74 | + db: 数据库会话 |
| 75 | +
|
| 76 | + Returns: |
| 77 | + 转换成功的文件数量 |
| 78 | + """ |
| 79 | + try: |
| 80 | + # 查找所有数据集文件(通过文件路径匹配任务ID) |
| 81 | + # 注意:归集任务的源文件路径是 tmp/dataset/{task_id}/ |
| 82 | + # 我们需要找到数据集中所有以这个路径为源的文件 |
| 83 | + source_prefix = f"tmp/dataset/{task_id}/" |
| 84 | + |
| 85 | + # 查询所有可能相关的数据集文件 |
| 86 | + result = await db.execute( |
| 87 | + select(DatasetFiles).where( |
| 88 | + DatasetFiles.file_path.like(f"%/dataset/%"), |
| 89 | + DatasetFiles.status == "ACTIVE" |
| 90 | + ) |
| 91 | + ) |
| 92 | + dataset_files = result.scalars().all() |
| 93 | + |
| 94 | + converted_count = 0 |
| 95 | + for dataset_file in dataset_files: |
| 96 | + file_path = dataset_file.file_path |
| 97 | + if not file_path: |
| 98 | + continue |
| 99 | + |
| 100 | + # 检查文件是否是硬链接 |
| 101 | + if await is_hard_link(file_path): |
| 102 | + logger.info(f"Converting hard link to real file: {file_path}") |
| 103 | + success = await convert_hardlink_to_real_file(file_path) |
| 104 | + if success: |
| 105 | + converted_count += 1 |
| 106 | + else: |
| 107 | + logger.warning(f"Failed to convert hard link: {file_path}") |
| 108 | + |
| 109 | + if converted_count > 0: |
| 110 | + logger.info(f"Converted {converted_count} hard link(s) to real file(s) for task {task_id}") |
| 111 | + |
| 112 | + return converted_count |
| 113 | + except Exception as e: |
| 114 | + logger.error(f"Error converting hard links for task {task_id}: {e}", exc_info=True) |
| 115 | + return 0 |
| 116 | + |
| 117 | + |
31 | 118 | @router.post("", response_model=StandardResponse[CollectionTaskBase], operation_id="create_collect_task", tags=["mcp"]) |
32 | 119 | async def create_task( |
33 | 120 | request: CollectionTaskCreate, |
@@ -263,7 +350,10 @@ async def delete_collection_tasks( |
263 | 350 | # 删除任务 |
264 | 351 | await db.delete(task) |
265 | 352 |
|
266 | | - # 事务提交后,删除文件系统和调度 |
| 353 | + # 事务提交后,先转换硬链接,再删除文件系统和调度 |
| 354 | + logger.info(f"Converting hard links before deleting task {task_id}") |
| 355 | + await convert_dataset_hardlinks_before_delete(task_id, db) |
| 356 | + |
267 | 357 | remove_collection_task(task_id) |
268 | 358 |
|
269 | 359 | target_path = f"/dataset/local/{task_id}" |
|
0 commit comments