|
8 | 8 | from sqlalchemy import select |
9 | 9 | from sqlalchemy.ext.asyncio import AsyncSession |
10 | 10 |
|
| 11 | +from app.db.models.base_entity import LineageNode, LineageEdge |
11 | 12 | from app.db.models.data_synthesis import ( |
12 | 13 | DataSynthInstance, |
13 | 14 | DataSynthesisFileInstance, |
14 | 15 | DataSynthesisChunkInstance, |
15 | 16 | SynthesisData, |
16 | 17 | ) |
17 | | -from app.db.models.dataset_management import DatasetFiles |
| 18 | +from app.db.models.dataset_management import DatasetFiles, Dataset |
18 | 19 | from app.db.session import logger |
19 | 20 | from app.module.generation.schema.generation import Config, SyntheConfig |
20 | 21 | from app.module.generation.service.prompt import ( |
|
26 | 27 | from app.module.shared.util.model_chat import extract_json_substring |
27 | 28 | from app.module.shared.llm import LLMFactory |
28 | 29 | from app.module.system.service.common_service import get_model_by_id |
| 30 | +from app.module.shared.common.lineage import LineageService |
| 31 | +from app.module.shared.schema import NodeType, EdgeType |
29 | 32 |
|
30 | 33 |
|
31 | 34 | def _filter_docs(split_docs, chunk_size): |
@@ -657,3 +660,52 @@ async def _increment_processed_chunks(self, file_task_id: str, delta: int) -> No |
657 | 660 | file_task.processed_chunks = new_value |
658 | 661 | await self.db.commit() |
659 | 662 | await self.db.refresh(file_task) |
| 663 | + |
| 664 | + async def add_synthesis_to_graph(self, db: AsyncSession, task_id: str, dest_dataset_id: str) -> None: |
| 665 | + """记录数据合成血缘关系:源数据集 -> 合成数据集 via DATA_SYNTHESIS""" |
| 666 | + try: |
| 667 | + # 获取任务和目标数据集信息 |
| 668 | + task = await self.db.get(DataSynthInstance, task_id) |
| 669 | + src_dataset_result = await db.execute( |
| 670 | + select(DatasetFiles.dataset_id) |
| 671 | + .join(DataSynthesisFileInstance, DatasetFiles.id == DataSynthesisFileInstance.source_file_id) |
| 672 | + .where(DataSynthesisFileInstance.synthesis_instance_id == task_id) |
| 673 | + .limit(1) |
| 674 | + ) |
| 675 | + src_dataset_id = src_dataset_result.scalar_one_or_none() |
| 676 | + src_dataset = await self.db.get(Dataset, src_dataset_id) |
| 677 | + dst_dataset = await self.db.get(Dataset, dest_dataset_id) |
| 678 | + |
| 679 | + if not task or not dst_dataset: |
| 680 | + logger.warning("Missing task or destination dataset for lineage graph") |
| 681 | + return |
| 682 | + |
| 683 | + src_node = LineageNode( |
| 684 | + id=src_dataset.id, |
| 685 | + node_type=NodeType.DATASET.value, |
| 686 | + name=src_dataset.name, |
| 687 | + description=src_dataset.description |
| 688 | + ) |
| 689 | + dest_node = LineageNode( |
| 690 | + id=dst_dataset.id, |
| 691 | + node_type=NodeType.DATASET.value, |
| 692 | + name=dst_dataset.name, |
| 693 | + description=dst_dataset.description |
| 694 | + ) |
| 695 | + synthesis_edge = LineageEdge( |
| 696 | + process_id=task_id, |
| 697 | + name=task.name, |
| 698 | + edge_type=EdgeType.DATA_SYNTHESIS.value, |
| 699 | + description=task.description, |
| 700 | + from_node_id=src_node.id, |
| 701 | + to_node_id=dst_dataset.id |
| 702 | + ) |
| 703 | + |
| 704 | + # 生成血缘图 |
| 705 | + lineage_service = LineageService(db=db) |
| 706 | + await lineage_service.generate_graph(src_node, synthesis_edge, dest_node) |
| 707 | + await self.db.commit() |
| 708 | + |
| 709 | + logger.info(f"Added synthesis lineage: {src_node.name} -> {dest_dataset.name}") |
| 710 | + except Exception as exc: |
| 711 | + logger.error(f"Failed to add synthesis lineage: {exc}") |
0 commit comments