Skip to content

Commit 7710856

Browse files
committed
Merge branch 'main' into dev
2 parents 477af37 + 2138dc8 commit 7710856

70 files changed

Lines changed: 5223 additions & 1240 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

.github/workflows/docker-images-reusable.yml

Lines changed: 31 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,15 @@ jobs:
3939
elif [[ $GITHUB_REF == refs/heads/main ]]; then
4040
echo "TAGS=$BASE_IMAGE:amd64" >> $GITHUB_OUTPUT
4141
else
42-
echo "TAGS=$BASE_IMAGE:amd64-temp" >> $GITHUB_OUTPUT
42+
# 使用分支名作为 tag
43+
if [[ $GITHUB_REF == refs/heads/* ]]; then
44+
BRANCH_NAME=${GITHUB_REF#refs/heads/}
45+
# 清理分支名,将特殊字符替换为连字符
46+
BRANCH_TAG=$(echo "$BRANCH_NAME" | sed 's/[^a-zA-Z0-9._-]/-/g')
47+
echo "TAGS=$BASE_IMAGE:amd64-$BRANCH_TAG" >> $GITHUB_OUTPUT
48+
else
49+
echo "TAGS=$BASE_IMAGE:amd64-temp" >> $GITHUB_OUTPUT
50+
fi
4351
fi
4452
4553
- name: Build Docker Image
@@ -80,7 +88,15 @@ jobs:
8088
elif [[ $GITHUB_REF == refs/heads/main ]]; then
8189
echo "TAGS=$BASE_IMAGE:arm64" >> $GITHUB_OUTPUT
8290
else
83-
echo "TAGS=$BASE_IMAGE:arm64-temp" >> $GITHUB_OUTPUT
91+
# 使用分支名作为 tag
92+
if [[ $GITHUB_REF == refs/heads/* ]]; then
93+
BRANCH_NAME=${GITHUB_REF#refs/heads/}
94+
# 清理分支名,将特殊字符替换为连字符
95+
BRANCH_TAG=$(echo "$BRANCH_NAME" | sed 's/[^a-zA-Z0-9._-]/-/g')
96+
echo "TAGS=$BASE_IMAGE:arm64-$BRANCH_TAG" >> $GITHUB_OUTPUT
97+
else
98+
echo "TAGS=$BASE_IMAGE:arm64-temp" >> $GITHUB_OUTPUT
99+
fi
84100
fi
85101
86102
- name: Build Docker Image
@@ -120,9 +136,19 @@ jobs:
120136
echo "ARM_TAGS=$BASE_IMAGE:arm64" >> $GITHUB_OUTPUT
121137
echo "AMD_TAGS=$BASE_IMAGE:amd64" >> $GITHUB_OUTPUT
122138
else
123-
echo "TAGS=$BASE_IMAGE:temp" >> $GITHUB_OUTPUT
124-
echo "ARM_TAGS=$BASE_IMAGE:arm64-temp" >> $GITHUB_OUTPUT
125-
echo "AMD_TAGS=$BASE_IMAGE:amd64-temp" >> $GITHUB_OUTPUT
139+
# 使用分支名作为 tag
140+
if [[ $GITHUB_REF == refs/heads/* ]]; then
141+
BRANCH_NAME=${GITHUB_REF#refs/heads/}
142+
# 清理分支名,将特殊字符替换为连字符
143+
BRANCH_TAG=$(echo "$BRANCH_NAME" | sed 's/[^a-zA-Z0-9._-]/-/g')
144+
echo "TAGS=$BASE_IMAGE:$BRANCH_TAG" >> $GITHUB_OUTPUT
145+
echo "ARM_TAGS=$BASE_IMAGE:arm64-$BRANCH_TAG" >> $GITHUB_OUTPUT
146+
echo "AMD_TAGS=$BASE_IMAGE:amd64-$BRANCH_TAG" >> $GITHUB_OUTPUT
147+
else
148+
echo "TAGS=$BASE_IMAGE:temp" >> $GITHUB_OUTPUT
149+
echo "ARM_TAGS=$BASE_IMAGE:arm64-temp" >> $GITHUB_OUTPUT
150+
echo "AMD_TAGS=$BASE_IMAGE:amd64-temp" >> $GITHUB_OUTPUT
151+
fi
126152
fi
127153
128154
- name: Manifest Docker Image

SECURITY.md

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
# Security Policy
2+
3+
## Supported Versions
4+
5+
Use this section to tell people about which versions of your project are
6+
currently being supported with security updates.
7+
8+
| Version | Supported |
9+
| ------- | ------------------ |
10+
| 5.1.x | :white_check_mark: |
11+
| 5.0.x | :x: |
12+
| 4.0.x | :white_check_mark: |
13+
| < 4.0 | :x: |
14+
15+
## Reporting a Vulnerability
16+
17+
Use this section to tell people how to report a vulnerability.
18+
19+
Tell them where to go, how often they can expect to get an update on a
20+
reported vulnerability, what to expect if the vulnerability is accepted or
21+
declined, etc.

backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/application/CleaningTaskService.java

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,11 @@
1010
import com.datamate.cleaning.domain.repository.OperatorInstanceRepository;
1111
import com.datamate.cleaning.infrastructure.validator.CleanTaskValidator;
1212
import com.datamate.cleaning.interfaces.dto.*;
13+
import com.datamate.common.domain.enums.EdgeType;
14+
import com.datamate.common.domain.enums.NodeType;
15+
import com.datamate.common.domain.model.LineageEdge;
16+
import com.datamate.common.domain.model.LineageNode;
17+
import com.datamate.common.domain.service.LineageService;
1318
import com.datamate.common.infrastructure.exception.BusinessException;
1419
import com.datamate.common.infrastructure.exception.SystemErrorCode;
1520
import com.datamate.common.interfaces.PagedResponse;
@@ -73,6 +78,8 @@ public class CleaningTaskService {
7378

7479
private final CleanTaskValidator cleanTaskValidator;
7580

81+
private final LineageService lineageService;
82+
7683
private final String DATASET_PATH = "/dataset";
7784

7885
private final String FLOW_PATH = "/flow";
@@ -134,6 +141,8 @@ public CleaningTaskDto createTask(CreateCleaningTaskRequest request) {
134141
task.setBeforeSize(srcDataset.getSizeBytes());
135142
task.setFileCount(srcDataset.getFileCount().intValue());
136143
cleaningTaskRepo.insertTask(task);
144+
// 记录血缘关系
145+
addCleaningToGraph(srcDataset, task, destDataset);
137146

138147
operatorInstanceRepo.insertInstance(taskId, request.getInstance());
139148
operatorRepo.incrementUsageCount(request.getInstance().stream()
@@ -146,6 +155,30 @@ public CleaningTaskDto createTask(CreateCleaningTaskRequest request) {
146155
return task;
147156
}
148157

158+
private void addCleaningToGraph(Dataset srcDataset, CleaningTaskDto task, Dataset destDataset) {
159+
LineageNode fromNode = new LineageNode();
160+
fromNode.setId(srcDataset.getId());
161+
fromNode.setName(srcDataset.getName());
162+
fromNode.setDescription(srcDataset.getDescription());
163+
fromNode.setNodeType(NodeType.DATASET);
164+
165+
LineageNode toNode = new LineageNode();
166+
toNode.setId(destDataset.getId());
167+
toNode.setName(destDataset.getName());
168+
toNode.setDescription(destDataset.getDescription());
169+
toNode.setNodeType(NodeType.DATASET);
170+
171+
LineageEdge edge = new LineageEdge();
172+
edge.setProcessId(task.getId());
173+
edge.setName(task.getName());
174+
edge.setDescription(task.getDescription());
175+
edge.setEdgeType(EdgeType.DATA_CLEANING);
176+
edge.setFromNodeId(fromNode.getId());
177+
edge.setToNodeId(toNode.getId());
178+
179+
lineageService.generateGraph(fromNode, edge, toNode);
180+
}
181+
149182
public CleaningTaskDto getTask(String taskId) {
150183
CleaningTaskDto task = cleaningTaskRepo.findTaskById(taskId);
151184
setProcess(task);

backend/services/data-management-service/src/main/java/com/datamate/datamanagement/application/DatasetApplicationService.java

Lines changed: 48 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,11 @@
22

33
import com.baomidou.mybatisplus.core.metadata.IPage;
44
import com.baomidou.mybatisplus.extension.plugins.pagination.Page;
5+
import com.datamate.common.domain.enums.EdgeType;
6+
import com.datamate.common.domain.enums.NodeType;
7+
import com.datamate.common.domain.model.LineageEdge;
8+
import com.datamate.common.domain.model.LineageNode;
9+
import com.datamate.common.domain.service.LineageService;
510
import com.datamate.common.domain.utils.ChunksSaver;
611
import com.datamate.common.setting.application.SysParamApplicationService;
712
import com.datamate.datamanagement.interfaces.dto.*;
@@ -17,7 +22,6 @@
1722
import com.datamate.datamanagement.infrastructure.persistence.repository.DatasetFileRepository;
1823
import com.datamate.datamanagement.infrastructure.persistence.repository.DatasetRepository;
1924
import com.datamate.datamanagement.interfaces.converter.DatasetConverter;
20-
import com.datamate.datamanagement.interfaces.dto.*;
2125
import com.fasterxml.jackson.core.JsonProcessingException;
2226
import com.fasterxml.jackson.databind.ObjectMapper;
2327
import com.fasterxml.jackson.databind.SerializationFeature;
@@ -54,6 +58,7 @@ public class DatasetApplicationService {
5458
private final CollectionTaskClient collectionTaskClient;
5559
private final DatasetFileApplicationService datasetFileApplicationService;
5660
private final SysParamApplicationService sysParamService;
61+
private final LineageService lineageService;
5762

5863
@Value("${datamate.data-management.base-path:/dataset}")
5964
private String datasetBasePath;
@@ -72,6 +77,8 @@ public Dataset createDataset(CreateDatasetRequest createDatasetRequest) {
7277
dataset.setTags(processTagNames(createDatasetRequest.getTags()));
7378
}
7479
datasetRepository.save(dataset);
80+
// 记录血缘关系
81+
addDatasetToGraph(dataset, null);
7582

7683
//todo 需要解耦这块逻辑
7784
if (StringUtils.hasText(createDatasetRequest.getDataSource())) {
@@ -81,6 +88,43 @@ public Dataset createDataset(CreateDatasetRequest createDatasetRequest) {
8188
return dataset;
8289
}
8390

91+
private void addDatasetToGraph(Dataset dataset, CollectionTaskDetailResponse collection) {
92+
LineageNode datasetNode = new LineageNode();
93+
datasetNode.setId(dataset.getId());
94+
datasetNode.setNodeType(NodeType.DATASET);
95+
datasetNode.setName(dataset.getName());
96+
datasetNode.setDescription(dataset.getDescription());
97+
98+
LineageNode collectionNode = null;
99+
LineageEdge collectionEdge = null;
100+
if(Objects.nonNull(collection)) {
101+
collectionNode = new LineageNode();
102+
collectionNode.setId(collection.getId());
103+
collectionNode.setName(collection.getName());
104+
collectionNode.setDescription(collection.getDescription());
105+
collectionNode.setNodeType(NodeType.DATASOURCE);
106+
107+
collectionEdge = new LineageEdge();
108+
collectionEdge.setProcessId(collection.getId());
109+
collectionEdge.setName(collection.getName());
110+
collectionEdge.setEdgeType(EdgeType.DATA_COLLECTION);
111+
collectionEdge.setDescription(dataset.getDescription());
112+
collectionEdge.setFromNodeId(collectionNode.getId());
113+
collectionEdge.setToNodeId(datasetNode.getId());
114+
}
115+
lineageService.generateGraph(collectionNode, collectionEdge, datasetNode);
116+
}
117+
118+
public DatasetLineage getDatasetLineage(String datasetId) {
119+
Dataset dataset = datasetRepository.getById(datasetId);
120+
if (Objects.isNull(dataset)) {
121+
return new DatasetLineage();
122+
}
123+
LineageNode datasetNode = lineageService.getNodeById(datasetId);
124+
String graphId = datasetNode.getGraphId();
125+
return new DatasetLineage(lineageService.getNodesByGraphId(graphId), lineageService.getEdgesByGraphId(graphId));
126+
}
127+
84128
public String getDatasetPvcName() {
85129
return sysParamService.getParamByKey(DATASET_PVC_NAME);
86130
}
@@ -100,11 +144,11 @@ public Dataset updateDataset(String datasetId, UpdateDatasetRequest updateDatase
100144
if (Objects.nonNull(updateDatasetRequest.getStatus())) {
101145
dataset.setStatus(updateDatasetRequest.getStatus());
102146
}
147+
datasetRepository.updateById(dataset);
103148
if (StringUtils.hasText(updateDatasetRequest.getDataSource())) {
104149
// 数据源id不为空,使用异步线程进行文件扫盘落库
105150
processDataSourceAsync(dataset.getId(), updateDatasetRequest.getDataSource());
106151
}
107-
datasetRepository.updateById(dataset);
108152
return dataset;
109153
}
110154

@@ -261,6 +305,8 @@ private List<String> getFilePaths(String dataSourceId, Dataset dataset) {
261305
log.warn("Fail to get collection task detail, task ID: {}", dataSourceId);
262306
return Collections.emptyList();
263307
}
308+
// 记录血缘关系
309+
addDatasetToGraph(dataset, taskDetail);
264310
Path targetPath = Paths.get(taskDetail.getTargetPath());
265311
if (!Files.exists(targetPath) || !Files.isDirectory(targetPath)) {
266312
log.warn("Target path not exists or is not a directory: {}", taskDetail.getTargetPath());

backend/services/data-management-service/src/main/java/com/datamate/datamanagement/application/DatasetFileApplicationService.java

Lines changed: 38 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -255,33 +255,47 @@ public Resource downloadFile(String datasetId, String fileId) {
255255
}
256256

257257
/**
258-
* 下载文件
258+
* 下载数据集所有文件为 ZIP
259259
*/
260260
@Transactional(readOnly = true)
261261
public void downloadDatasetFileAsZip(String datasetId, HttpServletResponse response) {
262262
Dataset dataset = datasetRepository.getById(datasetId);
263263
if (Objects.isNull(dataset)) {
264264
throw BusinessException.of(DataManagementErrorCode.DATASET_NOT_FOUND);
265265
}
266-
List<DatasetFile> allByDatasetId = datasetFileRepository.findAllByDatasetId(datasetId);
267-
Set<String> filePaths = allByDatasetId.stream().map(DatasetFile::getFilePath).collect(Collectors.toSet());
268266
String datasetPath = dataset.getPath();
269-
Path downloadPath = Path.of(datasetPath);
267+
Path downloadPath = Paths.get(datasetPath).normalize();
268+
269+
// 检查路径是否存在
270+
if (!Files.exists(downloadPath) || !Files.isDirectory(downloadPath)) {
271+
throw BusinessException.of(DataManagementErrorCode.DATASET_NOT_FOUND);
272+
}
273+
270274
response.setContentType("application/zip");
271-
String zipName = String.format("dataset_%s.zip",
275+
String zipName = String.format("dataset_%s_%s.zip",
276+
dataset.getName() != null ? dataset.getName().replaceAll("[^a-zA-Z0-9_-]", "_") : "dataset",
272277
LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyyMMddHHmmss")));
273-
response.setHeader(HttpHeaders.CONTENT_DISPOSITION, "attachment; filename=" + zipName);
278+
response.setHeader(HttpHeaders.CONTENT_DISPOSITION, "attachment; filename=\"" + zipName + "\"");
279+
274280
try (ZipArchiveOutputStream zos = new ZipArchiveOutputStream(response.getOutputStream())) {
275281
try (Stream<Path> pathStream = Files.walk(downloadPath)) {
276-
List<Path> allPaths = pathStream.filter(path -> path.toString().startsWith(datasetPath))
277-
.filter(path -> filePaths.stream().anyMatch(filePath -> filePath.startsWith(path.toString())))
278-
.toList();
279-
for (Path path : allPaths) {
280-
addToZipFile(path, downloadPath, zos);
281-
}
282+
pathStream
283+
.filter(path -> {
284+
// 确保路径在数据集目录内,防止路径遍历攻击
285+
Path normalized = path.normalize();
286+
return normalized.startsWith(downloadPath);
287+
})
288+
.forEach(path -> {
289+
try {
290+
addToZipFile(path, downloadPath, zos);
291+
} catch (IOException e) {
292+
log.error("Failed to add file to zip: {}", path, e);
293+
}
294+
});
282295
}
296+
zos.finish();
283297
} catch (IOException e) {
284-
log.error("Failed to download files in batches.", e);
298+
log.error("Failed to download dataset files as zip for dataset {}", datasetId, e);
285299
throw BusinessException.of(SystemErrorCode.FILE_SYSTEM_ERROR);
286300
}
287301
}
@@ -402,13 +416,23 @@ private void addFileToDataset(String datasetId, List<FileUploadResult> unpacked)
402416
for (FileUploadResult file : unpacked) {
403417
File savedFile = file.getSavedFile();
404418
LocalDateTime currentTime = LocalDateTime.now();
419+
// 统一 fileName:无论是否通过文件夹/压缩包上传,都只保留纯文件名
420+
String originalFileName = file.getFileName();
421+
String baseFileName = originalFileName;
422+
if (originalFileName != null) {
423+
String normalized = originalFileName.replace("\\", "/");
424+
int lastSlash = normalized.lastIndexOf('/');
425+
if (lastSlash >= 0 && lastSlash + 1 < normalized.length()) {
426+
baseFileName = normalized.substring(lastSlash + 1);
427+
}
428+
}
405429
DatasetFile datasetFile = DatasetFile.builder()
406430
.id(UUID.randomUUID().toString())
407431
.datasetId(datasetId)
408432
.fileSize(savedFile.length())
409433
.uploadTime(currentTime)
410434
.lastAccessTime(currentTime)
411-
.fileName(file.getFileName())
435+
.fileName(baseFileName)
412436
.filePath(savedFile.getPath())
413437
.fileType(AnalyzerUtils.getExtension(file.getFileName()))
414438
.build();
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
package com.datamate.datamanagement.interfaces.dto;

import com.datamate.common.domain.model.LineageEdge;
import com.datamate.common.domain.model.LineageNode;

import java.util.List;

/**
 * Lineage view of a dataset: the nodes and edges of the lineage graph the
 * dataset belongs to.
 *
 * @since 2026/1/23
 */
public class DatasetLineage {
    // Nodes of the lineage graph.
    private List<LineageNode> lineageNodes;

    // Edges of the lineage graph.
    private List<LineageEdge> lineageEdges;

    public DatasetLineage() {
    }

    public DatasetLineage(List<LineageNode> lineageNodes, List<LineageEdge> lineageEdges) {
        this.lineageNodes = lineageNodes;
        this.lineageEdges = lineageEdges;
    }

    public List<LineageNode> getLineageNodes() {
        return lineageNodes;
    }

    public void setLineageNodes(List<LineageNode> lineageNodes) {
        this.lineageNodes = lineageNodes;
    }

    public List<LineageEdge> getLineageEdges() {
        return lineageEdges;
    }

    public void setLineageEdges(List<LineageEdge> lineageEdges) {
        this.lineageEdges = lineageEdges;
    }
}

backend/services/data-management-service/src/main/java/com/datamate/datamanagement/interfaces/rest/DatasetController.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,11 @@ public DatasetResponse updateDataset(@PathVariable("datasetId") String datasetId
8080
return DatasetConverter.INSTANCE.convertToResponse(dataset);
8181
}
8282

83+
@GetMapping("/{datasetId}/lineage")
84+
public DatasetLineage getDatasetLineage(@PathVariable("datasetId") String datasetId) {
85+
return datasetApplicationService.getDatasetLineage(datasetId);
86+
}
87+
8388
/**
8489
* 根据ID删除数据集
8590
*
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
package com.datamate.common.domain.enums;
2+
3+
/**
4+
* 边类型:DATA_COLLECTION/DATA_CLEANING/DATA_LABELING/DATA_SYNTHESIS/DATA_RATIO
5+
*
6+
* @since 2026/1/23
7+
*/
8+
public enum EdgeType {
9+
DATA_COLLECTION,
10+
DATA_CLEANING,
11+
DATA_LABELING,
12+
DATA_SYNTHESIS,
13+
DATA_RATIO
14+
}

0 commit comments

Comments
 (0)