Skip to content

Commit 65c86f4

Browse files
committed
Merge branch 'main' into feat/clean
# Conflicts: # frontend/src/pages/DataCleansing/Detail/components/LogsTable.tsx
2 parents 2cc6591 + 638f5ba commit 65c86f4

38 files changed

Lines changed: 2915 additions & 2302 deletions

backend/services/data-management-service/src/main/java/com/datamate/datamanagement/application/DatasetApplicationService.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -285,7 +285,7 @@ public void processDataSourceAsync(String datasetId, String dataSourceId) {
285285
if (CollectionUtils.isEmpty(filePaths)) {
286286
return;
287287
}
288-
datasetFileApplicationService.copyFilesToDatasetDir(datasetId, new CopyFilesRequest(filePaths));
288+
datasetFileApplicationService.addFilesToDataset(datasetId, new AddFilesRequest(filePaths));
289289
log.info("Success file scan, total files: {}", filePaths.size());
290290
} catch (Exception e) {
291291
log.error("处理数据源文件扫描失败,数据集ID: {}, 数据源ID: {}", datasetId, dataSourceId, e);

backend/services/data-management-service/src/main/java/com/datamate/datamanagement/application/DatasetFileApplicationService.java

Lines changed: 69 additions & 85 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@
2424
import com.datamate.datamanagement.infrastructure.persistence.repository.DatasetRepository;
2525
import com.datamate.datamanagement.interfaces.converter.DatasetConverter;
2626
import com.datamate.datamanagement.interfaces.dto.AddFilesRequest;
27-
import com.datamate.datamanagement.interfaces.dto.CopyFilesRequest;
2827
import com.datamate.datamanagement.interfaces.dto.CreateDirectoryRequest;
2928
import com.datamate.datamanagement.interfaces.dto.UploadFileRequest;
3029
import com.datamate.datamanagement.interfaces.dto.UploadFilesPreRequest;
@@ -58,7 +57,6 @@
5857
import java.time.ZoneId;
5958
import java.time.format.DateTimeFormatter;
6059
import java.util.*;
61-
import java.util.concurrent.CompletableFuture;
6260
import java.util.function.Function;
6361
import java.util.stream.Collectors;
6462
import java.util.stream.Stream;
@@ -664,11 +662,9 @@ public void renameFile(String datasetId, String fileId, RenameFileRequest reques
664662
}
665663

666664
String originalFileName = file.getFileName();
667-
String baseName = originalFileName;
668665
String extension = "";
669666
int dotIndex = originalFileName.lastIndexOf('.');
670667
if (dotIndex > 0 && dotIndex < originalFileName.length() - 1) {
671-
baseName = originalFileName.substring(0, dotIndex);
672668
extension = originalFileName.substring(dotIndex); // 包含点号,如 .jpg
673669
}
674670

@@ -828,114 +824,102 @@ private void setDatasetFileId(DatasetFile datasetFile, Dataset dataset) {
828824
}
829825

830826
/**
831-
* 复制文件到数据集目录
827+
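* Add files to the dataset: creates the database records and, for each file, copies or links the source file into the dataset directory.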
832828
*
833829
* @param datasetId 数据集id
834-
* @param req 复制文件请求
835-
* @return 复制的文件列表
830+
* @param req 添加文件请求
831+
* @return 添加的文件列表
836832
*/
837833
@Transactional
838-
public List<DatasetFile> copyFilesToDatasetDir(String datasetId, CopyFilesRequest req) {
834+
public List<DatasetFile> addFilesToDataset(String datasetId, AddFilesRequest req) {
835+
if (!req.isValidPrefix()) {
836+
throw BusinessException.of(DataManagementErrorCode.DIRECTORY_NOT_FOUND);
837+
}
839838
Dataset dataset = datasetRepository.getById(datasetId);
840839
BusinessAssert.notNull(dataset, SystemErrorCode.RESOURCE_NOT_FOUND);
841-
List<DatasetFile> copiedFiles = new ArrayList<>();
840+
List<DatasetFile> addedFiles = new ArrayList<>();
842841
List<DatasetFile> existDatasetFiles = datasetFileRepository.findAllByDatasetId(datasetId);
843842
dataset.setFiles(existDatasetFiles);
844-
for (String sourceFilePath : req.sourcePaths()) {
845-
Path sourcePath = Paths.get(sourceFilePath);
846-
if (!Files.exists(sourcePath) || !Files.isRegularFile(sourcePath)) {
847-
log.warn("Source file does not exist or is not a regular file: {}", sourceFilePath);
848-
continue;
843+
try {
844+
ObjectMapper objectMapper = new ObjectMapper();
845+
846+
for (AddFilesRequest.FileRequest file : req.getFiles()) {
847+
DatasetFile datasetFile = getDatasetFileForAdd(req, file, dataset, objectMapper);
848+
setDatasetFileId(datasetFile, dataset);
849+
dataset.addFile(datasetFile);
850+
addedFiles.add(datasetFile);
851+
addFile(file.getFilePath(), datasetFile.getFilePath(), req.isSoftAdd());
849852
}
850-
String fileName = sourcePath.getFileName().toString();
851-
File sourceFile = sourcePath.toFile();
852-
LocalDateTime currentTime = LocalDateTime.now();
853-
DatasetFile datasetFile = DatasetFile.builder()
854-
.id(UUID.randomUUID().toString())
855-
.datasetId(datasetId)
856-
.fileName(fileName)
857-
.fileType(AnalyzerUtils.getExtension(fileName))
858-
.fileSize(sourceFile.length())
859-
.filePath(Paths.get(dataset.getPath(), fileName).toString())
860-
.uploadTime(currentTime)
861-
.lastAccessTime(currentTime)
862-
.build();
863-
setDatasetFileId(datasetFile, dataset);
864-
dataset.addFile(datasetFile);
865-
copiedFiles.add(datasetFile);
853+
} catch (BusinessException e) {
854+
throw e;
855+
} catch (Exception e) {
856+
log.error("Failed to add file to dataset {}", dataset.getName(), e);
857+
throw BusinessException.of(SystemErrorCode.UNKNOWN_ERROR);
866858
}
867-
datasetFileRepository.saveOrUpdateBatch(copiedFiles, 100);
859+
860+
datasetFileRepository.saveOrUpdateBatch(addedFiles, 100);
868861
dataset.active();
869862
datasetRepository.updateById(dataset);
870-
CompletableFuture.runAsync(() -> copyFilesToDatasetDir(req.sourcePaths(), dataset));
871-
return copiedFiles;
863+
return addedFiles;
872864
}
873865

874-
private void copyFilesToDatasetDir(List<String> sourcePaths, Dataset dataset) {
875-
for (String sourcePath : sourcePaths) {
876-
Path sourceFilePath = Paths.get(sourcePath);
877-
Path targetFilePath = Paths.get(dataset.getPath(), sourceFilePath.getFileName().toString());
878-
try {
879-
Files.createDirectories(Path.of(dataset.getPath()));
880-
Files.copy(sourceFilePath, targetFilePath);
881-
} catch (IOException e) {
882-
log.error("Failed to copy file from {} to {}", sourcePath, targetFilePath, e);
883-
}
866+
private void addFile(String sourPath, String targetPath, boolean softAdd) {
867+
if (StringUtils.isBlank(sourPath) || StringUtils.isBlank(targetPath)) {
868+
return;
884869
}
885-
}
870+
Path source = Paths.get(sourPath).normalize();
871+
Path target = Paths.get(targetPath).normalize();
886872

887-
/**
888-
* 添加文件到数据集(仅创建数据库记录,不执行文件系统操作)
889-
*
890-
* @param datasetId 数据集id
891-
* @param req 添加文件请求
892-
* @return 添加的文件列表
893-
*/
894-
@Transactional
895-
public List<DatasetFile> addFilesToDataset(String datasetId, AddFilesRequest req) {
896-
Dataset dataset = datasetRepository.getById(datasetId);
897-
BusinessAssert.notNull(dataset, SystemErrorCode.RESOURCE_NOT_FOUND);
898-
List<DatasetFile> addedFiles = new ArrayList<>();
899-
List<DatasetFile> existDatasetFiles = datasetFileRepository.findAllByDatasetId(datasetId);
900-
dataset.setFiles(existDatasetFiles);
873+
// 检查源文件是否存在且为普通文件
874+
if (!Files.exists(source) || !Files.isRegularFile(source)) {
875+
log.warn("Source file does not exist or is not a regular file: {}", sourPath);
876+
throw BusinessException.of(SystemErrorCode.FILE_SYSTEM_ERROR);
877+
}
901878

902-
boolean softAdd = req.softAdd();
903-
String metadata;
904879
try {
905-
Map<String, Boolean> metadataMap = Map.of("softAdd", softAdd);
906-
ObjectMapper objectMapper = new ObjectMapper();
907-
metadata = objectMapper.writeValueAsString(metadataMap);
908-
} catch (JsonProcessingException e) {
909-
log.error("Failed to serialize metadataMap", e);
910-
throw BusinessException.of(SystemErrorCode.UNKNOWN_ERROR);
880+
Path parent = target.getParent();
881+
// 创建目标目录(如果需要)
882+
if (parent != null) {
883+
Files.createDirectories(parent);
884+
}
885+
Files.deleteIfExists(target);
886+
if (softAdd) {
887+
// 优先尝试创建硬链接,失败后尝试创建符号链接;若均失败抛出异常
888+
try {
889+
Files.createLink(target, source);
890+
return;
891+
} catch (Throwable hardEx) {
892+
log.warn("create hard link failed from {} to {}: {}", source, target, hardEx.getMessage());
893+
}
894+
Files.createSymbolicLink(target, source);
895+
} else {
896+
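// Any existing target was already removed by Files.deleteIfExists above, so this plain copy (no REPLACE_EXISTING) effectively overwrites it.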
897+
Files.copy(source, target);
898+
}
899+
} catch (IOException e) {
900+
log.error("Failed to add file from {} to {}", source, target, e);
901+
throw BusinessException.of(SystemErrorCode.FILE_SYSTEM_ERROR);
911902
}
903+
}
912904

913-
for (String sourceFilePath : req.sourcePaths()) {
914-
Path sourcePath = Paths.get(sourceFilePath);
915-
String fileName = sourcePath.getFileName().toString();
916-
File sourceFile = sourcePath.toFile();
917-
LocalDateTime currentTime = LocalDateTime.now();
905+
private static DatasetFile getDatasetFileForAdd(AddFilesRequest req, AddFilesRequest.FileRequest file,
906+
Dataset dataset, ObjectMapper objectMapper) throws JsonProcessingException {
907+
Path sourcePath = Paths.get(file.getFilePath());
908+
File sourceFile = sourcePath.toFile();
909+
file.getMetadata().put("softAdd", req.isSoftAdd());
910+
LocalDateTime currentTime = LocalDateTime.now();
911+
String fileName = sourcePath.getFileName().toString();
918912

919-
DatasetFile datasetFile = DatasetFile.builder()
913+
return DatasetFile.builder()
920914
.id(UUID.randomUUID().toString())
921-
.datasetId(datasetId)
915+
.datasetId(dataset.getId())
922916
.fileName(fileName)
923917
.fileType(AnalyzerUtils.getExtension(fileName))
924918
.fileSize(sourceFile.length())
925-
.filePath(sourceFilePath)
919+
.filePath(Paths.get(dataset.getPath(), req.getPrefix(), fileName).toString())
926920
.uploadTime(currentTime)
927921
.lastAccessTime(currentTime)
928-
.metadata(metadata)
922+
.metadata(objectMapper.writeValueAsString(file.getMetadata()))
929923
.build();
930-
setDatasetFileId(datasetFile, dataset);
931-
dataset.addFile(datasetFile);
932-
addedFiles.add(datasetFile);
933-
}
934-
datasetFileRepository.saveOrUpdateBatch(addedFiles, 100);
935-
dataset.active();
936-
datasetRepository.updateById(dataset);
937-
// Note: addFilesToDataset only creates DB records, no file system operations
938-
// If file copy is needed, use copyFilesToDatasetDir endpoint instead
939-
return addedFiles;
940924
}
941925
}
Lines changed: 42 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,51 @@
11
package com.datamate.datamanagement.interfaces.dto;
22

3-
import jakarta.validation.constraints.NotEmpty;
4-
import jakarta.validation.constraints.NotNull;
3+
import lombok.AllArgsConstructor;
4+
import lombok.Getter;
5+
import lombok.NoArgsConstructor;
6+
import lombok.Setter;
7+
import org.apache.commons.collections4.CollectionUtils;
8+
import org.apache.commons.lang3.StringUtils;
59

610
import java.util.List;
11+
import java.util.Map;
712

813
/**
9-
* 添加文件请求DTO(仅创建DB记录,不执行文件系统操作)
14+
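* Request DTO for adding files to a dataset: a list of source files with per-file metadata, a soft-add flag, and a path prefix inside the dataset directory.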
1015
*
11-
* @author datamate
12-
* @since 2025-11-29
16+
* @since 2026/2/9
1317
*/
14-
public record AddFilesRequest(
15-
@NotEmpty List<String> sourcePaths,
16-
@NotNull Boolean softAdd
17-
) {
18+
@Getter
19+
@Setter
20+
@NoArgsConstructor
21+
@AllArgsConstructor
22+
public class AddFilesRequest {
23+
public AddFilesRequest(List<String> paths) {
24+
if (CollectionUtils.isEmpty(paths)) {
25+
return;
26+
}
27+
this.files = paths.stream().map(path -> {
28+
AddFilesRequest.FileRequest file = new AddFilesRequest.FileRequest();
29+
file.setFilePath(path);
30+
return file;
31+
}).toList();
32+
}
33+
34+
@Getter
35+
@Setter
36+
public static class FileRequest {
37+
private String filePath;
38+
39+
private Map<String, Object> metadata;
40+
}
41+
42+
private boolean softAdd;
43+
44+
private String prefix = "";
45+
46+
private List<FileRequest> files;
47+
48+
public boolean isValidPrefix() {
49+
return StringUtils.isEmpty(prefix) || (!prefix.startsWith("."));
50+
}
1851
}

backend/services/data-management-service/src/main/java/com/datamate/datamanagement/interfaces/rest/DatasetFileController.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@
2222
import jakarta.validation.Valid;
2323
import lombok.RequiredArgsConstructor;
2424
import lombok.extern.slf4j.Slf4j;
25-
import org.springframework.beans.factory.annotation.Autowired;
2625
import org.springframework.core.io.Resource;
2726
import org.springframework.http.HttpHeaders;
2827
import org.springframework.http.HttpStatus;
@@ -157,7 +156,8 @@ public ResponseEntity<Void> chunkUpload(@PathVariable("datasetId") String datase
157156
@PostMapping("/upload/copy")
158157
public List<DatasetFileResponse> copyFilesToDatasetDir(@PathVariable("datasetId") String datasetId,
159158
@RequestBody @Valid CopyFilesRequest req) {
160-
List<DatasetFile> datasetFiles = datasetFileApplicationService.copyFilesToDatasetDir(datasetId, req);
159+
AddFilesRequest addFilesRequest = new AddFilesRequest(req.sourcePaths());
160+
List<DatasetFile> datasetFiles = datasetFileApplicationService.addFilesToDataset(datasetId, addFilesRequest);
161161
return DatasetConverter.INSTANCE.convertToResponseList(datasetFiles);
162162
}
163163

deployment/docker/datamate/docker-compose.yml

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -201,6 +201,26 @@ services:
201201
# ==============================
202202
# Optional: Label Studio (profile: label-studio)
203203
# ==============================
204+
label-studio-pgbouncer:
205+
container_name: label-studio-pgbouncer
206+
image: pgbouncer/pgbouncer:latest
207+
restart: on-failure
208+
ports:
209+
- "6432:6432"
210+
environment:
211+
- DATABASES_HOST=datamate-database
212+
- DATABASES_PORT=5432
213+
- DATABASES_NAME=labelstudio
214+
- DATABASES_USER=postgres
215+
- DATABASES_PASSWORD=${DB_PASSWORD:-password}
216+
- POOL_MODE=transaction
217+
- MAX_CLIENT_CONN=100
218+
- DEFAULT_POOL_SIZE=20
219+
- MAX_DB_CONNECTIONS=20
220+
networks:
221+
- datamate
222+
profiles: [ label-studio ]
223+
204224
label-studio:
205225
container_name: label-studio
206226
stdin_open: true
@@ -214,14 +234,14 @@ services:
214234
ports:
215235
- "30001:8000"
216236
depends_on:
217-
- datamate-database
237+
- label-studio-pgbouncer
218238
environment:
219239
- DJANGO_DB=default
220240
- POSTGRE_NAME=labelstudio
221241
- POSTGRE_USER=postgres
222242
- POSTGRE_PASSWORD=${DB_PASSWORD:-password}
223-
- POSTGRE_PORT=5432
224-
- POSTGRE_HOST=datamate-database
243+
- POSTGRE_PORT=6432
244+
- POSTGRE_HOST=label-studio-pgbouncer
225245
- LABEL_STUDIO_HOST=${LABEL_STUDIO_HOST:-}
226246
- LOCAL_FILES_SERVING_ENABLED=true
227247
- LOCAL_FILES_DOCUMENT_ROOT=/label-studio/local

deployment/helm/label-studio/templates/deployment.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,9 +38,9 @@ spec:
3838
- name: POSTGRE_PASSWORD
3939
value: {{ .Values.env.POSTGRE_PASSWORD | quote }}
4040
- name: POSTGRE_PORT
41-
value: {{ .Values.env.POSTGRE_PORT | quote }}
41+
value: {{ if .Values.pgbouncer.enabled }}{{ "6432" | quote }}{{ else }}{{ .Values.env.POSTGRE_PORT | quote }}{{ end }}
4242
- name: POSTGRE_HOST
43-
value: {{ .Values.env.POSTGRE_HOST | quote }}
43+
value: {{ if .Values.pgbouncer.enabled }}{{ printf "%s-pgbouncer" (include "label-studio.fullname" .) | quote }}{{ else }}{{ .Values.env.POSTGRE_HOST | quote }}{{ end }}
4444
- name: LABEL_STUDIO_HOST
4545
value: {{ .Values.env.LABEL_STUDIO_HOST | quote }}
4646
- name: LOCAL_FILES_SERVING_ENABLED

0 commit comments

Comments
 (0)