Skip to content

Commit 41a5b6c

Browse files
committed
feat: add dataset retrieval functionality and update cleaning task creation logic; enhance Dockerfile for timezone configuration
1 parent ee4e2ac commit 41a5b6c

4 files changed

Lines changed: 52 additions & 36 deletions

File tree

backend/services/data-cleaning-service/src/main/java/com/datameta/cleaning/application/httpclient/DatasetClient.java

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@ public class DatasetClient {
2828

2929
private static final String CREATE_DATASET_URL = BASE_URL + "/data-management/datasets";
3030

31+
private static final String GET_DATASET_URL = BASE_URL + "/data-management/datasets/{0}";
32+
3133
private static final String GET_DATASET_FILE_URL = BASE_URL + "/data-management/datasets/{0}/files";
3234

3335
private static final HttpClient CLIENT = HttpClient.newBuilder().connectTimeout(Duration.ofSeconds(10)).build();
@@ -61,6 +63,17 @@ public static DatasetResponse createDataset(String name, String type) {
6163
return sendAndReturn(request, DatasetResponse.class);
6264
}
6365

66+
/**
 * Looks up a single dataset from the data-management endpoint by its id.
 *
 * <p>Builds a GET request against {@code GET_DATASET_URL}, substituting {@code datasetId} for
 * the URL's path placeholder, and delegates execution to {@code sendAndReturn}.
 *
 * @param datasetId id placed into the URL path exactly as given (no URL encoding is applied here)
 * @return whatever {@code sendAndReturn} yields for this request, typed as {@link DatasetResponse}
 */
public static DatasetResponse getDataset(String datasetId) {
67+
HttpRequest request = HttpRequest.newBuilder()
68+
// NOTE(review): datasetId is interpolated without encoding; URI.create throws
// IllegalArgumentException on illegal characters — confirm ids are always URL-safe.
.uri(URI.create(MessageFormat.format(GET_DATASET_URL, datasetId)))
69+
// Per-request timeout of 30 seconds.
.timeout(Duration.ofSeconds(30))
70+
// NOTE(review): Content-Type is set although this GET carries no body; an Accept header
// may have been intended — confirm against the other requests in this client.
.header("Content-Type", "application/json")
71+
.GET()
72+
.build();
73+
74+
// NOTE(review): sendAndReturn is not visible here; presumably it sends the request and
// maps the response body to the given class — verify its error handling for non-2xx replies.
return sendAndReturn(request, DatasetResponse.class);
75+
}
76+
6477
public static PagedDatasetFileResponse getDatasetFile(String datasetId, PageRequest page) {
6578
String url = buildQueryParams(MessageFormat.format(GET_DATASET_FILE_URL, datasetId),
6679
Map.of("page", page.getPageNumber(), "size", page.getPageSize()));

backend/services/data-cleaning-service/src/main/java/com/datameta/cleaning/application/service/CleaningTaskService.java

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -64,9 +64,11 @@ public int countTasks(String status, String keywords) {
6464

6565
@Transactional
6666
public CleaningTask createTask(CreateCleaningTaskRequest request) {
67-
DatasetResponse datasetResponse = DatasetClient.createDataset(request.getDestDatasetName(),
67+
DatasetResponse destDataset = DatasetClient.createDataset(request.getDestDatasetName(),
6868
request.getDestDatasetType());
6969

70+
DatasetResponse srcDataset = DatasetClient.getDataset(request.getSrcDatasetId());
71+
7072
CleaningTask task = new CleaningTask();
7173
task.setName(request.getName());
7274
task.setDescription(request.getDescription());
@@ -75,9 +77,9 @@ public CleaningTask createTask(CreateCleaningTaskRequest request) {
7577
task.setId(taskId);
7678
task.setSrcDatasetId(request.getSrcDatasetId());
7779
task.setSrcDatasetName(request.getSrcDatasetName());
78-
task.setDestDatasetId(datasetResponse.getId());
79-
task.setDestDatasetName(datasetResponse.getName());
80-
task.setBeforeSize(datasetResponse.getTotalSize());
80+
task.setDestDatasetId(destDataset.getId());
81+
task.setDestDatasetName(destDataset.getName());
82+
task.setBeforeSize(srcDataset.getTotalSize());
8183
cleaningTaskMapper.insertTask(task);
8284

8385
List<OperatorInstancePo> instancePos = request.getInstance().stream()

scripts/db/data-cleaning-init.sql

Lines changed: 30 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -59,33 +59,33 @@ VALUES ('ac2f2582-a990-11f0-9768-00155d09c825', '空模板', '空模板'),
5959
('26ae585c-8310-4679-adc0-e53215e6e69b', 'text文本清洗模板', 'text文本清洗模板');
6060

6161
INSERT IGNORE INTO t_operator_instance(instance_id, operator_id, op_index, settings_override)
62-
VALUES ('fb6d0d76-a990-11f0-92db-00155d09c825', 'TextFormatter', 1, '{}'),
63-
('fb6d0d76-a990-11f0-92db-00155d09c825', 'FileExporter', 2, '{}'),
64-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'TextFormatter', 1, '{}'),
65-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithShortOrLongLengthFilter', 2, '{}'),
66-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithHighRepeatWordRateFilter', 3, '{}'),
67-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithHighRepeatPhraseRateFilter', 4, '{}'),
68-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithHighSpecialCharRateFilter', 5, '{}'),
69-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithManySensitiveWordsFilter', 6, '{}'),
70-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'UnicodeSpaceCleaner', 7, '{}'),
71-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'ExtraSpaceCleaner', 8, '{}'),
72-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'FullWidthCharacterCleaner', 9, '{}'),
73-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'InvisibleCharactersCleaner', 10, '{}'),
74-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'ContentCleaner', 11, '{}'),
75-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'LegendCleaner', 12, '{}'),
76-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'EmojiCleaner', 13, '{}'),
77-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'HtmlTagCleaner', 14, '{}'),
78-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'TraditionalChineseCleaner', 15, '{}'),
79-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'GrableCharactersCleaner', 16, '{}'),
80-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'XMLTagCleaner', 17, '{}'),
81-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'DuplicateSentencesFilter', 18, '{}'),
82-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'DuplicateFilesFilter', 19, '{}'),
83-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'SexualAndViolentWordCleaner', 20, '{}'),
84-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'PoliticalWordCleaner', 21, '{}'),
85-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedPhoneNumber', 22, '{}'),
86-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedCreditCardNumber', 23, '{}'),
87-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'EmailNumberCleaner', 24, '{}'),
88-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedIpAddress', 25, '{}'),
89-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedIdNumber', 26, '{}'),
90-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedUrlCleaner', 27, '{}'),
91-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileExporter', 28, '{}');
62+
VALUES ('fb6d0d76-a990-11f0-92db-00155d09c825', 'TextFormatter', 1, null),
63+
('fb6d0d76-a990-11f0-92db-00155d09c825', 'FileExporter', 2, null),
64+
('26ae585c-8310-4679-adc0-e53215e6e69b', 'TextFormatter', 1, null),
65+
('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithShortOrLongLengthFilter', 2, null),
66+
('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithHighRepeatWordRateFilter', 3, null),
67+
('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithHighRepeatPhraseRateFilter', 4, null),
68+
('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithHighSpecialCharRateFilter', 5, null),
69+
('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithManySensitiveWordsFilter', 6, null),
70+
('26ae585c-8310-4679-adc0-e53215e6e69b', 'UnicodeSpaceCleaner', 7, null),
71+
('26ae585c-8310-4679-adc0-e53215e6e69b', 'ExtraSpaceCleaner', 8, null),
72+
('26ae585c-8310-4679-adc0-e53215e6e69b', 'FullWidthCharacterCleaner', 9, null),
73+
('26ae585c-8310-4679-adc0-e53215e6e69b', 'InvisibleCharactersCleaner', 10, null),
74+
('26ae585c-8310-4679-adc0-e53215e6e69b', 'ContentCleaner', 11, null),
75+
('26ae585c-8310-4679-adc0-e53215e6e69b', 'LegendCleaner', 12, null),
76+
('26ae585c-8310-4679-adc0-e53215e6e69b', 'EmojiCleaner', 13, null),
77+
('26ae585c-8310-4679-adc0-e53215e6e69b', 'HtmlTagCleaner', 14, null),
78+
('26ae585c-8310-4679-adc0-e53215e6e69b', 'TraditionalChineseCleaner', 15, null),
79+
('26ae585c-8310-4679-adc0-e53215e6e69b', 'GrableCharactersCleaner', 16, null),
80+
('26ae585c-8310-4679-adc0-e53215e6e69b', 'XMLTagCleaner', 17, null),
81+
('26ae585c-8310-4679-adc0-e53215e6e69b', 'DuplicateSentencesFilter', 18, null),
82+
('26ae585c-8310-4679-adc0-e53215e6e69b', 'DuplicateFilesFilter', 19, null),
83+
('26ae585c-8310-4679-adc0-e53215e6e69b', 'SexualAndViolentWordCleaner', 20, null),
84+
('26ae585c-8310-4679-adc0-e53215e6e69b', 'PoliticalWordCleaner', 21, null),
85+
('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedPhoneNumber', 22, null),
86+
('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedCreditCardNumber', 23, null),
87+
('26ae585c-8310-4679-adc0-e53215e6e69b', 'EmailNumberCleaner', 24, null),
88+
('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedIpAddress', 25, null),
89+
('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedIdNumber', 26, null),
90+
('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedUrlCleaner', 27, null),
91+
('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileExporter', 28, null);

scripts/images/backend/Dockerfile

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,8 +37,9 @@ COPY editions/community/config/application.yml /opt/backend/application.yml
3737
COPY editions/community/config/log4j2.xml /opt/backend/log4j2.xml
3838
COPY scripts/images/backend/start.sh /opt/backend/start.sh
3939

40-
RUN chmod +x /opt/backend/start.sh
40+
# Make the start script executable and point /etc/localtime at the Asia/Shanghai zoneinfo file.
# NOTE(review): assumes the base image ships /usr/share/zoneinfo (tzdata) — confirm, otherwise
# the symlink will dangle.
RUN chmod +x /opt/backend/start.sh \
41+
&& ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime
4142

4243
ENTRYPOINT ["/opt/backend/start.sh"]
4344

44-
CMD ["java", "-jar", "/opt/backend/data-meta.jar"]
45+
# Default arguments handed to the ENTRYPOINT script; -Duser.timezone sets the JVM default
# time zone to Asia/Shanghai.
CMD ["java", "-Duser.timezone=Asia/Shanghai", "-jar", "/opt/backend/data-meta.jar"]

0 commit comments

Comments
 (0)