Skip to content

Commit 6b7e280

Browse files
authored
feat(text-annotation): improve manual text annotation and enable visibility in Label Studio (#235)
- Optimize manual text annotation workflow
- Make text content visible and editable within Label Studio
1 parent a9587b9 commit 6b7e280

5 files changed

Lines changed: 43 additions & 23 deletions

File tree

frontend/src/components/business/DatasetFileTransfer.tsx

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ const DatasetFileTransfer: React.FC<DatasetFileTransferProps> = ({
5151
selectedFilesMap,
5252
onSelectedFilesChange,
5353
onDatasetSelect,
54-
datasetTypeFilter = DatasetType.TEXT,
54+
datasetTypeFilter,
5555
...props
5656
}) => {
5757
const [datasets, setDatasets] = React.useState<Dataset[]>([]);
@@ -85,6 +85,7 @@ const DatasetFileTransfer: React.FC<DatasetFileTransferProps> = ({
8585
page: datasetPagination.current,
8686
size: datasetPagination.pageSize,
8787
keyword: datasetSearch,
88+
// 仅在显式传入过滤类型时才按类型过滤;否则后端返回所有类型
8889
type: datasetTypeFilter,
8990
});
9091
setDatasets(data.content.map(mapDataset) || []);

frontend/src/pages/DataAnnotation/Create/components/CreateAnnotationTaskDialog.tsx

Lines changed: 6 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -183,16 +183,8 @@ export default function CreateAnnotationTask({
183183
// 手动标注也支持跨数据集、精确到文件的选择
184184
const selectedFiles = Object.values(selectedFilesMap) as any[];
185185

186-
const imageExtensions = [".jpg", ".jpeg", ".png", ".bmp", ".gif", ".tiff", ".webp"];
187-
const imageFileIds = selectedFiles
188-
.filter((file) => {
189-
const ext = file.fileName?.toLowerCase().match(/\.[^.]+$/)?.[0] || "";
190-
return imageExtensions.includes(ext);
191-
})
192-
.map((file) => file.id);
193-
194-
if (imageFileIds.length === 0) {
195-
message?.error?.("请至少选择一个图像文件");
186+
if (selectedFiles.length === 0) {
187+
message?.error?.("请至少选择一个文件");
196188
setSubmitting(false);
197189
return;
198190
}
@@ -213,7 +205,7 @@ export default function CreateAnnotationTask({
213205
description: values.description,
214206
datasetId: effectiveDatasetId,
215207
templateId: values.templateId,
216-
fileIds: imageFileIds,
208+
fileIds: selectedFiles.map((file) => file.id),
217209
};
218210

219211
await createAnnotationTaskUsingPost(requestData);
@@ -326,8 +318,8 @@ export default function CreateAnnotationTask({
326318
label: "手动标注",
327319
children: (
328320
<Form form={manualForm} layout="vertical">
329-
{/* 选择数据集和图像文件(支持多数据集、多文件) */}
330-
<Form.Item label="选择数据集和图像文件" required>
321+
{/* 选择数据集和文件(支持多数据集、多文件) */}
322+
<Form.Item label="选择数据集和文件" required>
331323
<DatasetFileTransfer
332324
open
333325
selectedFilesMap={selectedFilesMap}
@@ -346,12 +338,11 @@ export default function CreateAnnotationTask({
346338
manualForm.setFieldsValue({ name: defaultName });
347339
}
348340
}}
349-
datasetTypeFilter={DatasetType.IMAGE}
350341
/>
351342
{selectedDataset && (
352343
<div className="mt-2 p-2 bg-blue-50 rounded border border-blue-200 text-xs">
353344
当前数据集:<span className="font-medium">{selectedDataset.name}</span> - 已选择
354-
<span className="font-medium text-blue-600"> {imageFileCount} </span>个图像文件
345+
<span className="font-medium text-blue-600"> {Object.keys(selectedFilesMap).length} </span>个文件
355346
</div>
356347
)}
357348
</Form.Item>

frontend/src/pages/KnowledgeBase/components/AddDataDialog.tsx

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -273,6 +273,7 @@ export default function AddDataDialog({ knowledgeBase, onDataAdded }) {
273273
open={open}
274274
selectedFilesMap={selectedFilesMap}
275275
onSelectedFilesChange={setSelectedFilesMap}
276+
datasetTypeFilter={DatasetType.TEXT}
276277
/>
277278
)}
278279

frontend/src/pages/SynthesisTask/CreateTask.tsx

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import { useEffect, useState } from "react";
22
import type { Dataset, DatasetFile } from "@/pages/DataManagement/dataset.model";
3+
import { DatasetType } from "@/pages/DataManagement/dataset.model";
34
import { Steps, Card, Select, Input, Button, Form, message, Tag, Tooltip, InputNumber } from "antd";
45
import { Eye, ArrowLeft, ArrowRight, Play, Search, Sparkles, Brain, Layers } from "lucide-react";
56
import { Link, useNavigate } from "react-router";
@@ -304,7 +305,7 @@ export default function SynthesisTaskCreate() {
304305
<DatasetFileTransfer open selectedFilesMap={selectedMap} onSelectedFilesChange={setSelectedMap} onDatasetSelect={(dataset) => {
305306
setSelectedDataset(dataset);
306307
form.setFieldsValue({ sourceDataset: dataset?.id ?? "" });
307-
}} />
308+
}} datasetTypeFilter={DatasetType.TEXT} />
308309
{selectedDataset && (
309310
<div className="mt-4 p-3 bg-gray-50 rounded border text-xs text-gray-600">
310311
当前数据集:<span className="font-medium text-gray-900">{selectedDataset.name}</span>

runtime/datamate-python/app/module/annotation/service/sync.py

Lines changed: 32 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
from typing import Optional, List, Dict, Any, Tuple, Set
2+
import os
3+
24
from app.module.dataset import DatasetManagementService
35
from sqlalchemy import update, select
46
from app.db.models import DatasetFiles
@@ -52,14 +54,38 @@ def _determine_data_type(self, file_type: str) -> str:
5254
def _build_task_data(self, file_info: Any, dataset_id: str) -> dict:
5355
"""构建Label Studio任务数据"""
5456
data_type = self._determine_data_type(file_info.fileType)
55-
56-
# 替换文件路径前缀
57-
file_path = file_info.filePath.removeprefix(settings.dm_file_path_prefix)
58-
file_path = settings.label_studio_file_path_prefix + file_path
59-
57+
58+
# 默认仍然走 Label Studio 本地文件 URL
59+
# 先替换文件路径前缀,构造 /data/local-files/?d=/... 形式
60+
relative_path = file_info.filePath.removeprefix(settings.dm_file_path_prefix)
61+
ls_file_url = settings.label_studio_file_path_prefix + relative_path
62+
63+
data_value: Any = ls_file_url
64+
65+
# 对于纯文本文件(例如 .txt),支持直接把文件内容写入到 data.text,
66+
# 这样在 Label Studio 里会直接显示文本内容,而不是 URL。
67+
if data_type == "text":
68+
try:
69+
_, ext = os.path.splitext(file_info.filePath)
70+
ext = ext.lower()
71+
72+
# 目前只对 .txt 做内联,其他如 pdf/doc 仍然使用 URL
73+
if ext == ".txt":
74+
with open(file_info.filePath, "r", encoding="utf-8", errors="ignore") as f:
75+
content = f.read()
76+
if content:
77+
data_value = content
78+
except Exception as e:
79+
# 读取失败时退回到原来的 URL 形式,避免中断同步流程
80+
logger.warning(
81+
"Failed to inline text content for file %s: %s",
82+
getattr(file_info, "filePath", "<unknown>"),
83+
str(e),
84+
)
85+
6086
return {
6187
"data": {
62-
f"{data_type}": file_path,
88+
f"{data_type}": data_value,
6389
"file_path": file_info.filePath,
6490
"file_id": file_info.id,
6591
"original_name": file_info.originalName,

0 commit comments

Comments
 (0)