Skip to content

Commit 7e26aaa

Browse files
committed
🔒 mask sensitive data
1 parent 93b659f commit 7e26aaa

5 files changed

Lines changed: 197 additions & 59 deletions

File tree

.gitignore

Lines changed: 8 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -191,4 +191,11 @@ Thumbs.db
191191
# Milvus
192192
**/volumes/
193193

194-
**/rag_storage/
194+
**/rag_storage/
195+
196+
# AI tools/agent
197+
.agents/
198+
.claude/
199+
data/
200+
openspec/
201+
devspace.yaml
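runtime/datamate-python/app/core/security.py (path inferred from the `app.core.security` imports in the files below)

Lines changed: 156 additions & 0 deletions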
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,156 @@
1+
"""Security utilities for handling sensitive data"""
2+
import re
3+
from typing import Any, Dict
4+
5+
6+
# Field names whose values are treated as secrets. Matching against these
# names is case-insensitive and substring-based for dictionary keys (see
# _is_sensitive_key) and exact/case-sensitive for free text (see
# mask_sensitive_info).
SENSITIVE_FIELDS = [
    "secretKey",
    "accessKey",
    "password",
    "passwd",
    "pwd",
    "secret",
    "token",
    "apiKey",
    "api_key",
    "access_key",
    "secret_key",
]

# Placeholder that replaces every masked value.
MASK_PATTERN = "**********"

# Regex fragments for a quoted value. The first alternative understands
# backslash escapes, so an escaped quote inside the value does not end the
# match early and leak the tail of the secret. The second alternative is the
# plain "anything up to the next quote" form, kept as a fallback so malformed
# or truncated text is still masked.
_DOUBLE_QUOTED_VALUE = r'"(?:(?:[^"\\]|\\.)*|[^"]*)"'
_SINGLE_QUOTED_VALUE = r"'(?:(?:[^'\\]|\\.)*|[^']*)'"


def _is_sensitive_key(key: Any) -> bool:
    """Return True when *key* is a string containing a sensitive field name.

    The comparison is case-insensitive and substring-based, so ``"dbPassword"``
    and ``"ACCESS_KEY"`` both match. Non-string keys are never sensitive.
    """
    if not isinstance(key, str):
        return False
    key_lower = key.lower()
    return any(field.lower() in key_lower for field in SENSITIVE_FIELDS)


def mask_sensitive_value(value: str) -> str:
    """Return the mask placeholder for a sensitive value.

    Args:
        value: The sensitive value; it is not inspected.

    Returns:
        ``MASK_PATTERN``.
    """
    return MASK_PATTERN


def is_masked_value(value: str) -> bool:
    """Return True when *value* is exactly the mask placeholder."""
    return value == MASK_PATTERN


def _preserve_item(new_item: Any, original_item: Any) -> Any:
    """Restore masked secrets inside one nested value (dict, list or scalar).

    Dictionaries are handled by ``preserve_sensitive_values``. Lists are
    processed pairwise, but only when the original is also a list of the same
    length; otherwise positions cannot be matched and the new value is
    returned untouched. Everything else is returned as-is.
    """
    if isinstance(new_item, dict) and isinstance(original_item, dict):
        return preserve_sensitive_values(new_item, original_item)
    if (
        isinstance(new_item, list)
        and isinstance(original_item, list)
        and len(new_item) == len(original_item)
    ):
        return [
            _preserve_item(new_elem, orig_elem)
            for new_elem, orig_elem in zip(new_item, original_item)
        ]
    return new_item


def preserve_sensitive_values(
    new_config: Dict[str, Any],
    original_config: Dict[str, Any]
) -> Dict[str, Any]:
    """
    Preserve original sensitive values if masked pattern is detected in update request.

    When a client receives masked values and sends them back unchanged in an
    update request, this function detects the placeholder and substitutes the
    value from the original config.

    Args:
        new_config: Config from update request (may contain masked values)
        original_config: Original config (contains real values)

    Returns:
        A new dict with original sensitive values restored. If either argument
        is not a dict, ``new_config`` is returned unchanged.
    """
    if not isinstance(new_config, dict) or not isinstance(original_config, dict):
        return new_config

    preserved_config: Dict[str, Any] = {}
    for key, new_value in new_config.items():
        if (
            _is_sensitive_key(key)
            and isinstance(new_value, str)
            and is_masked_value(new_value)
        ):
            original_value = original_config.get(key)
            # Compare against None rather than truthiness: an original empty
            # string is a real value and must not be replaced by the
            # placeholder. With no original at all, the submitted value is
            # kept because there is nothing to restore.
            preserved_config[key] = (
                new_value if original_value is None else original_value
            )
        else:
            preserved_config[key] = _preserve_item(
                new_value, original_config.get(key)
            )

    return preserved_config


def _mask_item(item: Any) -> Any:
    """Mask secrets inside one nested value (dict, list or scalar)."""
    if isinstance(item, dict):
        return mask_sensitive_dict(item)
    if isinstance(item, list):
        return [_mask_item(element) for element in item]
    return item


def mask_sensitive_dict(data: Dict[str, Any]) -> Dict[str, Any]:
    """
    Recursively mask sensitive fields in a dictionary.

    Only string values stored under a sensitive key are replaced; nested
    dictionaries and lists (at any depth) are traversed. The input is not
    modified.

    Args:
        data: Dictionary that may contain sensitive fields

    Returns:
        A new dictionary with sensitive values masked. A non-dict argument is
        returned unchanged.
    """
    if not isinstance(data, dict):
        return data

    masked_data: Dict[str, Any] = {}
    for key, value in data.items():
        if _is_sensitive_key(key) and isinstance(value, str):
            masked_data[key] = mask_sensitive_value(value)
        else:
            masked_data[key] = _mask_item(value)

    return masked_data


def mask_sensitive_info(text: str) -> str:
    """
    Mask sensitive information in text by replacing values with **********

    Three textual shapes are recognised for every name in
    ``SENSITIVE_FIELDS`` (exact, case-sensitive name match):
    ``"field": "value"``, ``field=value`` and ``'field': 'value'``.

    Args:
        text: Original text that may contain sensitive information

    Returns:
        Text with sensitive values masked. Empty or ``None`` input is returned
        unchanged.
    """
    if not text:
        return text

    masked_text = text
    for field in SENSITIVE_FIELDS:
        # Escape the name so it is always matched literally.
        name = re.escape(field)
        rules = (
            # JSON format: "field": "value"
            (
                rf'"{name}"\s*:\s*' + _DOUBLE_QUOTED_VALUE,
                f'"{field}": "{MASK_PATTERN}"',
            ),
            # Key-value format: field=value
            (
                rf'{name}\s*=\s*[^\s,\]]+',
                f'{field}={MASK_PATTERN}',
            ),
            # Quoted format: 'field': 'value'
            (
                rf"'{name}'\s*:\s*" + _SINGLE_QUOTED_VALUE,
                f"'{field}': '{MASK_PATTERN}'",
            ),
        )
        for pattern, replacement in rules:
            masked_text = re.sub(pattern, replacement, masked_text)

    return masked_text

runtime/datamate-python/app/module/collection/client/datax_client.py

Lines changed: 1 addition & 51 deletions
Original file line number | Diff line number | Diff line change
@@ -1,68 +1,18 @@
11
import json
2-
import re
32
import threading
43
import subprocess
54
from datetime import datetime
65
from pathlib import Path
76
from typing import Dict, Any
87

98
from app.core.logging import get_logger
9+
from app.core.security import mask_sensitive_info
1010
from app.db.models.data_collection import CollectionTask, TaskExecution, CollectionTemplate
1111
from app.module.collection.schema.collection import CollectionConfig, SyncMode
1212
from app.module.shared.schema import TaskStatus
1313

1414
logger = get_logger(__name__)
1515

16-
# Sensitive fields that need to be masked in logs
17-
SENSITIVE_FIELDS = [
18-
"secretKey",
19-
"accessKey",
20-
"password",
21-
"passwd",
22-
"pwd",
23-
"secret",
24-
"token",
25-
"apiKey",
26-
"api_key",
27-
]
28-
29-
MASK_PATTERN = "**********"
30-
31-
def mask_sensitive_info(text: str) -> str:
32-
"""
33-
Mask sensitive information in text by replacing values with **********
34-
35-
Args:
36-
text: Original text that may contain sensitive information
37-
38-
Returns:
39-
Text with sensitive values masked
40-
"""
41-
masked_text = text
42-
43-
for field in SENSITIVE_FIELDS:
44-
# Match patterns like: "secretKey": "actual_value" or secretKey=actual_value
45-
patterns = [
46-
# JSON format: "field": "value"
47-
rf'"{field}"\s*:\s*"[^"]*"',
48-
rf'"{field}"\s*:\s*"[^"]*"',
49-
# Key-value format: field=value
50-
rf'{field}\s*=\s*[^\s,\]]+',
51-
# Quoted format: 'field': 'value'
52-
rf"'{field}'\s*:\s*'[^']*'",
53-
]
54-
55-
for pattern in patterns:
56-
# Replace the value part while keeping the field name
57-
if '"' in pattern:
58-
masked_text = re.sub(pattern, f'"{field}": "{MASK_PATTERN}"', masked_text)
59-
elif "'" in pattern:
60-
masked_text = re.sub(pattern, f"'{field}': '{MASK_PATTERN}'", masked_text)
61-
else:
62-
masked_text = re.sub(pattern, f'{field}={MASK_PATTERN}', masked_text)
63-
64-
return masked_text
65-
6616
class DataxClient:
6717
def __init__(self, task: CollectionTask, execution: TaskExecution, template: CollectionTemplate):
6818
self.execution = execution

runtime/datamate-python/app/module/collection/interface/collection.py

Lines changed: 20 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -12,11 +12,12 @@
1212

1313
from app.core.exception import ErrorCodes, BusinessError, SuccessResponse, transaction
1414
from app.core.logging import get_logger
15+
from app.core.security import preserve_sensitive_values
1516
from app.db.models import Dataset, DatasetFiles
1617
from app.db.models.data_collection import CollectionTask, TaskExecution, CollectionTemplate
1718
from app.db.session import get_db
1819
from app.module.collection.client.datax_client import DataxClient
19-
from app.module.collection.schema.collection import CollectionTaskBase, CollectionTaskCreate, CollectionTaskUpdate, converter_to_response, \
20+
from app.module.collection.schema.collection import CollectionTaskBase, CollectionTaskCreate, CollectionTaskUpdate, CollectionConfig, converter_to_response, \
2021
convert_for_create, SyncMode
2122
from app.module.collection.schedule import schedule_collection_task, remove_collection_task
2223
from app.module.collection.service.collection import CollectionTaskService
@@ -300,12 +301,27 @@ async def update_task(
300301
reschedule_collection_task(task_id, task.schedule_expression)
301302

302303
if 'config' in update_data:
303-
# 重新生成任务配置文件
304+
# Get original config from database
305+
original_config = json.loads(task.config) if task.config else {}
306+
307+
# Preserve sensitive values if masked pattern detected
308+
preserved_config = preserve_sensitive_values(
309+
request.config.dict(),
310+
original_config
311+
)
312+
313+
# Regenerate task config file with preserved sensitive values
304314
template = await db.execute(select(CollectionTemplate).where(CollectionTemplate.id == task.template_id))
305315
template = template.scalar_one_or_none()
306316
if template:
307-
DataxClient.generate_datx_config(request.config, template, task.target_path)
308-
task.config = json.dumps(request.config.dict())
317+
# Use preserved config to generate DataX config
318+
DataxClient.generate_datx_config(
319+
CollectionConfig(**preserved_config),
320+
template,
321+
task.target_path
322+
)
323+
# Save preserved config to database
324+
task.config = json.dumps(preserved_config)
309325

310326
# 如果任务处于 FAILED 状态,修改后重置为 PENDING,允许重新执行
311327
if task.status == TaskStatus.FAILED.name:

runtime/datamate-python/app/module/collection/schema/collection.py

Lines changed: 12 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -2,11 +2,12 @@
22
import uuid
33
from datetime import datetime
44
from enum import Enum
5-
from typing import Optional
5+
from typing import Optional, Dict, Any
66

7-
from pydantic import BaseModel, Field, validator, ConfigDict, field_validator
7+
from pydantic import BaseModel, Field, validator, ConfigDict, field_validator, field_serializer
88
from pydantic.alias_generators import to_camel
99

10+
from app.core.security import mask_sensitive_dict
1011
from app.db.models.data_collection import CollectionTask, TaskExecution, CollectionTemplate
1112
from app.module.dataset.schema import DatasetTypeResponse
1213
from app.module.dataset.schema.dataset import DatasetType
@@ -23,6 +24,13 @@ class CollectionConfig(BaseModel):
2324
writer: Optional[dict] = Field(None, description="writer参数")
2425
job: Optional[dict] = Field(None, description="任务配置")
2526

27+
@field_serializer('parameter', 'reader', 'writer', 'job', when_used='json')
def mask_sensitive_fields(self, value: Optional[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
    """Serializer for the parameter/reader/writer/job fields, registered for JSON output.

    A ``None`` field is passed through; any other value is returned with its
    sensitive entries masked by ``mask_sensitive_dict``.
    """
    return None if value is None else mask_sensitive_dict(value)
33+
2634
class CollectionTaskBase(BaseModel):
2735
id: str = Field(..., description="任务id")
2836
name: str = Field(..., description="任务名称")
@@ -93,6 +101,7 @@ def validate_timeout(cls, v):
93101
)
94102

95103
def converter_to_response(task: CollectionTask) -> CollectionTaskBase:
104+
config_dict = json.loads(task.config)
96105
return CollectionTaskBase(
97106
id=task.id,
98107
name=task.name,
@@ -101,7 +110,7 @@ def converter_to_response(task: CollectionTask) -> CollectionTaskBase:
101110
template_id=task.template_id,
102111
template_name=task.template_name,
103112
target_path=task.target_path,
104-
config=json.loads(task.config),
113+
config=CollectionConfig(**config_dict),
105114
schedule_expression=task.schedule_expression,
106115
status=task.status,
107116
retry_count=task.retry_count,

0 commit comments

Comments
 (0)