Skip to content

Commit 0896a60

Browse files
authored
Merge pull request lightspeed-core#673 from max-svistunov/lcore-579-anonymize-user-id
LCORE-579 Anonymize user ID in transcripts
2 parents 9a71a42 + 436755b commit 0896a60

2 files changed

Lines changed: 17 additions & 4 deletions

File tree

src/utils/transcripts.py

Lines changed: 11 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,7 @@
99
import logging
1010
import os
1111
from pathlib import Path
12+
import hashlib
1213

1314
from configuration import configuration
1415
from models.requests import Attachment, QueryRequest
@@ -18,11 +19,17 @@
1819
logger = logging.getLogger("utils.transcripts")
1920

2021

22+
def _hash_user_id(user_id: str) -> str:
23+
"""Hash the user ID using SHA-256."""
24+
return hashlib.sha256(user_id.encode("utf-8")).hexdigest()
25+
26+
2127
def construct_transcripts_path(user_id: str, conversation_id: str) -> Path:
2228
"""Construct path to transcripts."""
2329
# these two normalizations are required by Snyk as it detects
2430
# this Path sanitization pattern
25-
uid = os.path.normpath("/" + user_id).lstrip("/")
31+
hashed_user_id = _hash_user_id(user_id)
32+
uid = os.path.normpath("/" + hashed_user_id).lstrip("/")
2633
cid = os.path.normpath("/" + conversation_id).lstrip("/")
2734
file_path = (
2835
configuration.user_data_collection_configuration.transcripts_storage or ""
@@ -59,13 +66,15 @@ def store_transcript( # pylint: disable=too-many-arguments,too-many-positional-
5966
transcripts_path = construct_transcripts_path(user_id, conversation_id)
6067
transcripts_path.mkdir(parents=True, exist_ok=True)
6168

69+
hashed_user_id = _hash_user_id(user_id)
70+
6271
data_to_store = {
6372
"metadata": {
6473
"provider": provider_id,
6574
"model": model_id,
6675
"query_provider": query_request.provider,
6776
"query_model": query_request.model,
68-
"user_id": user_id,
77+
"user_id": hashed_user_id,
6978
"conversation_id": conversation_id,
7079
"timestamp": datetime.now(UTC).isoformat(),
7180
},

tests/unit/utils/test_transcripts.py

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
11
"""Unit tests for functions defined in utils.transcripts module."""
22

3+
import hashlib
34
from configuration import AppConfig
45
from models.requests import QueryRequest
56

@@ -39,11 +40,13 @@ def test_construct_transcripts_path(mocker):
3940

4041
user_id = "user123"
4142
conversation_id = "123e4567-e89b-12d3-a456-426614174000"
43+
hashed_user_id = hashlib.sha256(user_id.encode("utf-8")).hexdigest()
4244

4345
path = construct_transcripts_path(user_id, conversation_id)
4446

4547
assert (
46-
str(path) == "/tmp/transcripts/user123/123e4567-e89b-12d3-a456-426614174000"
48+
str(path)
49+
== f"/tmp/transcripts/{hashed_user_id}/123e4567-e89b-12d3-a456-426614174000"
4750
), "Path should be constructed correctly"
4851

4952

@@ -97,14 +100,15 @@ def test_store_transcript(mocker):
97100
)
98101

99102
# Assert that the transcript was stored correctly
103+
hashed_user_id = hashlib.sha256(user_id.encode("utf-8")).hexdigest()
100104
mock_json.dump.assert_called_once_with(
101105
{
102106
"metadata": {
103107
"provider": "fake-provider",
104108
"model": "fake-model",
105109
"query_provider": query_request.provider,
106110
"query_model": query_request.model,
107-
"user_id": user_id,
111+
"user_id": hashed_user_id,
108112
"conversation_id": conversation_id,
109113
"timestamp": mocker.ANY,
110114
},

0 commit comments

Comments
 (0)