Skip to content

Commit a827da8

Browse files
committed
Strip encrypted thinking signatures and unknown-block JSON dumps from output
The parser used to render any unrecognized content block by JSON-dumping it whole. This leaked extended-thinking 'signature' fields — multi-kilobyte base64 blobs that are encrypted attestation data, useless to humans, and noisy to the point of making exported sessions unreadable.

- thinking blocks with an empty body are skipped entirely
- thinking blocks with visible reasoning text are preserved and wrapped in italic <thinking> markers so they're distinguishable from the assistant reply
- redacted_thinking blocks are dropped (encrypted-only)
- image blocks render as [image] (no base64)
- unknown future block types render as a short [type] placeholder, not a JSON dump

Adds 5 tests covering the noise-stripping behaviour. 55/55 pass.
1 parent 9d927a5 commit a827da8

2 files changed

Lines changed: 99 additions & 12 deletions

File tree

claude_backup/parser.py

Lines changed: 31 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -137,25 +137,44 @@ def extract_session_title(jsonl_path: Path) -> str:
137137

138138

139139
def _normalize_content(content) -> str:
140-
"""Flatten content into a string. Handles list-of-blocks shape too."""
140+
"""Flatten Claude content blocks into plain text.
141+
142+
Handles: plain strings, text blocks, tool_use, tool_result, and image blocks.
143+
Skips empty extended-thinking blocks (which contain only encrypted signatures —
144+
huge base64 blobs that aren't useful to humans). Non-empty thinking text is
145+
preserved in italics.
146+
"""
141147
if isinstance(content, str):
142148
return content
143149
if isinstance(content, list):
144150
parts: list[str] = []
145151
for block in content:
146152
if isinstance(block, str):
147153
parts.append(block)
148-
elif isinstance(block, dict):
149-
if "text" in block and isinstance(block["text"], str):
150-
parts.append(block["text"])
151-
elif block.get("type") == "tool_use":
152-
name = block.get("name", "tool")
153-
parts.append(f"[tool_use: {name}]")
154-
elif block.get("type") == "tool_result":
155-
inner = block.get("content", "")
156-
parts.append(_normalize_content(inner))
157-
else:
158-
parts.append(json.dumps(block, ensure_ascii=False))
154+
continue
155+
if not isinstance(block, dict):
156+
continue
157+
block_type = block.get("type")
158+
if block_type == "text" and isinstance(block.get("text"), str):
159+
parts.append(block["text"])
160+
elif "text" in block and isinstance(block["text"], str) and not block_type:
161+
parts.append(block["text"])
162+
elif block_type == "tool_use":
163+
name = block.get("name", "tool")
164+
parts.append(f"[tool_use: {name}]")
165+
elif block_type == "tool_result":
166+
inner = block.get("content", "")
167+
parts.append(_normalize_content(inner))
168+
elif block_type == "thinking":
169+
thought = block.get("thinking", "")
170+
if isinstance(thought, str) and thought.strip():
171+
parts.append(f"_<thinking>_\n{thought}\n_</thinking>_")
172+
elif block_type == "redacted_thinking":
173+
continue
174+
elif block_type == "image":
175+
parts.append("[image]")
176+
elif block_type:
177+
parts.append(f"[{block_type}]")
159178
return "\n".join(p for p in parts if p)
160179
if content is None:
161180
return ""

tests/test_parser.py

Lines changed: 68 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -106,3 +106,71 @@ def test_session_summary(fake_project_path: Path) -> None:
106106
def test_message_dataclass_default_raw() -> None:
107107
m = Message(role="user", content="x")
108108
assert m.raw == {}
109+
110+
111+
def test_thinking_block_with_empty_body_is_skipped(tmp_path: Path) -> None:
112+
"""Encrypted thinking signatures must never leak into the output."""
113+
f = tmp_path / "x.jsonl"
114+
f.write_text(
115+
'{"role":"assistant","content":['
116+
'{"type":"thinking","thinking":"","signature":"EpsMSECRET" }'
117+
',{"type":"text","text":"hello"}'
118+
']}\n'
119+
)
120+
messages = parse_session(f)
121+
assert messages[0].content == "hello"
122+
assert "EpsM" not in messages[0].content
123+
assert "signature" not in messages[0].content
124+
125+
126+
def test_thinking_block_with_visible_text_is_preserved(tmp_path: Path) -> None:
127+
f = tmp_path / "x.jsonl"
128+
f.write_text(
129+
'{"role":"assistant","content":['
130+
'{"type":"thinking","thinking":"weighing tradeoffs","signature":"sig"}'
131+
',{"type":"text","text":"answer"}'
132+
']}\n'
133+
)
134+
messages = parse_session(f)
135+
assert "weighing tradeoffs" in messages[0].content
136+
assert "answer" in messages[0].content
137+
assert "sig" not in messages[0].content
138+
139+
140+
def test_redacted_thinking_block_is_skipped(tmp_path: Path) -> None:
141+
f = tmp_path / "x.jsonl"
142+
f.write_text(
143+
'{"role":"assistant","content":['
144+
'{"type":"redacted_thinking","data":"opaque-encrypted-blob"}'
145+
',{"type":"text","text":"visible"}'
146+
']}\n'
147+
)
148+
messages = parse_session(f)
149+
assert messages[0].content == "visible"
150+
assert "opaque" not in messages[0].content
151+
152+
153+
def test_image_block_renders_placeholder(tmp_path: Path) -> None:
154+
f = tmp_path / "x.jsonl"
155+
f.write_text(
156+
'{"role":"user","content":['
157+
'{"type":"image","source":{"type":"base64","data":"BASE64HUGE"}}'
158+
',{"type":"text","text":"check this"}'
159+
']}\n'
160+
)
161+
messages = parse_session(f)
162+
assert "[image]" in messages[0].content
163+
assert "check this" in messages[0].content
164+
assert "BASE64" not in messages[0].content
165+
166+
167+
def test_unknown_block_type_renders_short_placeholder(tmp_path: Path) -> None:
168+
f = tmp_path / "x.jsonl"
169+
f.write_text(
170+
'{"role":"assistant","content":['
171+
'{"type":"some_future_block","payload":{"big":"data"}}'
172+
']}\n'
173+
)
174+
messages = parse_session(f)
175+
assert messages[0].content == "[some_future_block]"
176+
assert "payload" not in messages[0].content

0 commit comments

Comments
 (0)