forked from yusufkaraaslan/Skill_Seekers
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtest_llms_txt_downloader.py
More file actions
296 lines (208 loc) · 9.92 KB
/
test_llms_txt_downloader.py
File metadata and controls
296 lines (208 loc) · 9.92 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
from unittest.mock import Mock, patch
import requests
from skill_seekers.cli.llms_txt_downloader import LlmsTxtDownloader
def test_successful_download():
    """A long, markdown-looking response body is handed back by download() as a string."""
    markdown_body = (
        "# Header\n\nSome content with markdown patterns.\n\n## Subheader\n\n- List item\n- Another item\n\n```python\ncode_block()\n```\n"
        + "x" * 200
    )
    response = Mock()
    response.text = markdown_body
    response.raise_for_status = Mock()  # no-op: simulates a successful HTTP status

    downloader = LlmsTxtDownloader("https://example.com/llms.txt")
    with patch("requests.get", return_value=response) as fake_get:
        result = downloader.download()

    assert result is not None
    assert len(result) > 100
    assert isinstance(result, str)
    assert "# Header" in result
    # A successful first attempt must not trigger any further requests.
    fake_get.assert_called_once()
def test_timeout_with_retry():
    """When every attempt times out, download() retries up to max_retries and returns None."""
    downloader = LlmsTxtDownloader("https://example.com/llms.txt", max_retries=2)
    timeout_error = requests.Timeout("Connection timeout")

    # time.sleep is patched so the pause between attempts costs no real time.
    with (
        patch("requests.get", side_effect=timeout_error) as fake_get,
        patch("time.sleep") as fake_sleep,
    ):
        result = downloader.download()

    assert result is None
    assert fake_get.call_count == 2  # initial attempt plus one retry
    assert fake_sleep.call_count == 1  # exactly one pause between the two attempts
def test_empty_content_rejection():
    """A response body shorter than 100 characters is discarded (download() returns None)."""
    too_short = Mock()
    too_short.text = "# Short"
    too_short.raise_for_status = Mock()

    downloader = LlmsTxtDownloader("https://example.com/llms.txt")
    with patch("requests.get", return_value=too_short):
        result = downloader.download()

    assert result is None
def test_non_markdown_rejection():
    """A long body with no markdown constructs is discarded (download() returns None)."""
    plain_response = Mock()
    # Repeated so the body is long; only the missing markdown patterns can cause rejection.
    plain_response.text = "Plain text without any markdown patterns at all. " * 10
    plain_response.raise_for_status = Mock()

    downloader = LlmsTxtDownloader("https://example.com/llms.txt")
    with patch("requests.get", return_value=plain_response):
        result = downloader.download()

    assert result is None
def test_http_error_handling():
    """An HTTP error raised by raise_for_status() is retried and finally yields None."""
    failing_response = Mock()
    failing_response.raise_for_status.side_effect = requests.HTTPError("404 Not Found")

    downloader = LlmsTxtDownloader("https://example.com/llms.txt", max_retries=2)
    with (
        patch("requests.get", return_value=failing_response) as fake_get,
        patch("time.sleep"),  # skip the real delay between attempts
    ):
        result = downloader.download()

    assert result is None
    assert fake_get.call_count == 2  # initial attempt plus one retry
def test_exponential_backoff():
    """Test that exponential backoff delays are correct and issued in order.

    With max_retries=3 and every request timing out, there are two pauses
    between the three attempts. The delays must grow (1s, then 2s), so the
    full ordered sequence of sleep calls is checked rather than mere presence
    of each value.
    """
    downloader = LlmsTxtDownloader("https://example.com/llms.txt", max_retries=3)
    with (
        patch("requests.get", side_effect=requests.Timeout("Connection timeout")),
        patch("time.sleep") as mock_sleep,
    ):
        content = downloader.download()

    assert content is None
    # Should sleep with delays: 1s, 2s (2^0, 2^1) -- in exactly that order.
    # Comparing the positional args of each recorded call pins both the count
    # and the ordering; assert_any_call alone would accept 2s followed by 1s.
    delays = [recorded.args for recorded in mock_sleep.call_args_list]
    assert delays == [(1,), (2,)]
def test_markdown_validation():
    """_is_markdown() accepts common markdown constructs and rejects plain prose."""
    downloader = LlmsTxtDownloader("https://example.com/llms.txt")

    # One representative snippet per markdown construct the detector should accept.
    markdown_samples = (
        "# Header",
        "## Subheader",
        "```code```",
        "- list item",
        "* bullet point",
        "`inline code`",
    )
    for sample in markdown_samples:
        assert downloader._is_markdown(sample), sample

    # Text without any markdown construct must be rejected.
    assert not downloader._is_markdown("Plain text without any markdown patterns")
def test_custom_timeout():
    """The timeout given to the constructor is forwarded to requests.get as a keyword."""
    response = Mock()
    response.text = "# Header\n\nContent " * 50
    response.raise_for_status = Mock()

    downloader = LlmsTxtDownloader("https://example.com/llms.txt", timeout=10)
    with patch("requests.get", return_value=response) as fake_get:
        result = downloader.download()

    assert result is not None
    # call_args unpacks to (positional args, keyword args); only the keywords matter here.
    _positional, keywords = fake_get.call_args
    assert keywords["timeout"] == 10
def test_custom_max_retries():
    """With max_retries=5 and every request timing out, exactly five attempts are made."""
    downloader = LlmsTxtDownloader("https://example.com/llms.txt", max_retries=5)
    timeout_error = requests.Timeout("Connection timeout")

    with (
        patch("requests.get", side_effect=timeout_error) as fake_get,
        patch("time.sleep"),  # skip the real delay between attempts
    ):
        result = downloader.download()

    assert result is None
    assert fake_get.call_count == 5  # one call per allowed attempt
def test_user_agent_header():
    """download() sends the project-specific User-Agent header with its request."""
    response = Mock()
    response.text = "# Header\n\nContent " * 50
    response.raise_for_status = Mock()

    downloader = LlmsTxtDownloader("https://example.com/llms.txt")
    with patch("requests.get", return_value=response) as fake_get:
        result = downloader.download()

    assert result is not None
    # call_args unpacks to (positional args, keyword args); headers travel as a keyword.
    _positional, keywords = fake_get.call_args
    sent_headers = keywords["headers"]
    assert sent_headers["User-Agent"] == "Skill-Seekers-llms.txt-Reader/1.0"
def test_get_proper_filename():
    """A URL ending in llms-full.txt maps to the filename llms-full.md."""
    result = LlmsTxtDownloader("https://hono.dev/llms-full.txt").get_proper_filename()

    assert result == "llms-full.md"
    assert not result.endswith(".txt")
def test_get_proper_filename_standard():
    """A URL ending in llms.txt (the standard variant) maps to llms.md."""
    result = LlmsTxtDownloader("https://hono.dev/llms.txt").get_proper_filename()

    assert result == "llms.md"
def test_get_proper_filename_small():
    """A URL ending in llms-small.txt (the small variant) maps to llms-small.md."""
    result = LlmsTxtDownloader("https://hono.dev/llms-small.txt").get_proper_filename()

    assert result == "llms-small.md"
def test_is_markdown_rejects_html_doctype():
    """Documents opening with an HTML DOCTYPE are not markdown, whatever the letter case."""
    downloader = LlmsTxtDownloader("https://example.com/llms.txt")

    lowercase_doc = (
        "<!DOCTYPE html><html><head><title>Product Page</title></head><body>Content</body></html>"
    )
    # Same kind of document with upper-case tags, to exercise case-insensitive detection.
    uppercase_doc = "<!DOCTYPE HTML><HTML><BODY>Content</BODY></HTML>"

    assert not downloader._is_markdown(lowercase_doc)
    assert not downloader._is_markdown(uppercase_doc)
def test_is_markdown_rejects_html_tag():
    """Content starting with an <html> tag is not markdown, even if the document is truncated."""
    downloader = LlmsTxtDownloader("https://example.com/llms.txt")

    complete_doc = '<html><head><meta charset="utf-8"></head><body>Content</body></html>'
    # Only the opening tags are present; there is no closing </html>.
    truncated_doc = "<html><head>Some content"

    assert not downloader._is_markdown(complete_doc)
    assert not downloader._is_markdown(truncated_doc)
def test_is_markdown_rejects_html_meta():
    """Fragments containing <head> or <meta> tags are not markdown."""
    downloader = LlmsTxtDownloader("https://example.com/llms.txt")

    head_fragment = "<head><title>Page</title></head><body>Content</body>"
    meta_fragment = '<meta charset="utf-8"><meta name="viewport" content="width=device-width">'

    assert not downloader._is_markdown(head_fragment)
    assert not downloader._is_markdown(meta_fragment)
def test_is_markdown_accepts_markdown_with_html_words():
    """Markdown that merely talks about HTML, or embeds it in a code fence, is still accepted."""
    downloader = LlmsTxtDownloader("https://example.com/llms.txt")

    # The word "html" appears in prose only; there are no real HTML document tags.
    prose_about_html = (
        "# Guide\n\nLearn about html tags in markdown. You can write HTML inside markdown."
    )
    # An HTML snippet inside a fenced code block within otherwise ordinary markdown.
    fenced_html_example = "# HTML Tutorial\n\n```html\n<div>example</div>\n```\n\n## More content"

    assert downloader._is_markdown(prose_about_html)
    assert downloader._is_markdown(fenced_html_example)
def test_html_detection_only_scans_first_500_chars():
    """Test that HTML detection only scans the first 500 characters.

    A DOCTYPE marker placed well beyond the 500-character window must not
    cause otherwise valid markdown to be rejected.
    """
    downloader = LlmsTxtDownloader("https://example.com/llms.txt")

    doctype = "<!DOCTYPE html>"
    # HTML marker appended after more than 500 characters of valid markdown.
    safe_markdown = "# Header\n\n" + ("Valid markdown content. " * 50) + "\n\n" + doctype

    # Precondition: the marker really does start outside the scanned window,
    # otherwise the assertion below would not be testing the 500-char limit.
    assert safe_markdown.index(doctype) >= 500

    # HTML beyond 500 chars should not trigger rejection
    assert downloader._is_markdown(safe_markdown)
def test_html_redirect_trap_scenario():
    """A full HTML product page, as served after a redirect away from llms.txt, is not markdown."""
    # Simulates the Claude Code redirect scenario (302 to an HTML page).
    redirected_page = """<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Claude Code - Product Page</title>
</head>
<body>
<h1>Claude Code</h1>
<p>Product information...</p>
</body>
</html>"""

    downloader = LlmsTxtDownloader("https://example.com/llms.txt")

    # The page carries heading markup (<h1>), yet it must still be classified as HTML.
    assert not downloader._is_markdown(redirected_page)
def test_download_rejects_html_redirect():
    """Test that download() properly rejects HTML redirects.

    The HTML payload is padded beyond 100 characters, the length below which
    download() is already expected to reject content (see
    test_empty_content_rejection). Without the padding the body would be
    dropped for being too short and this test would pass without the content
    validation ever being exercised.
    """
    downloader = LlmsTxtDownloader("https://example.com/llms.txt")

    mock_response = Mock()
    # Simulate server returning HTML instead of markdown
    mock_response.text = (
        "<!DOCTYPE html><html><body><h1>Product Page</h1>"
        + "<p>"
        + "Product information. " * 10
        + "</p></body></html>"
    )
    mock_response.raise_for_status = Mock()

    # Precondition: long enough that the short-content check cannot be the
    # reason for rejection.
    assert len(mock_response.text) > 100

    with patch("requests.get", return_value=mock_response):
        content = downloader.download()

    # Should return None (rejected as non-markdown)
    assert content is None