Skip to content

Commit 9586a6a

Browse files
B4nanclaude
andauthored
docs: serve markdown via Accept header in nginx (apify#3542)
## Summary

- Nginx now serves markdown when the request's `Accept` header contains `text/markdown` or `text/plain` (it appends `.md` to the proxied path), for both the JS and Python docs.
- Root paths (`/`, `/python`) are internally rewritten to `llms.txt` when markdown is requested.
- Adds open redirect protection for trailing-slash redirects.
- CI assertions verify `Content-Type` headers for both doc sources.

Ported from the apify/apify-docs nginx config.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

---------

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent a2d34a4 commit 9586a6a

File tree

2 files changed

+172
-2
lines changed

2 files changed

+172
-2
lines changed

.github/workflows/test-ci.yml

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,103 @@ jobs:
133133
APIFY_SIGNING_TOKEN: ${{ secrets.APIFY_SIGNING_TOKEN }}
134134
SEGMENT_TOKEN: ${{ secrets.SEGMENT_TOKEN }}
135135

136+
- name: Install Nginx
137+
run: |
138+
sudo apt-get update
139+
sudo apt-get install -y nginx
140+
141+
- name: Start Docusaurus server
  run: |
    cd website
    # Serve the already-built site in the background for the Nginx checks.
    nohup yarn docusaurus serve --port 3000 --no-open &
    # Poll until the server answers instead of relying on a fixed sleep, which
    # races on slow runners. curl retries refused connections once per second
    # and fails the step if the server never comes up.
    curl -fsS -o /dev/null --retry 30 --retry-delay 1 --retry-connrefused http://localhost:3000
147+
148+
# Generates a minimal top-level nginx config (default.conf) that includes
# website/nginx.conf inside an http {} block, keeps logs and the pid file under
# ./logs, then starts nginx with that config.
- name: Start Nginx with project config
149+
run: |
150+
PWD_PATH="$(pwd)"
151+
cat > default.conf <<EOF
152+
worker_processes auto;
153+
error_log ${PWD_PATH}/logs/error.log;
154+
pid ${PWD_PATH}/logs/nginx.pid;
155+
events {}
156+
http {
157+
access_log ${PWD_PATH}/logs/access.log;
158+
include ${PWD_PATH}/website/nginx.conf;
159+
}
160+
EOF
161+
# NOTE(review): this substitution edits default.conf only. The heredoc above
# writes just an `include` of website/nginx.conf and no upstream URL, so the
# pattern appears to match nothing and the proxy would still target the URLs
# written in website/nginx.conf. Confirm whether website/nginx.conf was the
# intended target; the pattern is also a prefix of the crawlee-python URLs there.
sed -i 's|https://apify.github.io/crawlee|http://localhost:3000|g' default.conf
162+
# The log directory must exist before nginx starts, since error_log and pid point into it.
mkdir -p "${PWD_PATH}/logs"
163+
nginx -c "${PWD_PATH}/default.conf"
164+
# Brief pause before the next step issues requests.
sleep 1
165+
166+
- name: Run header assertions
167+
run: |
168+
set -euo pipefail
169+
# Assert that a response header contains an expected substring.
#
# Usage: assert_header URL HEADER EXPECTED [extra curl args...]
#   URL       - address to request (redirects are not followed)
#   HEADER    - header name, matched case-insensitively at the start of a line
#   EXPECTED  - literal substring that must appear in the header line
#   extra     - forwarded verbatim to curl (e.g. -H "Accept: text/markdown")
#
# Prints the observed header line and returns 1 (with a message on stderr)
# when the expectation is not met.
function assert_header() {
  local url=$1 header=$2 expected=$3
  shift 3
  local actual
  # `|| true`: a missing header yields an empty string instead of tripping
  # errexit/pipefail, so the mismatch is reported by the check below.
  actual=$(curl -s -D - -o /dev/null "$@" "$url" | grep -i "^$header:" | tr -d '\r' || true)
  echo "→ $url → $actual"
  # -F: compare literally, so characters in EXPECTED are never read as a regex.
  if ! grep -qF -- "$expected" <<<"$actual"; then
    echo "❌ Expected '$expected' in '$header' for $url" >&2
    # Return rather than exit inside a subshell, so the failure propagates
    # even when the caller has errexit suppressed.
    return 1
  fi
}
179+
180+
# Assert that a request returns the expected HTTP status code.
#
# Usage: assert_status URL EXPECTED [extra curl args...]
#   URL       - address to request (redirects are not followed)
#   EXPECTED  - exact status code as a string, e.g. "302"
#   extra     - forwarded verbatim to curl
#
# Prints the observed status and returns 1 (with a message on stderr) on mismatch.
function assert_status() {
  local url=$1 expected=$2
  shift 2
  local actual
  # Declared separately from the assignment so a curl failure is not masked
  # by the exit status of `local`.
  actual=$(curl -s -o /dev/null -w "%{http_code}" "$@" "$url")
  echo "→ $url → HTTP $actual"
  if [ "$actual" != "$expected" ]; then
    echo "❌ Expected HTTP $expected but got $actual for $url" >&2
    return 1
  fi
}
189+
190+
# Assert that a request is NOT answered with an HTTP redirect.
#
# Usage: assert_no_redirect URL [extra curl args...]
#   URL    - address to request
#   extra  - forwarded verbatim to curl (e.g. --request-target '/\evil.com/')
#
# Prints the observed status (and Location header, if any). Returns 1 when the
# status is any redirect code (301, 302, 303, 307, 308), not only 301/302, so a
# change of redirect type cannot slip past the open-redirect checks.
function assert_no_redirect() {
  local url=$1
  shift
  local response status location
  # Headers go to stdout via -D -, and -w appends the status code on its own
  # final line so both can be read from a single request.
  response=$(curl -s -D - -o /dev/null -w "\n%{http_code}" "$@" "$url" 2>/dev/null)
  status=$(tail -n 1 <<<"$response")
  # `|| true`: the absence of a Location header is the expected case.
  location=$(grep -i "^location:" <<<"$response" | tr -d '\r' || true)
  echo "→ $url → HTTP $status ${location:+(${location})}"
  case "$status" in
    30[12378])
      echo "❌ Got redirect for $url: $location" >&2
      return 1
      ;;
  esac
}
202+
203+
echo "🧪 Checking open redirect protection..."
204+
assert_no_redirect "http://localhost:8080///%5Cevil.com/"
205+
assert_no_redirect "http://localhost:8080/%5Cevil.com/"
206+
assert_no_redirect "http://localhost:8080///%5cevil.com/"
207+
assert_no_redirect "http://localhost:8080" --request-target '/\evil.com/'
208+
assert_no_redirect "http://localhost:8080" --request-target '///\evil.com/'
209+
assert_status "http://localhost:8080/js/docs/quick-start/" "302"
210+
211+
echo "🧪 Checking Nginx responses... (crawlee JS)"
212+
assert_header "http://localhost:8080/" "Content-Type" "text/html"
213+
assert_header "http://localhost:8080/" "Content-Type" "text/markdown" -H "Accept: text/markdown"
214+
assert_header "http://localhost:8080/js/docs/quick-start" "Content-Type" "text/html"
215+
assert_header "http://localhost:8080/js/docs/quick-start.md" "Content-Type" "text/markdown"
216+
assert_header "http://localhost:8080/js/docs/quick-start" "Content-Type" "text/markdown" -H "Accept: text/markdown"
217+
assert_header "http://localhost:8080/llms.txt" "Content-Type" "text/markdown"
218+
assert_header "http://localhost:8080/llms-full.txt" "Content-Type" "text/markdown"
219+
220+
echo "🧪 Checking Nginx responses... (crawlee Python)"
221+
assert_header "http://localhost:8080/python/docs/quick-start" "Content-Type" "text/html"
222+
assert_header "http://localhost:8080/python/docs/quick-start.md" "Content-Type" "text/markdown"
223+
assert_header "http://localhost:8080/python/docs/quick-start" "Content-Type" "text/markdown" -H "Accept: text/markdown"
224+
assert_header "http://localhost:8080/python/llms.txt" "Content-Type" "text/markdown"
225+
assert_header "http://localhost:8080/python/llms-full.txt" "Content-Type" "text/markdown"
226+
227+
echo "✅ All Nginx header checks passed."
228+
229+
- name: Stop Nginx
230+
if: always()
231+
run: nginx -c "$(pwd)/default.conf" -s stop
232+
136233
lint:
137234
name: Lint
138235
runs-on: ubuntu-22.04

website/nginx.conf

Lines changed: 75 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,96 @@
1+
# $serve_markdown is 1 when the Accept request header mentions text/markdown
# or text/plain (case-insensitive match anywhere in the header), otherwise 0.
map $http_accept $serve_markdown {
    ~*text/markdown 1;
    ~*text/plain 1;
    default 0;
}
6+
7+
# $has_no_extension is 1 when the path part of the original request URI
# contains no dot, i.e. the URL does not name a file with an extension.
# $request_uri includes the query string, so the pattern matches a dot-free
# path followed by an optional "?query"; a dot inside the query (e.g. ?v=1.2)
# no longer counts as a file extension.
map $request_uri $has_no_extension {
    ~^[^.?]*(\?.*)?$ 1;
    default 0;
}
11+
112
# Nginx reverse proxy configuration for crawlee.dev
213
# Routes to GitHub Pages and handles legacy URL redirects
314
server {
415
listen 0.0.0.0:8080;
516
server_name 'crawlee.dev';
617

18+
# comment out the resolver and use localhost:3000 for local development
19+
set $backend "https://apify.github.io/crawlee";
20+
resolver 1.1.1.1 8.8.8.8 valid=30s ipv6=off;
21+
722
# Health check endpoint
823
location /health {
    access_log off;
    # `return` sends its body with the location's default MIME type, so declare
    # JSON here. Using `add_header Content-Type` instead would emit a second
    # Content-Type header next to the default one.
    default_type application/json;
    return 200 '{"status":"UP"}';
}
1328

29+
location = / {
30+
if ($serve_markdown) {
31+
rewrite ^ /llms.txt last;
32+
}
33+
proxy_pass $backend/;
34+
}
35+
36+
location ~ ^/(llms|llms-full)\.txt$ {
37+
proxy_hide_header Content-Type;
38+
add_header Content-Type 'text/markdown; charset=utf-8' always;
39+
proxy_pass $backend$uri;
40+
}
41+
42+
# remove trailing slashes from all URLs (except root /)
43+
# exact match locations (e.g., location = /python/) take priority over this regex
44+
# Only match URIs composed of safe characters (letters, digits, dots, hyphens,
45+
# underscores, forward slashes). This prevents open redirect via %5C (backslash):
46+
# nginx decodes %5C to \ in $uri, and \ in the Location header gets normalized
47+
# to / by browsers, turning /\evil.com into //evil.com (protocol-relative URL).
48+
location ~ ^(/[a-zA-Z0-9][a-zA-Z0-9_./-]*)/$ {
49+
rewrite ^(.+)/$ $1$is_args$args? redirect;
50+
}
51+
1452
location / {
    # Default: forward the original request URI (path plus query) unchanged.
    set $proxy_path $request_uri;

    # When markdown is requested ($serve_markdown = 1) for a URL without a file
    # extension ($has_no_extension = 1), proxy the ".md" variant instead.
    # The suffix must go between the path and the query string: appending it to
    # $request_uri as a whole would turn "/a?x=1" into "/a?x=1.md". Both flags
    # and the URI are folded into one string so a single regex can test the
    # condition and capture the path ($1) and optional query ($2).
    set $markdown_probe "$serve_markdown$has_no_extension:$request_uri";
    if ($markdown_probe ~ ^11:([^?]*)(\?.*)?$) {
        set $proxy_path "$1.md$2";
    }
    proxy_pass $backend$proxy_path;
}
61+
62+
### Repository path: "/python"
63+
64+
location = /python {
65+
if ($serve_markdown) {
66+
rewrite ^ /python/llms.txt last;
67+
}
68+
proxy_pass https://apify.github.io/crawlee-python/;
1669
}
17-
location /python {
70+
71+
location = /python/ {
72+
if ($serve_markdown) {
73+
rewrite ^ /python/llms.txt last;
74+
}
1875
proxy_pass https://apify.github.io/crawlee-python/;
1976
}
2077

78+
location ~ ^/python/(llms|llms-full)\.txt$ {
79+
proxy_hide_header Content-Type;
80+
add_header Content-Type 'text/markdown; charset=utf-8' always;
81+
proxy_pass https://apify.github.io/crawlee-python/$1.txt;
82+
}
83+
84+
location ~ ^/python/(.*)$ {
85+
set $path_suffix $1;
86+
set $proxy_path "/$path_suffix";
87+
set $rewrite_condition "$serve_markdown$has_no_extension";
88+
if ($rewrite_condition = "11") {
89+
set $proxy_path "${proxy_path}.md";
90+
}
91+
proxy_pass https://apify.github.io/crawlee-python$proxy_path;
92+
}
93+
2194
# So that we can have both GH pages and crawlee.dev/python working and loading assets from the same path
2295
location /crawlee-python {
2396
proxy_pass https://apify.github.io/crawlee-python/;

0 commit comments

Comments
 (0)