Skip to content

Commit d11f54a

Browse files
qiancai authored and ti-chi-bot committed
This is an automated cherry-pick of pingcap#22894
Signed-off-by: ti-chi-bot <ti-community-prow-bot@tidb.io>
1 parent 06e2788 commit d11f54a

5 files changed

Lines changed: 215 additions & 10 deletions

File tree

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
use strict;
use warnings;
use File::Basename qw(dirname);
use File::Path qw(make_path);

# Reads a unified diff on STDIN and, for every target file that gained at
# least one line that looks link-bearing (an http(s):// URL or an href=
# attribute), writes ALL of that file's added lines to OUT_ROOT/<file>.
# The path of every file written is appended, one per line, to LIST_PATH.
#
# Usage: perl <this script> OUT_ROOT LIST_PATH < unified.diff
#
# Dies if an argument is missing or if an output file cannot be written.

my ($out_root, $list_path) = @ARGV;
die "usage: $0 OUT_ROOT LIST_PATH\n" unless defined $out_root && defined $list_path;

my %added_lines_by_file;    # target path => [ added lines, in diff order ]
my %has_link_candidate;     # target path => 1 once an added line looks link-bearing
my $file;                   # target path of the file section being read (undef if unusable)

# Lines still owed by the current hunk, taken from its "@@ -a,b +c,d @@"
# header.  Counting them is what lets us tell hunk content from file headers:
# an added line whose own text starts with "++" shows up as "+++..." and must
# not be dropped or mistaken for a "+++ b/<path>" header.
my $old_left = 0;
my $new_left = 0;

while (my $line = <STDIN>) {
    chomp $line;

    if ($old_left > 0 || $new_left > 0) {
        if ($line =~ /^\+/) {
            $new_left--;
            next unless defined $file;

            my $content = substr($line, 1);
            push @{ $added_lines_by_file{$file} }, $content;
            $has_link_candidate{$file} = 1
                if $content =~ m{https?://}i || $content =~ /\bhref\s*=/i;
            next;
        }
        if ($line =~ /^-/) {
            $old_left--;
            next;
        }
        # "\ No newline at end of file" belongs to neither side's count.
        next if $line =~ /^\\/;
        if ($line eq "" || $line =~ /^ /) {
            # Context line (some tools strip the lone leading space).
            $old_left--;
            $new_left--;
            next;
        }

        # Anything else cannot be hunk content: the hunk was shorter than
        # announced.  Abandon it and parse this line as a header instead.
        ($old_left, $new_left) = (0, 0);
    }

    if ($line =~ m{^\+\+\+ b/(.+)$}) {
        $file = $1;
        # git appends a TAB after a path that contains a space.
        $file =~ s/\t\z//;
        next;
    }

    if ($line =~ /^\+\+\+ /) {
        # Quoted path, /dev/null, or a non-"b/" prefix: we cannot name the
        # target, so make sure its hunks are not credited to the previous file.
        undef $file;
        next;
    }

    if ($line =~ /^\@\@ -\d+(?:,(\d+))? \+\d+(?:,(\d+))? \@\@/) {
        # An omitted count means exactly one line.
        $old_left = defined $1 ? $1 : 1;
        $new_left = defined $2 ? $2 : 1;
    }
}

make_path($out_root);
open my $list_fh, ">", $list_path or die "cannot write $list_path: $!";

for my $path (sort keys %added_lines_by_file) {
    next unless $has_link_candidate{$path};
    # Never write outside OUT_ROOT: skip any path with a ".." component.
    next if $path =~ m{(?:^|/)\.\.(?:/|$)};

    my $out_path = "$out_root/$path";
    make_path(dirname($out_path));
    open my $out_fh, ">", $out_path or die "cannot write $out_path: $!";
    print {$out_fh} "$_\n" for @{ $added_lines_by_file{$path} };
    # Buffered write errors (e.g. disk full) only surface at close.
    close $out_fh or die "cannot write $out_path: $!";
    print {$list_fh} "$out_path\n";
}

close $list_fh or die "cannot write $list_path: $!";
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
use strict;
use warnings;
use File::Basename qw(dirname);
use File::Path qw(make_path);

# Reads NUL-separated file paths on STDIN.  For each existing regular file it
# collects quoted href="..." / href='...' values that are root-relative
# ("/path") or protocol-relative ("//host/path"), turns them into absolute
# URLs, and writes the sorted, de-duplicated set to OUT_ROOT/<file>, one
# "<url>" per line.  The path of every file written is appended, one per
# line, to LIST_PATH.
#
# Usage: perl <this script> OUT_ROOT LIST_PATH < nul-separated-paths
# Environment: DOCS_SITE_BASE_URL - prefix for root-relative hrefs (required).
#
# Dies if an argument or DOCS_SITE_BASE_URL is missing, or if a file cannot be
# read or written.
#
# NOTE(review): output files keep the input's name and extension; confirm the
# link checker still extracts "<url>" lines when that extension is .html/.htm.

my ($out_root, $list_path) = @ARGV;
die "usage: $0 OUT_ROOT LIST_PATH\n" unless defined $out_root && defined $list_path;

my $site_base_url = $ENV{DOCS_SITE_BASE_URL};
die "DOCS_SITE_BASE_URL is not set\n" unless defined $site_base_url && $site_base_url ne "";
# Root-relative hrefs already start with "/", so drop trailing slashes here.
$site_base_url =~ s{/+\z}{};

make_path($out_root);
open my $list_fh, ">", $list_path or die "cannot write $list_path: $!";

{
    local $/ = "\0";    # input records are NUL-terminated paths
    while (my $file = <STDIN>) {
        chomp $file;
        # Never write outside OUT_ROOT: skip any path with a ".." component.
        next if $file =~ m{(?:^|/)\.\.(?:/|$)};
        next unless -f $file;

        open my $in_fh, "<", $file or die "cannot read $file: $!";
        my $content = do { local $/; <$in_fh> };    # slurp the whole file
        close $in_fh;
        next unless defined $content;

        my %seen;
        while ($content =~ /\bhref\s*=\s*(["'])(.*?)\1/gi) {
            my $href = $2;
            $href =~ s/^\s+|\s+$//g;
            next if $href eq "";
            # Skip same-page fragments and anything that already carries a
            # scheme (http:, https:, mailto:, ...).
            next if $href =~ m{^(?:#|[a-z][a-z0-9+.-]*:)}i;

            if ($href =~ m{^//}) {
                $seen{"https:$href"} = 1;
            }
            elsif ($href =~ m{^/}) {
                $seen{"$site_base_url$href"} = 1;
            }
            # Document-relative hrefs are deliberately not collected.
        }

        next unless %seen;
        my $out_path = "$out_root/$file";
        make_path(dirname($out_path));
        open my $out_fh, ">", $out_path or die "cannot write $out_path: $!";
        print {$out_fh} "<$_>\n" for sort keys %seen;
        # Buffered write errors (e.g. disk full) only surface at close.
        close $out_fh or die "cannot write $out_path: $!";
        print {$list_fh} "$out_path\n";
    }
}

close $list_fh or die "cannot write $list_path: $!";
Lines changed: 43 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,14 @@
1-
name: Links (Fail Fast)
1+
name: ci / external-links-in-changed-lines (pull_request)
22

33
on:
44
pull_request:
55

6+
env:
7+
DOCS_SITE_BASE_URL: "https://docs.pingcap.com"
8+
9+
permissions:
10+
contents: read
11+
612
jobs:
713
linkChecker:
814
runs-on: ubuntu-latest
@@ -11,17 +17,49 @@ jobs:
1117
with:
1218
fetch-depth: 2
1319

# Write the added lines of changed markdown files that contain link
# candidates under .lychee-pr-changed-lines/, list those files in
# .lychee-pr-inputs.txt, and publish the count / has_inputs step outputs.
- name: Collect changed markdown lines with links
  id: changed-lines
  run: |
    git -c core.quotePath=false diff --unified=0 --diff-filter=AM --no-ext-diff --no-color HEAD^1 HEAD -- '*.md' |
      perl .github/scripts/extract-changed-markdown-lines.pl .lychee-pr-changed-lines .lychee-pr-inputs.txt

    count=$(wc -l < .lychee-pr-inputs.txt | tr -d ' ')
    echo "count=${count}" >> "$GITHUB_OUTPUT"

    if [ "$count" -gt 0 ]; then
      echo "has_inputs=true" >> "$GITHUB_OUTPUT"
      sed 's/^/- /' .lychee-pr-inputs.txt
    else
      echo "has_inputs=false" >> "$GITHUB_OUTPUT"
    fi

# Expand site-relative hrefs found in the collected lines into absolute URLs
# and append the generated files to the link checker's input list.
- name: Collect doc site href URLs
  if: ${{ steps.changed-lines.outputs.has_inputs == 'true' }}
  run: |
    tr '\n' '\0' < .lychee-pr-inputs.txt |
      perl .github/scripts/extract-site-hrefs.pl .lychee-site-hrefs .lychee-site-href-files.txt

    count=$(wc -l < .lychee-site-href-files.txt | tr -d ' ')
    if [ "$count" -gt 0 ]; then
      cat .lychee-site-href-files.txt >> .lychee-pr-inputs.txt
      sed 's/^/- /' .lychee-site-href-files.txt
    fi

# Single link-check step. The pre-cherry-pick step keyed on
# steps.changed-files, an id that no longer exists in this workflow.
- name: Link Checker
  if: ${{ steps.changed-lines.outputs.has_inputs == 'true' }}
  uses: lycheeverse/lychee-action@v2
  with:
    fail: true
    failIfEmpty: false
    args: --root-dir $(pwd) --exclude '^file://' -E -i -n -t 45 --files-from .lychee-pr-inputs.txt
  env:
    GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}}

.github/workflows/link.yaml

Lines changed: 61 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,18 @@
1-
name: Links
1+
name: Check external URLs in all files
22

33
on:
44
repository_dispatch:
55
workflow_dispatch:
66
schedule:
77
- cron: "0 0 * * 1"
88

9+
env:
10+
DOCS_SITE_BASE_URL: "https://docs.pingcap.com"
11+
12+
permissions:
13+
contents: read
14+
issues: write
15+
916
jobs:
1017
linkChecker:
1118
runs-on: ubuntu-latest
@@ -14,20 +21,69 @@ jobs:
1421

1522
- name: Download Exclude Path
1623
run: |
17-
curl https://raw.githubusercontent.com/pingcap/docs/master/.lycheeignore --output .lycheeignore
24+
curl -fsSL https://raw.githubusercontent.com/pingcap/docs/master/.lycheeignore --output .lycheeignore
1825
1926
- name: Check Links
2027
uses: lycheeverse/lychee-action@v1.6.1
2128
with:
2229
# Don't fail as we want the workflow to continue and run 'Create Issue From File'
23-
# Excluding releases paths as historic releases may have outdated links.
2430
fail: false
2531
failIfEmpty: false
26-
args: --root-dir $(pwd) --cache --max-cache-age 8d -E -i -n -t 45 --exclude-path '^./releases/' --exclude-path '^./tidb-cloud/releases/' --exclude-path '^./resources/' .
27-
output: out.md
32+
args: --root-dir $(pwd) --cache --max-cache-age 8d --cache-exclude-status '..200,300..' --exclude '^file://' -E -i -n -t 45 --exclude-path '^\./releases/' --exclude-path '^\./tidb-cloud/releases/' --exclude-path '^\./resources/' .
33+
output: out-external.md
34+
env:
35+
GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}}
36+
37+
- name: Collect doc site href URLs
38+
id: site-hrefs
39+
run: |
40+
git ls-files -z -- \
41+
'*.md' '*.mdx' '*.markdown' '*.mkd' '*.mdown' '*.mdwn' '*.mkdn' '*.mkdown' \
42+
'*.html' '*.htm' '*.css' '*.txt' |
43+
perl -0ne 'print unless m{^(?:releases|tidb-cloud/releases|resources)/}' |
44+
perl .github/scripts/extract-site-hrefs.pl .lychee-site-hrefs .lychee-site-href-files.txt
45+
46+
count=$(wc -l < .lychee-site-href-files.txt | tr -d ' ')
47+
echo "count=${count}" >> "$GITHUB_OUTPUT"
48+
49+
if [ "$count" -gt 0 ]; then
50+
echo "has_hrefs=true" >> "$GITHUB_OUTPUT"
51+
sed 's/^/- /' .lychee-site-href-files.txt
52+
else
53+
echo "has_hrefs=false" >> "$GITHUB_OUTPUT"
54+
fi
55+
56+
- name: Check site href URLs
57+
if: ${{ steps.site-hrefs.outputs.has_hrefs == 'true' }}
58+
uses: lycheeverse/lychee-action@v2
59+
with:
60+
# Don't fail as we want the workflow to continue and run 'Create Issue From File'
61+
fail: false
62+
failIfEmpty: false
63+
args: --cache --max-cache-age 8d --cache-exclude-status '..200,300..' -E -i -n -t 45 --files-from .lychee-site-href-files.txt
64+
output: out-site-hrefs.md
2865
env:
2966
GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}}
3067

68+
- name: Combine Link Reports
69+
run: |
70+
{
71+
echo "# External URL Check"
72+
echo
73+
if [ -f out-external.md ]; then
74+
cat out-external.md
75+
else
76+
echo "*(external link check did not produce output)*"
77+
fi
78+
79+
if [ -f out-site-hrefs.md ]; then
80+
echo
81+
echo "# Site href URL Check"
82+
echo
83+
cat out-site-hrefs.md
84+
fi
85+
} > out.md
86+
3187
- name: Create Issue From File
3288
uses: peter-evans/create-issue-from-file@v4
3389
with:

.lycheeignore

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,14 +33,21 @@ https://platform\.openai\.com/api-keys
3333
https://openai\.com/.*
3434
https://jwt\.io/
3535
https://typeorm\.io/.*
36+
https://dl\.acm\.org/doi/10\.1145/(1988842\.1988850|2588555\.2610507)
37+
https://developer\.salesforce\.com/.*
38+
https?://(www\.)?npmjs\.com/package/.*
3639
https://dash\.cloudflare\.com/.*
3740
https://centminmod\.com/mydumper\.html
3841
https://docs\.pingcap\.com/tidb/v6\.6/system-variables#tidb_pessimistic_txn_aggressive_locking-new-in-v660
3942
https://docs\.pingcap\.com/tidb/v7\.6/system-variables#tidb_ddl_version-new-in-v760
4043
https://developers\.redhat\.com/blog/2021/01/05/building-red-hat-enterprise-linux-9-for-the-x86-64-v2-microarchitecture-level
4145
https://portal\.azure\.com/.*
4246
https://.*github.*/%7B%7B%7B%20.tidb_operator_version%20%7D%7D%7D
4349
https://.*github.*/%7B%7B%7B.tidb-operator-version%7D%7D%7D
50+
https://console\.cloud\.google\.com/.*
4451
https://portal\.azure\.com/.*
4552
https://azuremarketplace\.microsoft\.com/.*
4653
https://one\.newrelic\.com/.*

0 commit comments

Comments
 (0)