-
Notifications
You must be signed in to change notification settings - Fork 7
129 lines (109 loc) · 4.04 KB
/
Copy pathcrawl_proxy_blocks.yml
File metadata and controls
129 lines (109 loc) · 4.04 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
# Workflow: runs the crawler (`python -m src.crawler`), persists its state files
# on the `data` branch, publishes `proxy_blocks.csv` as a dated GitHub Release,
# and prunes old releases and workflow runs.
name: Crawl Wikipedia Proxy Blocks

on:
  repository_dispatch:
  workflow_dispatch:
  schedule:
    # Run daily at 00:22 UTC
    - cron: '22 0 * * *'

jobs:
  crawl:
    runs-on: ubuntu-latest
    permissions:
      # contents: push to the data branch and create/delete releases and tags.
      contents: write
      # actions: delete old workflow runs in the final step.
      actions: write
    timeout-minutes: 120
    # Only run on the original repository, not forks
    if: github.repository == 'networkcats/OpenProxyDB'
    steps:
      - name: Checkout repository
        uses: actions/checkout@v6
        with:
          token: ${{ secrets.GITHUB_TOKEN }}

      - name: Set up Python
        uses: actions/setup-python@v6
        with:
          # Quoted so the version is read as a string, not a float.
          python-version: '3.14'

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt

      # Restore the state files saved by a previous run, if any. Both the
      # fetch and the per-file lookups are best-effort: the data branch (or a
      # given file on it) may not exist yet.
      - name: Fetch data files from data branch
        run: |
          git fetch origin data:data 2>/dev/null || true
          for data_file in last_crawl_time.txt block_metadata.csv; do
            if git show "data:${data_file}" >/dev/null 2>&1; then
              git show "data:${data_file}" > "${data_file}"
              echo "Loaded ${data_file} from data branch"
            else
              echo "No ${data_file} found in data branch"
            fi
          done

      # NOTE(review): the release step below reads IPS_ADDED, IPS_REMOVED and
      # TOTAL_IPS from the job environment; presumably the crawler exports
      # them via $GITHUB_ENV - confirm in src/crawler.
      - name: Run crawler
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          python -m src.crawler

      # Export today's date (YYYY.MM.DD) as TAG_NAME for the release step.
      - name: Set tag name
        run: |
          echo "TAG_NAME=$(date +"%Y.%m.%d")" >> "$GITHUB_ENV"

      # Store the two state files as the single commit of the `data` branch.
      - name: Push data files to data branch
        env:
          # Passed through the environment instead of being interpolated into
          # the script text, so the value is never parsed as shell code.
          GIT_ACTOR: ${{ github.actor }}
        run: |
          git config user.name "${GIT_ACTOR}"
          git config user.email "${GIT_ACTOR}@users.noreply.github.com"

          # Move the crawler output out of the work tree so the branch switch
          # below cannot conflict with it. Moving the files (rather than
          # holding their contents in shell variables) keeps them
          # byte-for-byte intact; a missing file fails the step.
          STASH_DIR="$(mktemp -d)"
          mv last_crawl_time.txt block_metadata.csv "${STASH_DIR}/"

          # Check if data branch exists remotely
          if git ls-remote --exit-code --heads origin data >/dev/null 2>&1; then
            # Branch exists, fetch and checkout
            git fetch origin data
            git checkout -B data origin/data
          else
            # Create orphan branch with an empty index
            git checkout --orphan data
            git rm -rf . >/dev/null 2>&1 || true
          fi

          # Put the fresh files back on top of whatever the branch contained
          mv "${STASH_DIR}/last_crawl_time.txt" "${STASH_DIR}/block_metadata.csv" .

          # Commit with amend to keep single commit; a brand-new orphan branch
          # has no HEAD yet, so it needs a regular first commit instead.
          git add last_crawl_time.txt block_metadata.csv
          if git rev-parse HEAD >/dev/null 2>&1; then
            git commit --amend -m "Update data files"
          else
            git commit -m "Update data files"
          fi

          # Force push (required because the commit is rewritten on every run)
          git push --force origin data

      - name: Upload to Releases
        uses: softprops/action-gh-release@v2
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
          tag_name: ${{ env.TAG_NAME }}
          name: Proxy Blocks ${{ env.TAG_NAME }}
          # Blank lines separate the Markdown list from the surrounding
          # paragraphs so the last sentence is not folded into the list.
          body: |
            Proxy block list updated on ${{ env.TAG_NAME }}

            - New IPs added: ${{ env.IPS_ADDED }}
            - IPs removed: ${{ env.IPS_REMOVED }}
            - Total IPs: ${{ env.TOTAL_IPS }}

            This release contains IP addresses blocked by Wikipedia for proxy-related reasons.
          files: proxy_blocks.csv

      # NOTE(review): this third-party action is referenced by a mutable
      # branch and runs with a write-capable token; consider pinning it to a
      # release tag or commit SHA.
      - name: Remove old Releases
        uses: dev-drprasad/delete-older-releases@master
        with:
          keep_latest: 2
          delete_tags: true
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

      # NOTE(review): same concern as above - mutable `main` ref; consider
      # pinning to a release tag or commit SHA.
      - name: Delete old workflow runs
        uses: Mattraks/delete-workflow-runs@main
        with:
          retain_days: 0
          keep_minimum_runs: 2