-
Notifications
You must be signed in to change notification settings - Fork 16
58 lines (58 loc) · 2.3 KB
/
archive.yml
File metadata and controls
58 lines (58 loc) · 2.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
# Workflow identity and triggers: can be started by hand, and also runs on a
# twice-yearly schedule.
name: Archive.org Saver
on:
  workflow_dispatch:
  schedule:
    # minute 40, hour 3, day-of-month 26, months May and November
    # (GitHub evaluates cron schedules in UTC).
    - cron: '40 3 26 5,11 *'
jobs:
  # Downloads the log of the most recent run of workflow 9334935, extracts and
  # filters the URLs it mentions, then hands the surviving URLs to
  # scripts/archivedotorg.py for archiving.
  Archive:
    env:
      # Name of the entry pulled out of the downloaded logs archive.
      LOGFILE: "0_Links.txt"
      # Token picked up by the `gh` CLI calls below.
      GH_TOKEN: ${{ secrets.BUILD_ACTION_TOKEN }}
    runs-on: ubuntu-latest
    steps:
      - name: Checkout the Code
        uses: actions/checkout@v4
        with:
          ref: main
      - name: Install Dependencies
        shell: bash
        env:
          # Exposed as an environment variable instead of being expanded
          # inline in the script, so the shell never parses the secret's
          # content as code.
          ARCHIVE_ORG_AUTH: ${{ secrets.ARCHIVE_ORG_AUTH }}
        run: |
          cd scripts/archivable_urls

          # Look up the newest run of workflow 9334935 and download its logs.
          # NOTE(review): presumably this is the Lychee link-check workflow
          # (the log is renamed to lycheeout.txt below) - confirm.
          RUNID=$(gh api \
            -H "Accept: application/vnd.github+json" \
            -H "X-GitHub-Api-Version: 2022-11-28" \
            "/repos/buddhist-uni/buddhist-uni.github.io/actions/workflows/9334935/runs" \
            -q '.workflow_runs[0].id')
          echo "Last runid was $RUNID"
          gh api \
            -H "Accept: application/vnd.github+json" \
            -H "X-GitHub-Api-Version: 2022-11-28" \
            "/repos/buddhist-uni/buddhist-uni.github.io/actions/runs/$RUNID/logs" > logs.zip
          unzip logs.zip "$LOGFILE"
          mv "$LOGFILE" "lycheeout.txt"

          # Stage 1: extract URLs from the log; an empty urls.txt is fatal.
          python extracturls.py
          if [[ -s urls.txt ]]; then
            echo " urls.txt created"
          else
            echo "::error file=extracturls.py,line=8,title=No URLs Found::Failed to extract URLs from Lychee logs (lychee output changed format?)"
            exit 1
          fi

          # Stage 2: filter the URLs; an empty filteredurls.txt is fatal.
          python filterurls.py # creates scripts/archivable_urls/filteredurls.txt
          if [[ -s filteredurls.txt ]]; then
            echo " filteredurls.txt created"
          else
            echo "::error file=filterurls.py,title=No URLs Found::Failed to find any URLs worth saving in urls.txt"
            exit 1
          fi

          # Stage 3: write the archive.org credentials into the home directory.
          # '%s' writes the secret verbatim: it is never used as the printf
          # format string, so '%' and backslashes in it are not reinterpreted.
          cd ~
          printf '%s' "$ARCHIVE_ORG_AUTH" > archive.org.auth
          if [[ -s archive.org.auth ]]; then
            echo "archive.org.auth created"
          else
            echo "::error title=No ARCHIVE_ORG_AUTH Secret::Failed to find ARCHIVE_ORG_AUTH secret"
            exit 1
          fi

          pip install tqdm titlecase pyyaml
      - name: Archive Archivable External Links
        shell: bash
        run: |
          cd scripts
          # Reads the whitespace-separated URL list produced by the previous
          # step and passes it to archive_urls().
          python -c "from archivedotorg import *; urls = Path('archivable_urls/filteredurls.txt').read_text().split(); archive_urls(urls)"
      # Common Crawl seems to be doing a good enough job
      # - name: Archive Internal Pages
      #   run: |
      #     cd scripts
      #     python archive_site.py