# .github/workflows/archive.yml (buddhist-uni/buddhist-uni.github.io, main branch)
name: Archive.org Saver
# Runs on demand, and twice a year on a schedule.
on:
  workflow_dispatch:
  schedule:
    # 03:40 on the 26th of May and of November (GitHub evaluates cron in UTC).
    - cron: '40 3 26 5,11 *'
jobs:
  Archive:
    runs-on: ubuntu-latest
    env:
      # Name of the single log file that gets extracted from the downloaded
      # logs archive of the previous workflow run.
      LOGFILE: '0_Links.txt'
      # Token made available to every step; presumably what the `gh api`
      # calls authenticate with -- confirm the secret's scopes.
      GH_TOKEN: ${{ secrets.BUILD_ACTION_TOKEN }}
    steps:
      # Always operate on the main branch, regardless of what triggered the run.
      - name: Checkout the Code
        uses: actions/checkout@v4
        with:
          ref: main
      # Despite its name, this step prepares everything the archiving step
      # needs: it downloads the logs of the latest completed run of workflow
      # 9334935 (apparently the Lychee link checker, judging by the error text
      # below), derives the URL lists from them, writes the archive.org
      # credentials file and finally installs the Python packages.
      - name: Install Dependencies
        shell: bash
        env:
          # Handed over via the environment instead of being interpolated into
          # the script text, so that characters in the secret can neither be
          # interpreted by the shell nor by printf's format parser.
          ARCHIVE_ORG_AUTH: ${{ secrets.ARCHIVE_ORG_AUTH }}
        run: |
          cd scripts/archivable_urls
          # Only completed runs have downloadable logs, so skip any run that is
          # still queued or in progress.
          RUNID=$(gh api -H "Accept: application/vnd.github+json" -H "X-GitHub-Api-Version: 2022-11-28" "/repos/buddhist-uni/buddhist-uni.github.io/actions/workflows/9334935/runs?status=completed&per_page=1" -q '.workflow_runs[0].id')
          if [[ -z "$RUNID" || "$RUNID" == "null" ]]; then
            echo "::error title=No Previous Run::Failed to find a completed run of workflow 9334935 to take logs from"
            exit 1
          fi
          echo "Last runid was $RUNID"
          gh api -H "Accept: application/vnd.github+json" -H "X-GitHub-Api-Version: 2022-11-28" "/repos/buddhist-uni/buddhist-uni.github.io/actions/runs/$RUNID/logs" > logs.zip
          # Pull just the one log file out of the archive and give it the name
          # the extraction script is run against.
          unzip logs.zip "$LOGFILE"
          mv "$LOGFILE" "lycheeout.txt"
          python extracturls.py # expected to create urls.txt
          if [[ -s urls.txt ]]; then
            echo "  urls.txt created"
          else
            echo "::error file=extracturls.py,line=8,title=No URLs Found::Failed to extract URLs from Lychee logs (lychee output changed format?)"
            exit 1
          fi
          python filterurls.py # creates scripts/archivable_urls/filteredurls.txt
          if [[ -s filteredurls.txt ]]; then
            echo "  filteredurls.txt created"
          else
            echo "::error file=filterurls.py,title=No URLs Found::Failed to find any URLs worth saving in urls.txt"
            exit 1
          fi
          cd ~
          # Write the secret verbatim ('%s' keeps any % or backslash literal)
          # into a file only the current user can read.
          (umask 077; printf '%s' "$ARCHIVE_ORG_AUTH" > archive.org.auth)
          if [[ -s archive.org.auth ]]; then
            echo "archive.org.auth created"
          else
            echo "::error title=No ARCHIVE_ORG_AUTH Secret::Failed to find ARCHIVE_ORG_AUTH secret"
            exit 1
          fi
          pip install tqdm titlecase pyyaml
      # Feeds every whitespace-separated entry of filteredurls.txt to
      # archive_urls() from scripts/archivedotorg.py. `Path` is not imported
      # explicitly, so it is expected to come in through the star import.
      - name: Archive Archivable External Links
        shell: bash
        run: |
          cd scripts
          python -c "from archivedotorg import *; urls = Path('archivable_urls/filteredurls.txt').read_text().split(); archive_urls(urls)"
# Disabled: Common Crawl seems to be doing a good enough job of archiving the internal pages
#      - name: Archive Internal Pages
#        run: |
#          cd scripts
#          python archive_site.py