# .github/workflows/update-and-process-tranco.yml (sublime-security/static-files)
# Refreshes the Tranco top-1M domain list and the top-N files derived from it,
# then proposes the result as a pull request.
name: Update and Process Tranco CSV

on:
  schedule:
    # Runs at 00:00 UTC on the 1st of every month
    - cron: '0 0 1 * *'
  # Allow manual triggering
  workflow_dispatch:

# Scopes for the workflow's GITHUB_TOKEN: the job pushes a new branch
# (contents) and opens a pull request with `gh pr create` (pull-requests).
permissions:
  contents: write
  pull-requests: write

jobs:
  update-and-process-tranco:
    runs-on: ubuntu-latest

    steps:
      # Check out the repository so manifest.json can be validated and the
      # generated CSV files can be committed. The action is pinned to a full
      # commit SHA; the trailing comment records the matching release tag.
      - name: Checkout repository
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683  # v4.2.2
        with:
          # Shallow clone: later steps only add a commit on top of HEAD
          fetch-depth: 1

      - name: Set up date variables
        id: date
        run: |
          # Publish two step outputs for later steps:
          #   today     - calendar date (YYYY-MM-DD)
          #   timestamp - date and time down to the second, digits only
          printf 'today=%s\n' "$(date +'%Y-%m-%d')" >> "$GITHUB_OUTPUT"
          printf 'timestamp=%s\n' "$(date +'%Y%m%d%H%M%S')" >> "$GITHUB_OUTPUT"

      - name: Fetch Tranco list ID
        id: tranco-id
        run: |
          # Resolve the ID of the current Tranco top-1M list and publish it as
          # the step output `id`. The request is attempted up to MAX_RETRIES
          # times; the step fails if no attempt yields a valid ID.
          MAX_RETRIES=5
          RETRY_DELAY=10
          ATTEMPT=1

          while [ "$ATTEMPT" -le "$MAX_RETRIES" ]; do
            echo "Attempt $ATTEMPT of $MAX_RETRIES: Fetching Tranco list ID..."

            # -o writes the response body to a file, -w prints only the HTTP
            # status code to stdout. The default Actions shell runs `bash -e`,
            # so an unguarded non-zero curl exit (DNS failure, timeout, ...)
            # would abort the step right here and bypass this retry loop.
            # Capturing the exit code keeps the loop in control.
            CURL_EXIT=0
            HTTP_STATUS=$(curl -s -o tranco_response.txt -w "%{http_code}" \
              --retry 3 --retry-delay 5 --retry-max-time 120 \
              --connect-timeout 10 --max-time 60 \
              https://tranco-list.eu/top-1m-id) || CURL_EXIT=$?
            HTTP_STATUS=${HTTP_STATUS:-000}

            echo "HTTP Status Code: $HTTP_STATUS"

            if [ "$CURL_EXIT" -ne 0 ]; then
              echo "curl failed with exit code $CURL_EXIT (HTTP status: $HTTP_STATUS)"
            elif [ "$HTTP_STATUS" != "200" ]; then
              echo "Request failed with HTTP status code: $HTTP_STATUS"
            else
              TRANCO_ID=$(cat tranco_response.txt)
              echo "Raw response: '$TRANCO_ID'"

              # A valid ID is a non-empty, purely alphanumeric string
              if [[ "$TRANCO_ID" =~ ^[A-Za-z0-9]+$ ]]; then
                echo "id=$TRANCO_ID" >> "$GITHUB_OUTPUT"
                echo "Successfully fetched Tranco list ID: $TRANCO_ID"
                exit 0
              fi

              echo "Received invalid Tranco ID: '$TRANCO_ID' despite HTTP 200"
            fi

            # Only wait when another attempt will actually follow
            if [ "$ATTEMPT" -lt "$MAX_RETRIES" ]; then
              echo "Retrying in $RETRY_DELAY seconds..."
              sleep "$RETRY_DELAY"
            fi
            ATTEMPT=$((ATTEMPT + 1))
          done

          echo "Failed to fetch Tranco list ID after $MAX_RETRIES attempts"
          exit 1

      - name: Download Tranco list
        id: download
        env:
          STEPS_TRANCO_ID_OUTPUTS_ID: ${{ steps.tranco-id.outputs.id }}
        run: |
          # Download the zipped list for the ID resolved by the previous step
          # into tranco.zip. The download is attempted up to MAX_RETRIES
          # times; the step fails if no attempt produces a non-empty file.
          MAX_RETRIES=5
          RETRY_DELAY=15
          ATTEMPT=1

          while [ "$ATTEMPT" -le "$MAX_RETRIES" ]; do
            echo "Attempt $ATTEMPT of $MAX_RETRIES: Downloading Tranco list ${STEPS_TRANCO_ID_OUTPUTS_ID}..."

            # -w prints only the HTTP status code to stdout. The default
            # Actions shell runs `bash -e`, so an unguarded non-zero curl exit
            # would abort the step here and bypass this retry loop. The exit
            # code is also checked below, because an interrupted transfer can
            # report HTTP 200 while leaving a truncated file behind.
            CURL_EXIT=0
            HTTP_STATUS=$(curl -s -L -o tranco.zip -w "%{http_code}" \
              --retry 3 --retry-delay 10 --retry-max-time 300 \
              --connect-timeout 15 --max-time 300 \
              "https://tranco-list.eu/download_daily/${STEPS_TRANCO_ID_OUTPUTS_ID}") || CURL_EXIT=$?
            HTTP_STATUS=${HTTP_STATUS:-000}

            echo "HTTP Status Code: $HTTP_STATUS"

            if [ "$CURL_EXIT" -ne 0 ]; then
              echo "curl failed with exit code $CURL_EXIT (HTTP status: $HTTP_STATUS)"
            elif [ "$HTTP_STATUS" != "200" ]; then
              echo "Download failed with HTTP status code: $HTTP_STATUS"
            elif [ ! -s tranco.zip ]; then
              echo "Downloaded file is empty despite HTTP 200"
            else
              echo "Successfully downloaded Tranco list ${STEPS_TRANCO_ID_OUTPUTS_ID}"
              exit 0
            fi

            # Only wait when another attempt will actually follow
            if [ "$ATTEMPT" -lt "$MAX_RETRIES" ]; then
              echo "Retrying in $RETRY_DELAY seconds..."
              sleep "$RETRY_DELAY"
            fi
            ATTEMPT=$((ATTEMPT + 1))
          done

          echo "Failed to download Tranco list after $MAX_RETRIES attempts"
          exit 1

      - name: Extract Tranco list
        id: extract
        run: |
          # Unpack the archive (overwriting existing files) and confirm that
          # it contained the CSV the following steps read.
          if ! unzip -o tranco.zip; then
            echo "Failed to extract zip file"
            exit 1
          fi

          if [ ! -f "top-1m.csv" ]; then
            echo "Expected file 'top-1m.csv' not found in the zip archive"
            # List the workspace to show what the archive actually contained
            ls -la
            exit 1
          fi

          echo "Successfully extracted Tranco list"

      - name: Validate Tranco list
        id: validate_tranco_list
        run: |
          # Sanity-check the extracted CSV before it is used: it must have
          # exactly 1,000,000 lines, start at rank 1 and end at rank 1000000.
          total_lines=$(wc -l < top-1m.csv)
          first_row=$(head -n 1 top-1m.csv)
          last_row=$(tail -n 1 top-1m.csv)

          # Any single failed check rejects the file
          if ! [ "$total_lines" -eq 1000000 ] || [[ "$first_row" != 1,* ]] || [[ "$last_row" != 1000000,* ]]; then
            echo "File validation failed:"
            echo "- Line count: $total_lines (expected 1,000,000)"
            echo "- First line: $first_row (should start with '1,')"
            echo "- Last line: $last_row (should start with '1000000,')"
            exit 1
          fi

          echo "File validation passed:"
          echo "- Exactly 1,000,000 lines"
          echo "- First line: $first_row"
          echo "- Last line: $last_row"

          # The validated list is stored under the name used in the repository
          mv top-1m.csv tranco.csv

      - name: Remove Public Suffix List entries
        id: remove_psl_entries
        run: |
          # Drop every row of tranco.csv whose domain is itself an entry of
          # the Public Suffix List, then print a summary of what was removed.

          # The default Actions shell is `bash -e` without pipefail, which
          # would make the pipeline below report only the exit status of its
          # last stage (sed). With pipefail a curl failure is detected.
          set -o pipefail

          echo "Fetching Public Suffix List..."

          # Download and process the Public Suffix List with error handling
          # Remove comments, empty lines, wildcards, exceptions, and leading/trailing whitespace
          # -f makes curl fail on HTTP errors instead of passing the error
          # page on to be parsed as if it were the list.
          if ! curl -fsSL --retry 3 --retry-delay 5 \
            --connect-timeout 15 --max-time 120 \
            https://publicsuffix.org/list/public_suffix_list.dat | \
            grep -v '^//' | \
            grep -v '^$' | \
            grep -v '^\*' | \
            grep -v '^!' | \
            sed 's/^[ \t]*//;s/[ \t]*$//' > psl.txt; then
            echo "Error: Failed to download or process Public Suffix List"
            exit 1
          fi

          # Verify the PSL file has content
          if [ ! -s psl.txt ]; then
            echo "Error: Public Suffix List file is empty"
            exit 1
          fi

          PSL_COUNT=$(wc -l < psl.txt | tr -d ' ')
          echo "Loaded $PSL_COUNT public suffixes from PSL"

          echo ""
          echo "Removing PSL entries from tranco.csv..."

          # First, normalize line endings by removing all carriage returns
          # This handles both Unix (\n) and Windows (\r\n) line endings uniformly
          tr -d '\r' < tranco.csv > tranco_normalized.csv
          mv tranco_normalized.csv tranco.csv

          # Filter in a single awk pass: the PSL is loaded into an associative
          # array once, then each row costs one lookup.
          # Using exact string matching avoids regex escaping issues
          awk 'BEGIN {
            # Read all PSL entries into an associative array
            while ((getline line < "psl.txt") > 0) {
              if (line != "") {
                psl[line] = 1
              }
            }
            close("psl.txt")
            removed = 0
          }
          {
            # Extract domain from "rank,domain" format
            n = index($0, ",")
            if (n > 0) {
              domain = substr($0, n + 1)

              # Check if domain in PSL (exact string match)
              if (domain in psl) {
                removed++
                print "✓ Removed: " domain > "/dev/stderr"
              } else {
                # Keep this line
                print $0
              }
            } else {
              # Malformed line, keep it
              print $0
            }
          }
          END {
            # Write count to a separate file for easy extraction
            print removed > "removal_count.txt"
          }' tranco.csv > tranco_filtered.csv

          # Read the removal count
          TOTAL_REMOVED=$(cat removal_count.txt)

          # Replace original file with filtered version
          mv tranco_filtered.csv tranco.csv

          # Clean up
          rm -f removal_count.txt psl.txt

          # Report final statistics
          FINAL_COUNT=$(wc -l < tranco.csv | tr -d ' ')
          echo ""
          echo "=== Summary ==="
          echo "PSL entries checked: $PSL_COUNT"
          echo "PSL entries found in Tranco: $TOTAL_REMOVED"
          echo "Total domains removed: $TOTAL_REMOVED"
          echo "Final line count: $FINAL_COUNT"

      - name: Set configuration for top files
        id: set_config_top
        run: |
          # JSON array with one object per derived file: "count" is the number
          # of leading rows of tranco.csv it receives and "filename" is where
          # it is written. Exported as CONFIG through GITHUB_ENV so that the
          # following steps can read it.
          top_files='[{"count": 10000, "filename": "tranco_top_10k.csv"}, {"count": 50000, "filename": "tranco_top_50k.csv"}]'
          printf 'CONFIG=%s\n' "$top_files" >> "$GITHUB_ENV"
          printf 'Using configuration: %s\n' "$top_files"

      - name: Validate manifest.json
        id: validate_manifest
        run: |
          # Fail early unless every output file named in CONFIG appears in
          # manifest.json as a `"file": "<name>"` entry.

          # Check if manifest.json exists
          if [ ! -f "manifest.json" ]; then
            echo "Error: manifest.json file not found"
            exit 1
          fi

          # Collect the filenames first. As a plain assignment, a jq failure
          # (for example malformed CONFIG) aborts the step under `bash -e`
          # instead of silently validating nothing.
          FILENAMES=$(jq -r '.[].filename' <<< "$CONFIG")

          # The loop reads from a here-string, so it runs in the current shell
          # and can record failures in an ordinary variable.
          VALIDATION_FAILED=false
          while IFS= read -r filename; do
            # A here-string of an empty list still yields one empty line
            [ -n "$filename" ] || continue

            # -F matches the text literally, so the dot in a filename is not
            # treated as a regex wildcard.
            if grep -qF -- "\"file\": \"$filename\"" manifest.json; then
              echo "✓ $filename is defined in manifest.json"
            else
              echo "Error: $filename is not defined in manifest.json"
              VALIDATION_FAILED=true
            fi
          done <<< "$FILENAMES"

          # Exit if any file is not defined in manifest.json
          if [ "$VALIDATION_FAILED" = "true" ]; then
            echo "One or more output files are not defined in manifest.json. Please update manifest.json first."
            exit 1
          fi

      - name: Process Tranco CSV
        id: process
        run: |
          # Create each derived file described in CONFIG by copying the first
          # `count` rows of tranco.csv to `filename`.
          echo "Processing Tranco CSV with configuration: $CONFIG"

          # Check if tranco.csv exists
          if [ ! -f "tranco.csv" ]; then
            echo "Error: tranco.csv file not found"
            exit 1
          fi

          # Parse the JSON configuration and process each output
          echo "$CONFIG" | jq -c '.[]' | while read -r config; do
            # `// empty` turns a missing or null field into an empty string.
            # Plain `jq -r` would print the literal text "null", which the
            # emptiness check below can never catch.
            count=$(echo "$config" | jq -r '.count // empty')
            filename=$(echo "$config" | jq -r '.filename // empty')

            if [ -z "$count" ] || [ -z "$filename" ]; then
              echo "Skipping invalid configuration: $config"
              continue
            fi

            # Copy the first $count lines (fewer if tranco.csv is shorter)
            head -n "$count" tranco.csv > "$filename"

            lines=$(wc -l < "$filename")
            echo "Successfully created $filename with $lines rows"
          done

      - name: Configure Git
        run: |
          # Author identity for the commit created later in this job; written
          # to this checkout's own configuration only.
          git config --local user.name "Tranco Process Bot"
          git config --local user.email "hello@sublimesecurity.com"

      - name: Create and push branch
        id: create-branch
        env:
          STEPS_DATE_OUTPUTS_TODAY: ${{ steps.date.outputs.today }}
          STEPS_DATE_OUTPUTS_TIMESTAMP: ${{ steps.date.outputs.timestamp }}
          STEPS_TRANCO_ID_OUTPUTS_ID: ${{ steps.tranco-id.outputs.id }}
        run: |
          # Commit the refreshed list and its derived files on a new branch
          # and push it. The branch name is published as the step output
          # `branch_name`; the timestamp keeps it unique per run.
          new_branch="tranco_update-${STEPS_DATE_OUTPUTS_TODAY}-${STEPS_DATE_OUTPUTS_TIMESTAMP}"
          printf 'branch_name=%s\n' "$new_branch" >> "$GITHUB_OUTPUT"

          git checkout -b "$new_branch"
          git add tranco.csv

          # Stage every generated file listed in the configuration, one at a
          # time
          echo "$CONFIG" | jq -r '.[].filename' | while read -r generated_file; do
            git add "$generated_file"
          done

          git commit -m "Update Tranco list for ${STEPS_DATE_OUTPUTS_TODAY} (ID: ${STEPS_TRANCO_ID_OUTPUTS_ID})"
          git push origin "$new_branch"

      # Open a pull request from the branch pushed by the create-branch step
      # into main.
      - name: Create Pull Request
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          STEPS_DATE_OUTPUTS_TODAY: ${{ steps.date.outputs.today }}
          STEPS_TRANCO_ID_OUTPUTS_ID: ${{ steps.tranco-id.outputs.id }}
          # Shell variables do not survive from one step to the next, so the
          # branch name has to come from the create-branch step's output.
          BRANCH_NAME: ${{ steps.create-branch.outputs.branch_name }}
        run: |
          gh pr create \
            --title "Update Tranco list and derived files - ${STEPS_DATE_OUTPUTS_TODAY}" \
            --body "This PR updates the Tranco top 1 million domains list and all derived files.

            - Date: ${STEPS_DATE_OUTPUTS_TODAY}
            - Tranco List ID: ${STEPS_TRANCO_ID_OUTPUTS_ID}
            - List URL: https://tranco-list.eu/list/${STEPS_TRANCO_ID_OUTPUTS_ID}
            - Automated update via GitHub Actions" \
            --head "$BRANCH_NAME" \
            --base "main"