# .github/workflows/validation.yml (open-reaction-database/ord-data)
# Runs ord-schema's validate_dataset.py on all datasets in the database.

name: Validation

# Triggers: every push to main, plus pull requests that touch this workflow
# file itself.
on:
  push:
    branches: [main]
  pull_request:
    paths:
      # Runs when this file is modified in a PR.
      - ".github/workflows/validation.yml"

# At most one validation matrix is in flight per pull request (or per ref for
# pushes to main). A newer event cancels the run already in progress, so the
# LFS-heavy validate_pb / validate_parquet shards are not paid for twice.
concurrency:
  cancel-in-progress: true
  group: "validation-${{ github.event.pull_request.number || github.ref }}"

# Git ref (tag) of ord-schema that the jobs check out and install.
env:
  ORD_SCHEMA_TAG: 'v0.6.3'

jobs:
  # Validates every data/*/*.pb* dataset. The work is split into nine shards
  # keyed on the two characters (0-9, a-f) that follow data/ in the path, so
  # each job downloads and checks only its own slice.
  validate_pb:
    runs-on: ubuntu-latest
    # Least privilege: the steps below only read repository contents (checkout
    # plus the LFS download), so do not depend on the repository's default
    # GITHUB_TOKEN permissions.
    permissions:
      contents: read
    strategy:
      # Let every shard report its own result rather than cancelling the rest
      # of the matrix on the first failure.
      fail-fast: false
      matrix:
        filter:
          - 'data/[0-4][0-4]'
          - 'data/[0-4][5-9]'
          - 'data/[0-4][a-f]'
          - 'data/[5-9][0-4]'
          - 'data/[5-9][5-9]'
          - 'data/[5-9][a-f]'
          - 'data/[a-f][0-4]'
          - 'data/[a-f][5-9]'
          - 'data/[a-f][a-f]'
    steps:
      - name: Checkout ord-data
        uses: actions/checkout@v4
        with:
          # LFS objects are pulled per shard in the next step instead.
          lfs: false
      # .lfsconfig redirects clone/fetch LFS reads to the Hugging Face mirror
      # to save GitHub bandwidth, but CI reads from GitHub: on a push to main
      # the just-merged objects are not on HF yet, and pulling only this shard
      # keeps the transfer tiny instead of fetching the whole dataset in every
      # job. The checkout step already configured GitHub credentials for the
      # pull.
      #
      # matrix.filter (e.g. data/[0-4][0-4]) is intentionally written to be
      # valid both as the validate_dataset.py regex below and as an LFS path
      # glob, so it doubles as the --include pattern here. (The parquet job
      # needs a separate lfs_include because its filter is a lookahead regex,
      # not a glob.)
      - name: Fetch LFS shard from GitHub
        env:
          FILTER: ${{ matrix.filter }}
        run: |
          git config lfs.url "https://github.com/${GITHUB_REPOSITORY}.git/info/lfs"
          git lfs pull --include="${FILTER}/*.pb*"
      # The validator lives in ord-schema; check it out next to the data at
      # the ref named by the workflow-level ORD_SCHEMA_TAG.
      - name: Checkout ord-schema
        uses: actions/checkout@v4
        with:
          repository: Open-Reaction-Database/ord-schema
          ref: ${{ env.ORD_SCHEMA_TAG }}
          path: ord-schema
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'
      - name: Install ord_schema
        run: |
          cd "${GITHUB_WORKSPACE}/ord-schema"
          python -m pip install --upgrade pip
          python -m pip install wheel
          python -m pip install .
      # --input globs every .pb* dataset; --filter narrows it to this shard.
      - name: Validate datasets
        env:
          FILTER: ${{ matrix.filter }}
        run: |
          cd "${GITHUB_WORKSPACE}"
          python ./ord-schema/ord_schema/scripts/validate_dataset.py \
            --input="data/*/*.pb*" \
            --filter="${FILTER}" \
            --n_jobs=4

  # Validates the data/*/*.parquet datasets in two shards: the single large
  # USPTO grants file on its own, and every other parquet file together.
  validate_parquet:
    runs-on: ubuntu-latest
    # Least privilege: the steps below only read repository contents (checkout
    # plus the LFS download), so do not depend on the repository's default
    # GITHUB_TOKEN permissions.
    permissions:
      contents: read
    strategy:
      # Let each shard report its own result rather than cancelling the other
      # on the first failure.
      fail-fast: false
      matrix:
        # Per shard: `filter` is the validate_dataset.py regex, while
        # `lfs_include` / `lfs_exclude` are the matching git-lfs path globs
        # (an empty lfs_exclude means "no exclusion").
        include:
          # Un-sharded USPTO grants parquet — single ~1.7M-reaction file.
          # Saturates the 4-CPU runner via row-group parallelism in
          # validate_dataset.py.
          - name: uspto
            filter: 'ord_dataset-1158e351757f315b93cbcbe7bc55f38e\.parquet$'
            lfs_include: 'data/*/ord_dataset-1158e351757f315b93cbcbe7bc55f38e.parquet'
            lfs_exclude: ''
          # Everything else (negative lookahead on the USPTO parquet id).
          - name: other
            filter: '^(?!.*ord_dataset-1158e351757f315b93cbcbe7bc55f38e).*\.parquet$'
            lfs_include: 'data/*/*.parquet'
            lfs_exclude: 'data/*/ord_dataset-1158e351757f315b93cbcbe7bc55f38e.parquet'
    steps:
      - name: Checkout ord-data
        uses: actions/checkout@v4
        with:
          # LFS objects are pulled per shard in the next step instead.
          lfs: false
      # See validate_pb: read this shard's LFS objects from GitHub rather than
      # the Hugging Face mirror that .lfsconfig points clones at.
      - name: Fetch LFS shard from GitHub
        env:
          LFS_INCLUDE: ${{ matrix.lfs_include }}
          LFS_EXCLUDE: ${{ matrix.lfs_exclude }}
        run: |
          git config lfs.url "https://github.com/${GITHUB_REPOSITORY}.git/info/lfs"
          # Build the argument list once; --exclude is only passed when the
          # shard defines a non-empty exclusion glob.
          lfs_args=(--include="${LFS_INCLUDE}")
          if [[ -n "${LFS_EXCLUDE}" ]]; then
            lfs_args+=(--exclude="${LFS_EXCLUDE}")
          fi
          git lfs pull "${lfs_args[@]}"
      # The validator lives in ord-schema; check it out next to the data at
      # the ref named by the workflow-level ORD_SCHEMA_TAG.
      - name: Checkout ord-schema
        uses: actions/checkout@v4
        with:
          repository: Open-Reaction-Database/ord-schema
          ref: ${{ env.ORD_SCHEMA_TAG }}
          path: ord-schema
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'
      - name: Install ord_schema
        run: |
          cd "${GITHUB_WORKSPACE}/ord-schema"
          python -m pip install --upgrade pip
          python -m pip install wheel
          python -m pip install .
      # --input globs every parquet dataset; --filter narrows it to this shard.
      - name: Validate parquet datasets
        env:
          FILTER: ${{ matrix.filter }}
        run: |
          cd "${GITHUB_WORKSPACE}"
          python ./ord-schema/ord_schema/scripts/validate_dataset.py \
            --input="data/*/*.parquet" \
            --filter="${FILTER}" \
            --n_jobs=4