-
Notifications
You must be signed in to change notification settings - Fork 82
146 lines (140 loc) · 4.9 KB
/
Copy pathvalidation.yml
File metadata and controls
146 lines (140 loc) · 4.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
# Runs ord-schema's validate_dataset.py over every dataset under data/
# (the `.pb*` files and the `.parquet` files, each in its own job).
name: Validation

on:
  push:
    branches:
      - main
  pull_request:
    paths:
      # Runs when this file is modified in a PR.
      - '.github/workflows/validation.yml'

# One in-flight matrix per PR (or per push to main). Subsequent events
# cancel the prior run so we are not paying twice for the LFS-heavy
# validate_pb / validate_parquet shards.
concurrency:
  # PR events group by PR number; push events fall back to the ref name.
  group: validation-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

# The jobs only read repository contents (checkout + `git lfs pull`), so the
# GITHUB_TOKEN is limited to read-only access.
permissions:
  contents: read

env:
  # ord-schema ref that the "Checkout ord-schema" steps check out and install.
  ORD_SCHEMA_TAG: v0.6.3
jobs:
  # Validates the `.pb*` datasets. The work is split into nine shards keyed on
  # the two characters that follow `data/` in each path; the three ranges
  # [0-4], [5-9] and [a-f] in each position cover every lowercase hex pair.
  validate_pb:
    runs-on: ubuntu-latest
    strategy:
      # Let the remaining shards finish even if one of them fails.
      fail-fast: false
      matrix:
        # Each entry is used twice: as the --filter regex for
        # validate_dataset.py and as the prefix of the LFS --include glob.
        filter:
          - 'data/[0-4][0-4]'
          - 'data/[0-4][5-9]'
          - 'data/[0-4][a-f]'
          - 'data/[5-9][0-4]'
          - 'data/[5-9][5-9]'
          - 'data/[5-9][a-f]'
          - 'data/[a-f][0-4]'
          - 'data/[a-f][5-9]'
          - 'data/[a-f][a-f]'
    steps:
      - name: Checkout ord-data
        uses: actions/checkout@v4
        with:
          # Skip LFS here; the next step pulls only this shard's objects.
          lfs: false
      # .lfsconfig redirects clone/fetch LFS reads to the Hugging Face mirror to
      # save GitHub bandwidth, but CI reads from GitHub: on a push to main the
      # just-merged objects are not on HF yet, and pulling only this shard keeps
      # the transfer tiny instead of fetching the whole dataset in every job.
      # The checkout step already configured GitHub credentials for the pull.
      #
      # matrix.filter (e.g. data/[0-4][0-4]) is intentionally written to be valid
      # both as the validate_dataset.py regex below and as an LFS path glob, so it
      # doubles as the --include pattern here. (The parquet job needs a separate
      # lfs_include because its filters are regexes, one of them a negative
      # lookahead, and not globs.)
      - name: Fetch LFS shard from GitHub
        env:
          FILTER: ${{ matrix.filter }}
        run: |
          git config lfs.url "https://github.com/${GITHUB_REPOSITORY}.git/info/lfs"
          git lfs pull --include="${FILTER}/*.pb*"
      - name: Checkout ord-schema
        uses: actions/checkout@v4
        with:
          repository: Open-Reaction-Database/ord-schema
          ref: ${{ env.ORD_SCHEMA_TAG }}
          # Checked out inside the ord-data workspace, next to data/.
          path: ord-schema
      - uses: actions/setup-python@v5
        with:
          python-version: '3.11'
      - name: Install ord_schema
        run: |
          cd "${GITHUB_WORKSPACE}/ord-schema"
          python -m pip install --upgrade pip
          python -m pip install wheel
          python -m pip install .
      - name: Validate datasets
        env:
          FILTER: ${{ matrix.filter }}
        # --input globs every .pb* file; --filter narrows it to this shard.
        run: |
          cd "${GITHUB_WORKSPACE}"
          python ./ord-schema/ord_schema/scripts/validate_dataset.py \
            --input="data/*/*.pb*" \
            --filter="${FILTER}" \
            --n_jobs=4

  # Validates the `.parquet` datasets in two shards: the USPTO file on its own
  # and every other parquet file together.
  validate_parquet:
    runs-on: ubuntu-latest
    strategy:
      # Let the other shard finish even if one of them fails.
      fail-fast: false
      matrix:
        # Per shard: `filter` is the --filter regex for validate_dataset.py,
        # `lfs_include` / `lfs_exclude` are the globs passed to `git lfs pull`
        # (an empty lfs_exclude means no --exclude flag is passed).
        include:
          # Un-sharded USPTO grants parquet — single ~1.7M-reaction file.
          # Saturates the 4-CPU runner via row-group parallelism in
          # validate_dataset.py.
          - name: uspto
            filter: 'ord_dataset-1158e351757f315b93cbcbe7bc55f38e\.parquet$'
            lfs_include: 'data/*/ord_dataset-1158e351757f315b93cbcbe7bc55f38e.parquet'
            lfs_exclude: ''
          # Everything else (negative lookahead on the USPTO parquet id).
          - name: other
            filter: '^(?!.*ord_dataset-1158e351757f315b93cbcbe7bc55f38e).*\.parquet$'
            lfs_include: 'data/*/*.parquet'
            lfs_exclude: 'data/*/ord_dataset-1158e351757f315b93cbcbe7bc55f38e.parquet'
    steps:
      - name: Checkout ord-data
        uses: actions/checkout@v4
        with:
          # Skip LFS here; the next step pulls only this shard's objects.
          lfs: false
      # Same approach as the validate_pb job: read this shard's LFS objects from
      # GitHub rather than the Hugging Face mirror that .lfsconfig points
      # clones at.
      - name: Fetch LFS shard from GitHub
        env:
          LFS_INCLUDE: ${{ matrix.lfs_include }}
          LFS_EXCLUDE: ${{ matrix.lfs_exclude }}
        # Only pass --exclude when the shard defines a non-empty pattern.
        run: |
          git config lfs.url "https://github.com/${GITHUB_REPOSITORY}.git/info/lfs"
          if [[ -n "${LFS_EXCLUDE}" ]]; then
            git lfs pull --include="${LFS_INCLUDE}" --exclude="${LFS_EXCLUDE}"
          else
            git lfs pull --include="${LFS_INCLUDE}"
          fi
      - name: Checkout ord-schema
        uses: actions/checkout@v4
        with:
          repository: Open-Reaction-Database/ord-schema
          ref: ${{ env.ORD_SCHEMA_TAG }}
          # Checked out inside the ord-data workspace, next to data/.
          path: ord-schema
      - uses: actions/setup-python@v5
        with:
          python-version: '3.11'
      - name: Install ord_schema
        run: |
          cd "${GITHUB_WORKSPACE}/ord-schema"
          python -m pip install --upgrade pip
          python -m pip install wheel
          python -m pip install .
      - name: Validate parquet datasets
        env:
          FILTER: ${{ matrix.filter }}
        # --input globs every parquet file; --filter narrows it to this shard.
        run: |
          cd "${GITHUB_WORKSPACE}"
          python ./ord-schema/ord_schema/scripts/validate_dataset.py \
            --input="data/*/*.parquet" \
            --filter="${FILTER}" \
            --n_jobs=4