Skip to content

Commit 63af152

Browse files
committed
Merge remote-tracking branch 'origin/master' into v2_rewrite
# Conflicts: # .github/workflows/base.yml # poetry.lock # pyproject.toml
2 parents 14db64b + 6880041 commit 63af152

31 files changed

Lines changed: 4419 additions & 1120 deletions

.github/workflows/auto-approve.yml

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
# Approves a pull request once two gates hold for its current head commit:
#   1. a completed, successful run named '.github/workflows/base.yml', and
#   2. a latest github-actions[bot] review that carries the clean marker.
# Runs after either upstream workflow completes and re-checks both gates.
name: Auto-Approve Clean PRs

on:
  workflow_run:
    # NOTE(review): the first entry is a file path used as a workflow name;
    # this presumably relies on base.yml having no `name:` key — confirm.
    workflows: [".github/workflows/base.yml", "PyDeequ Bot"]
    types: [completed]

permissions:
  pull-requests: write
  actions: read

jobs:
  approve:
    runs-on: ubuntu-latest
    # Only react to runs that were themselves started by a PR event.
    if: github.event.workflow_run.event == 'pull_request' || github.event.workflow_run.event == 'pull_request_target'
    timeout-minutes: 2

    steps:
      - name: Find PR and check both conditions
        uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
        with:
          script: |
            const sha = context.payload.workflow_run.head_sha;
            const owner = context.repo.owner;
            const repo = context.repo.repo;
            const BOT_LOGIN = 'github-actions[bot]';

            // A review's author can be null (e.g. deleted account); never
            // dereference `user` without checking it first.
            const isBotReview = r => Boolean(r.user) && r.user.login === BOT_LOGIN;

            // Find the PR for this SHA
            let prNumber = null;
            const prs = context.payload.workflow_run.pull_requests;
            if (prs && prs.length > 0) {
              prNumber = prs[0].number;
            } else {
              // Payload carried no PR reference: fall back to matching the
              // head SHA against the most recently updated open PRs.
              const {data: searchResult} = await github.rest.pulls.list({
                owner, repo, state: 'open', sort: 'updated', direction: 'desc', per_page: 30
              });
              const match = searchResult.find(pr => pr.head.sha === sha);
              if (match) {
                prNumber = match.number;
              }
            }

            if (!prNumber) {
              core.info(`No open PR found for SHA ${sha}, skipping`);
              return;
            }

            core.info(`Found PR #${prNumber} for SHA ${sha}`);

            // Verify the PR head SHA still matches (no new push since trigger)
            const {data: pr} = await github.rest.pulls.get({
              owner, repo, pull_number: prNumber
            });
            if (pr.head.sha !== sha) {
              core.info(`PR head ${pr.head.sha} differs from trigger SHA ${sha} — new push arrived, skipping`);
              return;
            }

            // Condition 1: CI must have passed for this SHA
            const {data: workflowRuns} = await github.rest.actions.listWorkflowRunsForRepo({
              owner, repo, head_sha: sha, status: 'completed'
            });
            const ciRun = workflowRuns.workflow_runs.find(r =>
              r.name === '.github/workflows/base.yml' && r.conclusion === 'success'
            );
            if (!ciRun) {
              core.info(`CI has not passed for SHA ${sha}, skipping`);
              return;
            }

            // Condition 2: Bot must have posted a clean review for this SHA.
            // Fetch every page: a single unpaginated call returns only the
            // first page of reviews, so on a busy PR the newest bot review
            // (and any existing approval) would never be seen.
            const reviews = await github.paginate(github.rest.pulls.listReviews, {
              owner, repo, pull_number: prNumber, per_page: 100
            });

            const CLEAN_MARKER = '<!-- deequ-bot:clean -->';

            const latestBot = reviews
              .filter(isBotReview)
              .sort((a, b) => new Date(b.submitted_at) - new Date(a.submitted_at))[0];

            // Treat a missing review body as empty text rather than throwing.
            const latestBody = latestBot ? (latestBot.body || '') : '';
            if (!latestBot || !latestBody.includes(CLEAN_MARKER) || latestBot.commit_id !== sha) {
              core.info('Bot has not posted a clean review for this SHA, skipping');
              return;
            }

            // Both conditions met — check for existing approval to prevent doubles
            const botApprovals = reviews.filter(r =>
              isBotReview(r) && r.state === 'APPROVED'
            );
            if (botApprovals.length > 0) {
              core.info('Bot already approved this PR, skipping');
              return;
            }

            // Approve
            core.info(`Approving PR #${prNumber}: bot review clean + CI passed for SHA ${sha}`);
            await github.rest.pulls.createReview({
              owner, repo, pull_number: prNumber,
              event: 'APPROVE',
              body: `No issues found and CI is passing. Auto-approved.\n\n---\n*Generated by AI — human merge required.*`
            });

.github/workflows/base.yml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,17 +14,17 @@ jobs:
1414
runs-on: ubuntu-latest
1515

1616
steps:
17-
- uses: actions/checkout@v4
17+
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
1818

19-
- uses: actions/setup-python@v5
19+
- uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
2020
name: Install Python 3.12
2121
with:
2222
python-version: "3.12"
2323

24-
- uses: actions/setup-java@v4
24+
- uses: actions/setup-java@c5195efecf7bdfc987ee8bae7a71cb8b11521c00 # v4.7.1
2525
name: Setup Java 17
2626
with:
27-
distribution: "corretto"
27+
distribution: "temurin"
2828
java-version: "17"
2929

3030
- name: Download Spark 3.5

.github/workflows/issue-bot.yml

Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
1+
# Two-stage bot workflow.
#   analyze: runs `issue_bot.main analyze` and uploads its result file as the
#            `bot-result` artifact.
#   act:     downloads that artifact and runs `issue_bot.main act` with write
#            access to issues and pull requests.
name: PyDeequ Bot

on:
  issues:
    types:
      - opened
      - reopened
  # Runs base branch code with secrets; safe because bot fetches diff via API,
  # never executes PR code. NEVER add ref: to checkout.
  pull_request_target:
    types:
      - opened
      - reopened
      - synchronize
  issue_comment:
    types:
      - created
  workflow_dispatch:
    inputs:
      issue_number:
        description: 'Issue/PR number to process'
        required: true
      dry_run:
        description: 'Dry run (no writes)'
        type: boolean
        default: true

# One run at a time per issue/PR number, so the bot cannot post duplicate
# comments; queued runs wait instead of cancelling the one in flight.
concurrency:
  group: bot-${{ github.event.issue.number || github.event.pull_request.number || inputs.issue_number }}
  cancel-in-progress: false

jobs:
  analyze:
    runs-on: ubuntu-latest
    timeout-minutes: 10
    permissions:
      contents: read
      id-token: write
    # Manual dispatches always run. Otherwise skip events raised by the bot
    # itself, and skip issue events that belong to a pull request unless the
    # trigger is pull_request_target.
    if: >-
      (github.event_name == 'workflow_dispatch') ||
      (github.actor != 'github-actions[bot]' &&
      (github.event.issue.pull_request == null || github.event_name == 'pull_request_target'))

    steps:
      - name: Checkout repository
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          persist-credentials: false

      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
        with:
          aws-region: us-east-1
          role-to-assume: ${{ secrets.AWS_ROLE_ARN }}

      - name: Set up Python
        uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
        with:
          python-version: '3.12'

      - name: Install dependencies
        run: pip install requests==2.33.1 boto3==1.42.94

      - name: Run analysis
        working-directory: scripts
        run: python -m issue_bot.main analyze
        env:
          # GitHub context for the event being processed
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          GITHUB_REPOSITORY: ${{ github.repository }}
          GITHUB_ACTOR: ${{ github.actor }}
          ISSUE_NUMBER: ${{ github.event.issue.number || github.event.pull_request.number || inputs.issue_number }}
          EVENT_TYPE: ${{ github.event_name }}
          EVENT_ACTION: ${{ github.event.action }}
          EVENT_BEFORE: ${{ github.event.before }}
          EVENT_AFTER: ${{ github.event.pull_request.head.sha || github.event.after }}
          # Knowledge base location and model settings, all from secrets
          KB_S3_BUCKET: ${{ secrets.KB_S3_BUCKET }}
          KB_S3_KEY: ${{ secrets.KB_S3_KEY }}
          BEDROCK_MODEL_ID: ${{ secrets.BEDROCK_MODEL_ID }}
          GUARDRAIL_ID: ${{ secrets.GUARDRAIL_ID }}
          GUARDRAIL_VERSION: ${{ secrets.GUARDRAIL_VERSION }}
          # Prompt identifiers passed through to the bot
          SM_ISSUE_CLASSIFY_PROMPT: pydeequ-bot/issue-classify-prompt
          SM_ISSUE_RESPOND_PROMPT: pydeequ-bot/issue-respond-prompt
          SM_PR_FILE_REVIEW_PROMPT: pydeequ-bot/pr-file-review-prompt
          SM_FOLLOWUP_PROMPT: pydeequ-bot/followup-prompt
          # Source tree the bot inspects
          CODEBASE_SRC_DIR: pydeequ
          CODEBASE_FILE_EXT: .py
          # Resolves to 'false' whenever the dry_run input is unset or false
          DRY_RUN: ${{ inputs.dry_run || 'false' }}
          ARTIFACT_PATH: ${{ runner.temp }}/bot_result.json

      - name: Upload artifact
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
        with:
          name: bot-result
          path: ${{ runner.temp }}/bot_result.json
          retention-days: 30

  act:
    needs: analyze
    runs-on: ubuntu-latest
    timeout-minutes: 1
    permissions:
      contents: read
      issues: write
      pull-requests: write

    steps:
      - name: Checkout repository
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          persist-credentials: false

      - name: Set up Python
        uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
        with:
          python-version: '3.12'

      - name: Install dependencies
        run: pip install requests==2.33.1 boto3==1.42.94

      # Fetches the result file that the analyze job uploaded.
      - name: Download artifact
        uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
        with:
          name: bot-result
          path: ${{ runner.temp }}

      - name: Execute actions
        working-directory: scripts
        run: python -m issue_bot.main act
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          GITHUB_REPOSITORY: ${{ github.repository }}
          ISSUE_NUMBER: ${{ github.event.issue.number || github.event.pull_request.number || inputs.issue_number }}
          EVENT_TYPE: ${{ github.event_name }}
          EVENT_ACTION: ${{ github.event.action }}
          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
          DRY_RUN: ${{ inputs.dry_run || 'false' }}
          ARTIFACT_PATH: ${{ runner.temp }}/bot_result.json

.github/workflows/stale.yml

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
# Labels issues and PRs as stale after 60 days without activity and closes
# them 14 days later. Runs on a weekly schedule (Mondays, 09:00 per the cron
# expression) and can also be started manually.
name: Manage Stale Issues and PRs

on:
  workflow_dispatch:
  schedule:
    - cron: "0 9 * * MON"

permissions:
  issues: write
  pull-requests: write

jobs:
  stale:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/stale@28ca1036281a5e5922ead5184a1bbf96e5fc984e # v9.0.0
        with:
          # Timing and per-run budget
          days-before-stale: 60
          days-before-close: 14
          operations-per-run: 50

          # Labels applied when an item goes stale, and labels that exempt it
          stale-issue-label: "stale"
          stale-pr-label: "stale"
          exempt-issue-labels: "bug,enhancement,help-wanted"
          exempt-pr-labels: "help-wanted"

          # Messages posted when an item is marked stale
          stale-issue-message: >
            This issue has been inactive for 60 days.
            It will be closed in 14 days if there is no further activity.
            If this is still relevant, please comment to keep it open.
          stale-pr-message: >
            This PR has been inactive for 60 days.
            It will be closed in 14 days if there is no further activity.
            If you are still working on this,
            please push an update or comment to keep it open.

          # Messages posted when an item is closed
          close-issue-message: >
            Closed due to inactivity.
            Feel free to reopen if this is still relevant.
          close-pr-message: >
            Closed due to inactivity.
            Feel free to reopen if you'd like to continue this work.

.github/workflows/update-kb.yml

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
# Regenerates the knowledge base with scripts/generate_kb.py and uploads it to
# S3 on every push to master that touches files outside the ignored paths, or
# on manual dispatch.
name: Update Knowledge Base

on:
  push:
    branches: [master]
    paths-ignore:
      - '.github/workflows/**'
      - 'scripts/issue_bot/**'
      - 'tests/test_bot.py'
  workflow_dispatch:

jobs:
  update-kb:
    runs-on: ubuntu-latest
    permissions:
      contents: read
      id-token: write
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
      - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
        with:
          python-version: "3.12"
      - uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
        with:
          role-to-assume: ${{ secrets.AWS_ROLE_ARN }}
          aws-region: us-east-1
      - name: Generate and upload KB
        # Secrets reach the script as environment variables instead of being
        # spliced into the script text, so their contents are never parsed as
        # shell syntax.
        env:
          KB_S3_BUCKET: ${{ secrets.KB_S3_BUCKET }}
          KB_S3_KEY: ${{ secrets.KB_S3_KEY }}
        run: |
          # Fail with a clear message if either destination secret is unset.
          : "${KB_S3_BUCKET:?KB_S3_BUCKET secret is not set}"
          : "${KB_S3_KEY:?KB_S3_KEY secret is not set}"
          KB_URI="s3://${KB_S3_BUCKET}/${KB_S3_KEY}"

          python3 scripts/generate_kb.py > kb.md

          # Refuse to overwrite the published KB with a suspiciously small file.
          SIZE=$(wc -c < kb.md | tr -d ' ')
          if [ "$SIZE" -lt 10000 ]; then
            echo "ERROR: KB too small ($SIZE bytes), refusing to upload" >&2
            exit 1
          fi

          # Quoted so a bucket or key containing spaces stays one argument.
          aws s3 cp kb.md "$KB_URI" --quiet
          echo "Uploaded $SIZE bytes to $KB_URI"

docs/checks.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ Here are the current supported functionalities of Checks.
1515
| | areAnyComplete(columns) | Done |
1616
| | haveAnyCompleteness(columns, assertion) | Done |
1717
| | isUnique(column) | Done |
18-
| | isPrimaryKey(column, *columns) | Not Implemented |
18+
| | isPrimaryKey(column, *columns) | Done |
1919
| | hasUniqueness(columns, assertion) | Done |
2020
| | hasDistinctness(columns, assertion) | Done |
2121
| | hasUniqueValueRatio(columns, assertion) | Done |

docs/profiles.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,5 +20,7 @@ Here are the current supported functionalities of Profiles.
2020
| | useSparkSession | |
2121
| ColumnProfilesBuilder | ColumnProfilesBuilder(spark_session) | Done |
2222
| | property: profiles | Done |
23+
| | property: numRecords | Done |
2324
| StandardColumnProfile | StandardColumnProfile(spark_session, column, java_column_profile) | Done |
25+
| StringColumnProfile | StringColumnProfile(spark_session, column, java_column_profile) | Done |
2426
| NumericColumnProfile | NumericColumnProfile(spark_session, column, java_column_profile) | Done |

0 commit comments

Comments
 (0)