Skip to content

Commit 64c2188

Browse files
authored
Merge branch 'awslabs:master' into unfail-anomaly-detection-tests
2 parents 17c5fa1 + f8d2b8d commit 64c2188

25 files changed

Lines changed: 1919 additions & 50 deletions

.github/workflows/base.yml

Lines changed: 21 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -12,21 +12,31 @@ jobs:
1212
strategy:
1313
fail-fast: false
1414
matrix:
15-
PYSPARK_VERSION: ["3.1.3", "3.2", "3.3", "3.5"]
15+
include:
16+
- PYSPARK_VERSION: "3.1.3"
17+
PYTHON_VERSION: "3.9"
18+
JAVA_VERSION: "11"
19+
- PYSPARK_VERSION: "3.2"
20+
PYTHON_VERSION: "3.9"
21+
JAVA_VERSION: "11"
22+
- PYSPARK_VERSION: "3.3"
23+
PYTHON_VERSION: "3.9"
24+
JAVA_VERSION: "11"
25+
- PYSPARK_VERSION: "3.5"
26+
PYTHON_VERSION: "3.9"
27+
JAVA_VERSION: "17"
1628

1729
steps:
18-
- uses: actions/checkout@v3
30+
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
1931

20-
- uses: actions/setup-python@v2
21-
name: Install Python 3.8
32+
- uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
2233
with:
23-
python-version: 3.8
34+
python-version: ${{matrix.PYTHON_VERSION}}
2435

25-
- uses: actions/setup-java@v1
26-
name: Setup Java 11
27-
if: startsWith(matrix.PYSPARK_VERSION, '3')
36+
- uses: actions/setup-java@c5195efecf7bdfc987ee8bae7a71cb8b11521c00 # v4.7.1
2837
with:
29-
java-version: "11"
38+
java-version: ${{matrix.JAVA_VERSION}}
39+
distribution: "temurin"
3040

3141
- name: Running tests with pyspark==${{matrix.PYSPARK_VERSION}}
3242
env:
@@ -35,5 +45,5 @@ jobs:
3545
pip install --upgrade pip
3646
pip install poetry==1.7.1
3747
poetry install
38-
poetry add pyspark==$SPARK_VERSION
39-
poetry run python -m pytest -s tests
48+
poetry run pip install pyspark==$SPARK_VERSION
49+
poetry run python -m pytest -s tests --ignore=tests/test_bot.py

.github/workflows/issue-bot.yml

Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
# PyDeequ Bot: analyses new issues, PRs and issue comments, then acts on the
# result. The work is split into two jobs so that no single job holds both
# the AWS/prompt secrets and write access to issues and pull requests:
#   analyze - read-only repo access plus an OIDC token for the AWS role
#   act     - issues/pull-requests write access, no AWS credentials
name: PyDeequ Bot

on:
  issues:
    types: [opened, reopened]
  # Runs base branch code with secrets; safe because bot fetches diff via API,
  # never executes PR code. NEVER add ref: to checkout.
  pull_request_target:
    types: [opened, reopened, synchronize]
  issue_comment:
    types: [created]
  workflow_dispatch:
    inputs:
      issue_number:
        description: "Issue/PR number to process"
        required: true
      dry_run:
        description: "Dry run (no writes)"
        type: boolean
        default: true

# Serialize per issue/PR to prevent duplicate comments
concurrency:
  group: bot-${{ github.event.issue.number || github.event.pull_request.number || inputs.issue_number }}
  cancel-in-progress: false

jobs:
  analyze:
    runs-on: ubuntu-latest
    timeout-minutes: 10
    # Manual dispatches always run. Otherwise skip events raised by
    # github-actions[bot], and skip issue/issue_comment events whose issue is
    # really a pull request; PRs are handled only through pull_request_target.
    if: >-
      (github.event_name == 'workflow_dispatch') ||
      (github.actor != 'github-actions[bot]' &&
      (github.event.issue.pull_request == null || github.event_name == 'pull_request_target'))
    permissions:
      contents: read
      # Lets configure-aws-credentials request an OIDC token for role-to-assume.
      id-token: write

    steps:
      - name: Checkout repository
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          persist-credentials: false

      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
        with:
          role-to-assume: ${{ secrets.AWS_ROLE_ARN }}
          aws-region: us-east-1

      - name: Set up Python
        uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
        with:
          python-version: "3.12"

      - name: Install dependencies
        run: pip install requests==2.33.1 boto3==1.42.94

      - name: Run analysis
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          GITHUB_REPOSITORY: ${{ github.repository }}
          ISSUE_NUMBER: ${{ github.event.issue.number || github.event.pull_request.number || inputs.issue_number }}
          EVENT_TYPE: ${{ github.event_name }}
          EVENT_ACTION: ${{ github.event.action }}
          GITHUB_ACTOR: ${{ github.actor }}
          KB_S3_BUCKET: ${{ secrets.KB_S3_BUCKET }}
          KB_S3_KEY: ${{ secrets.KB_S3_KEY }}
          BEDROCK_MODEL_ID: ${{ secrets.BEDROCK_MODEL_ID }}
          GUARDRAIL_ID: ${{ secrets.GUARDRAIL_ID }}
          GUARDRAIL_VERSION: ${{ secrets.GUARDRAIL_VERSION }}
          ISSUE_CLASSIFY_PROMPT: ${{ secrets.ISSUE_CLASSIFY_PROMPT }}
          ISSUE_RESPOND_PROMPT: ${{ secrets.ISSUE_RESPOND_PROMPT }}
          PR_FILE_REVIEW_PROMPT: ${{ secrets.PR_FILE_REVIEW_PROMPT }}
          FOLLOWUP_PROMPT: ${{ secrets.FOLLOWUP_PROMPT }}
          # dry_run is only set on workflow_dispatch; every other event
          # falls back to 'false'.
          DRY_RUN: ${{ inputs.dry_run || 'false' }}
          ARTIFACT_PATH: ${{ runner.temp }}/bot_result.json
        run: python -m issue_bot.main analyze
        working-directory: scripts

      - name: Upload artifact
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
        with:
          name: bot-result
          path: ${{ runner.temp }}/bot_result.json
          # Fail here, next to the step that should have written the file,
          # instead of later in the act job's download step.
          if-no-files-found: error
          retention-days: 30

  act:
    runs-on: ubuntu-latest
    timeout-minutes: 1
    needs: analyze
    permissions:
      contents: read
      issues: write
      pull-requests: write

    steps:
      - name: Checkout repository
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          persist-credentials: false

      - name: Set up Python
        uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
        with:
          python-version: "3.12"

      - name: Install dependencies
        run: pip install requests==2.33.1 boto3==1.42.94

      - name: Download artifact
        uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
        with:
          name: bot-result
          path: ${{ runner.temp }}

      - name: Execute actions
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          GITHUB_REPOSITORY: ${{ github.repository }}
          ISSUE_NUMBER: ${{ github.event.issue.number || github.event.pull_request.number || inputs.issue_number }}
          EVENT_TYPE: ${{ github.event_name }}
          EVENT_ACTION: ${{ github.event.action }}
          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
          DRY_RUN: ${{ inputs.dry_run || 'false' }}
          ARTIFACT_PATH: ${{ runner.temp }}/bot_result.json
        run: python -m issue_bot.main act
        working-directory: scripts

.github/workflows/update-kb.yml

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
# Regenerates the bot's knowledge base from the repository and uploads it to
# S3 on every push to master (bot/workflow-only changes excluded) or on demand.
name: Update Knowledge Base

on:
  push:
    branches: [master]
    paths-ignore:
      - '.github/workflows/**'
      - 'scripts/issue_bot/**'
      - 'tests/test_bot.py'
  workflow_dispatch:

jobs:
  update-kb:
    runs-on: ubuntu-latest
    permissions:
      contents: read
      # Lets configure-aws-credentials request an OIDC token for role-to-assume.
      id-token: write
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          # This job never pushes, so do not leave the token in .git/config.
          persist-credentials: false
      - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
        with:
          python-version: "3.12"
      - uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
        with:
          role-to-assume: ${{ secrets.AWS_ROLE_ARN }}
          aws-region: us-east-1
      - name: Generate and upload KB
        # Pass secrets through the environment rather than expanding them into
        # the script text, so their values are never parsed as shell syntax.
        env:
          KB_S3_BUCKET: ${{ secrets.KB_S3_BUCKET }}
          KB_S3_KEY: ${{ secrets.KB_S3_KEY }}
        run: |
          python3 scripts/generate_kb.py > kb.md
          SIZE=$(wc -c < kb.md | tr -d ' ')
          # Guard against overwriting a good KB with a near-empty one.
          if [ "$SIZE" -lt 10000 ]; then
            echo "ERROR: KB too small ($SIZE bytes), refusing to upload" >&2
            exit 1
          fi
          aws s3 cp kb.md "s3://${KB_S3_BUCKET}/${KB_S3_KEY}" --quiet
          echo "Uploaded $SIZE bytes to s3://${KB_S3_BUCKET}/${KB_S3_KEY}"

docs/profiles.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,5 +20,6 @@ Here are the current supported functionalities of Profiles.
2020
| | useSparkSession | |
2121
| ColumnProfilesBuilder | ColumnProfilesBuilder(spark_session) | Done |
2222
| | property: profiles | Done |
23+
| | property: numRecords | Done |
2324
| StandardColumnProfile | StandardColumnProfile(spark_session, column, java_column_profile) | Done |
2425
| NumericColumnProfile | NumericColumnProfile(spark_session, column, java_column_profile) | Done |

poetry.lock

Lines changed: 8 additions & 8 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pydeequ/profiles.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77
from pydeequ.analyzers import KLLParameters
88
from pydeequ.metrics import BucketDistribution
99
from pydeequ.pandas_utils import ensure_pyspark_df
10-
from enum import Enum
1110
from pydeequ.scala_utils import (
1211
get_or_else_none,
1312
java_list_to_python_list,
@@ -239,6 +238,7 @@ def __init__(self, spark_session: SparkSession):
239238
self._sc = spark_session.sparkContext
240239
self._jvm = spark_session._jvm
241240
self._profiles = []
241+
self._numRecords = 0
242242
self.columnProfileClasses = {
243243
"StandardColumnProfile": StandardColumnProfile,
244244
"StringColumnProfile": StandardColumnProfile,
@@ -251,11 +251,12 @@ def _columnProfilesFromColumnRunBuilderRun(self, run):
251251
Produces a Java profile based on the designated column
252252
253253
:param run: columnProfilerRunner result
254-
:return: a setter for columnProfilerRunner result
254+
:return self: a setter for columnProfilerRunner result
255255
"""
256256
self._run_result = run
257257
profile_map = self._jvm.scala.collection.JavaConversions.mapAsJavaMap(run.profiles()) # TODO from ScalaUtils
258258
self._profiles = {column: self._columnProfileBuilder(column, profile_map[column]) for column in profile_map}
259+
self._numRecords = run.numRecords()
259260
return self
260261

261262
@property
@@ -267,6 +268,15 @@ def profiles(self):
267268
"""
268269
return self._profiles
269270

271+
@property
def numRecords(self) -> int:
    """
    Record count stored on this builder.

    The value is 0 until a profiler run result has been set, after which it
    holds whatever that run reported.

    :return int: the stored number of records
    """
    record_count = self._numRecords
    return record_count
279+
270280
def _columnProfileBuilder(self, column, java_column_profile):
271281
"""Factory function for ColumnProfile
272282
Returns a Java profile based on the designated column

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ classifiers = [
3131
python = ">=3.8,<4"
3232
numpy = ">=1.14.1"
3333
pandas = ">=0.23.0"
34-
pyspark = { version = ">=2.4.7, <3.3.0", optional = true }
34+
pyspark = { version = ">=2.4.7,<4.0.0", optional = true }
3535

3636
[tool.poetry.dev-dependencies]
3737
pytest = "^6.2.4"

scripts/generate_kb.py

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
#!/usr/bin/env python3
2+
"""Generate knowledge base for the PyDeequ bot from repository source.
3+
4+
Usage (from repo root):
5+
python3 scripts/generate_kb.py > kb.md
6+
"""
7+
8+
import os
9+
from pathlib import Path
10+
11+
REPO_ROOT = Path(".")
12+
SRC_DIR = REPO_ROOT / "pydeequ"
13+
TESTS_DIR = REPO_ROOT / "tests"
14+
README = REPO_ROOT / "README.md"
15+
PYPROJECT = REPO_ROOT / "pyproject.toml"
16+
17+
MAX_FILE_CHARS = 8000
18+
MAX_TOTAL_CHARS = 500000
19+
20+
21+
def read_safe(path, max_chars=None):
    """Return the text of ``path``, or an empty string if it cannot be read.

    The file is decoded as UTF-8 regardless of the platform locale, and
    undecodable bytes are replaced rather than raising, so output is the same
    on every machine.

    :param path: ``pathlib.Path`` of the file to read
    :param max_chars: optional character limit; when set (and non-zero) and
        the text is longer, the text is cut to ``max_chars`` characters and a
        ``"\\n... (truncated)"`` marker is appended
    :return: the (possibly truncated) text, or ``""`` on an I/O error
    """
    try:
        text = path.read_text(encoding="utf-8", errors="replace")
    except OSError:
        # Best effort: a missing or unreadable file contributes nothing.
        # Only I/O failures are swallowed so programming errors still surface.
        return ""
    if max_chars and len(text) > max_chars:
        text = text[:max_chars] + "\n... (truncated)"
    return text
29+
30+
31+
def main():
    """Assemble the knowledge base as Markdown and print it to stdout.

    Sections are emitted in this order, each only if its source exists:
    the README, ``pyproject.toml``, every ``*.py`` file under ``SRC_DIR``
    (each capped at ``MAX_FILE_CHARS``), and a name/line-count listing of
    every ``*.py`` file under ``TESTS_DIR``.

    ``total`` is a running, approximate size: it counts the content that was
    added but not section headers or the separators used when joining. The
    budget is checked before each file is added, so the output may exceed
    ``MAX_TOTAL_CHARS`` by at most one section.
    """
    parts = []
    total = 0

    # README
    if README.exists():
        content = read_safe(README, MAX_FILE_CHARS)
        parts.append(f"# PyDeequ Knowledge Base\n\n## README\n\n{content}")
        total += len(content)

    # pyproject.toml
    if PYPROJECT.exists():
        content = read_safe(PYPROJECT, 3000)
        parts.append(f"## Build Configuration (pyproject.toml)\n\n```toml\n{content}\n```")
        total += len(content)

    # Source files
    if SRC_DIR.exists():
        parts.append("## Source Code Reference\n")
        for py_file in sorted(SRC_DIR.rglob("*.py")):
            if total >= MAX_TOTAL_CHARS:
                parts.append("\n... (KB size limit reached)")
                break
            rel = py_file.relative_to(REPO_ROOT)
            content = read_safe(py_file, MAX_FILE_CHARS)
            # Skip empty files (e.g. bare __init__.py) to save budget.
            if content.strip():
                section = f"### `{rel}`\n\n```python\n{content}\n```\n"
                parts.append(section)
                total += len(section)

    # Test file listing (names only)
    if TESTS_DIR.exists():
        parts.append("## Test Files\n")
        for test_file in sorted(TESTS_DIR.rglob("*.py")):
            if total >= MAX_TOTAL_CHARS:
                break
            rel = test_file.relative_to(REPO_ROOT)
            # Read through read_safe so one unreadable test file cannot abort
            # the whole run; such a file is listed with 0 lines.
            lines = len(read_safe(test_file).splitlines())
            entry = f"- `{rel}` ({lines} lines)"
            parts.append(entry)
            # Count the entry so the size check above can actually trip.
            total += len(entry)

    print("\n\n".join(parts))
72+
73+
74+
if __name__ == "__main__":
75+
main()

scripts/issue_bot/.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
__pycache__/

scripts/issue_bot/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+

0 commit comments

Comments
 (0)