Skip to content

Commit 8e21a44

Browse files
committed
Changed name of allow list for better compatibility with documentation.
Added better error messaging. Finalized example notebook. Updated corresponding tests. --- X-AI-Prompt: Refactor the allos list naming, and update testing to ensure it continues to work X-AI-Tool: Kiro CLI Sisyphus
1 parent 27e1d91 commit 8e21a44

File tree

4 files changed

+74
-77
lines changed

4 files changed

+74
-77
lines changed

sagemaker-mlops/src/sagemaker/mlops/feature_store/feature_group_manager.py

Lines changed: 17 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
import json
66
import logging
7+
from collections import Counter
78
from typing import Dict, List, Optional
89

910
from pydantic import model_validator
@@ -29,7 +30,7 @@
2930
from pyiceberg.catalog import load_catalog
3031
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type
3132
from sagemaker.mlops.feature_store.feature_utils import (
32-
_APPROVED_ICEBERG_PROPERTIES,
33+
_ALLOWED_ICEBERG_PROPERTIES,
3334
_ICEBERG_PERMISSIONS_ERROR_MESSAGE,
3435
)
3536

@@ -57,16 +58,18 @@ class IcebergProperties(Base):
5758
def validate_property_keys(self):
5859
if self.properties is None:
5960
return self
60-
invalid_keys = set(self.properties.keys()) - _APPROVED_ICEBERG_PROPERTIES
61+
invalid_keys = set(self.properties.keys()) - _ALLOWED_ICEBERG_PROPERTIES
6162
if invalid_keys:
6263
raise ValueError(
6364
f"Invalid iceberg properties: {invalid_keys}. "
64-
f"Approved properties are: {_APPROVED_ICEBERG_PROPERTIES}"
65+
f"Allowed properties are: {_ALLOWED_ICEBERG_PROPERTIES}"
6566
)
6667
# Check for no duplicate keys
67-
if len(set(self.properties.keys())) != len(self.properties.keys()):
68+
keys = list(self.properties.keys())
69+
duplicates = {k for k, count in Counter(keys).items() if count > 1}
70+
if duplicates:
6871
raise ValueError(
69-
f"Invalid duplicate properties. Please only have 1 of each property."
72+
f"Invalid duplicate properties: {duplicates}. Please only have 1 of each property."
7073
)
7174
return self
7275

@@ -232,17 +235,19 @@ def _update_iceberg_properties(
232235
"iceberg_properties must contain at least one property to update"
233236
)
234237

235-
invalid_keys = set(iceberg_properties.properties.keys()) - _APPROVED_ICEBERG_PROPERTIES
238+
invalid_keys = set(iceberg_properties.properties.keys()) - _ALLOWED_ICEBERG_PROPERTIES
236239
if invalid_keys:
237240
raise ValueError(
238241
f"Invalid iceberg properties: {invalid_keys}. "
239-
f"Approved properties are: {_APPROVED_ICEBERG_PROPERTIES}"
242+
f"Allowed properties are: {_ALLOWED_ICEBERG_PROPERTIES}"
240243
)
241244

242245
# Check for no duplicate keys
243-
if len(set(iceberg_properties.properties.keys())) != len(iceberg_properties.properties.keys()):
246+
keys = list(iceberg_properties.properties.keys())
247+
duplicates = {k for k, count in Counter(keys).items() if count > 1}
248+
if duplicates:
244249
raise ValueError(
245-
f"Invalid duplicate properties. Please only have 1 of each property."
250+
f"Invalid duplicate properties: {duplicates}. Please only have 1 of each property."
246251
)
247252

248253
result = self._get_iceberg_properties(session=session, region=region)
@@ -331,12 +336,12 @@ def get(
331336
if include_iceberg_properties:
332337
result = feature_group._get_iceberg_properties(session=session, region=region)
333338
all_properties = result["properties"]
334-
approved_properties = {
339+
allowed_properties = {
335340
k: v for k, v in all_properties.items()
336-
if k in _APPROVED_ICEBERG_PROPERTIES
341+
if k in _ALLOWED_ICEBERG_PROPERTIES
337342
}
338343
feature_group.iceberg_properties = IcebergProperties(
339-
properties=approved_properties
344+
properties=allowed_properties
340345
)
341346

342347
return feature_group

sagemaker-mlops/src/sagemaker/mlops/feature_store/feature_utils.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@
4949
_INTEGER_TYPES = {"int_", "int8", "int16", "int32", "int64", "uint8", "uint16", "uint32", "uint64"}
5050
_FLOAT_TYPES = {"float_", "float16", "float32", "float64"}
5151

52-
_APPROVED_ICEBERG_PROPERTIES = {
52+
_ALLOWED_ICEBERG_PROPERTIES = {
5353
"write.metadata.delete-after-commit.enabled",
5454
"write.metadata.previous-versions-max",
5555
"history.expire.max-snapshot-age-ms",
@@ -66,13 +66,13 @@
6666
"write.target-file-size-bytes"
6767
}
6868

69-
_ICEBERG_PERMISSIONS_ERROR_MESSAGE = (
70-
"If this feature group uses Lake Formation governance, ensure you have "
71-
"SELECT, DESCRIBE, and ALTER permissions on the table in Lake Formation, "
72-
"in addition to IAM permissions.\n"
73-
"If this feature group uses IAM governance, ensure your role has "
74-
"glue:GetTable and glue:UpdateTable permissions on the feature group's Glue table."
75-
)
69+
_ICEBERG_PERMISSIONS_ERROR_MESSAGE = (
70+
"If this feature group uses Lake Formation governance, ensure you have "
71+
"SELECT, DESCRIBE, and ALTER permissions on the table in Lake Formation, "
72+
"in addition to IAM permissions.\n"
73+
"If this feature group uses IAM governance, ensure your role has "
74+
"glue:GetTable and glue:UpdateTable permissions on the feature group's Glue table."
75+
)
7676

7777
def _get_athena_client(session: Session):
7878
"""Get Athena client from session."""

sagemaker-mlops/tests/unit/sagemaker/mlops/feature_store/test_iceberg_properties.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,8 @@ def test_properties_can_be_set(self):
2222
config = IcebergProperties(properties=props)
2323
assert config.properties == props
2424

25-
def test_valid_approved_keys_accepted(self):
26-
"""Test that all approved keys are accepted."""
25+
def test_valid_allowed_keys_accepted(self):
26+
"""Test that all allowed keys are accepted."""
2727
props = {
2828
"write.target-file-size-bytes": "536870912",
2929
"write.metadata.delete-after-commit.enabled": "true",
@@ -64,11 +64,11 @@ def test_duplicate_keys_raises_error(self):
6464
"write.target-file-size-bytes",
6565
]
6666
object.__setattr__(config, "properties", mock_props)
67-
with pytest.raises(ValueError, match="Invalid duplicate properties"):
67+
with pytest.raises(ValueError, match="Invalid duplicate properties:.*write.target-file-size-bytes"):
6868
config.validate_property_keys()
6969

7070
def test_no_duplicate_keys_passes(self):
71-
"""Test that unique approved keys pass duplicate validation."""
71+
"""Test that unique allowed keys pass duplicate validation."""
7272
config = IcebergProperties(properties={"write.target-file-size-bytes": "536870912"})
7373
result = config.validate_property_keys()
7474
assert result is config
@@ -346,7 +346,7 @@ def test_raises_error_on_duplicate_keys(self):
346346
mock_props.__bool__ = lambda self: True
347347
object.__setattr__(props, "properties", mock_props)
348348

349-
with pytest.raises(ValueError, match="Invalid duplicate properties"):
349+
with pytest.raises(ValueError, match="Invalid duplicate properties:.*write.target-file-size-bytes"):
350350
self.fg._update_iceberg_properties(iceberg_properties=props)
351351

352352
def test_logs_before_after_property_changes(self, caplog):

v3-examples/ml-ops-examples/v3-feature-store-iceberg-properties-example.ipynb

Lines changed: 44 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,9 @@
1919
"\n",
2020
"- An AWS account with SageMaker Feature Store access\n",
2121
"- An S3 bucket for offline store data\n",
22-
"- IAM role with permissions for SageMaker, Glue, and S3\n",
22+
"- IAM role with AmazonSageMakerFeatureStoreAccess and AmazonSageMakerFullAccess policies\n",
2323
"- `pyiceberg[glue]>=0.8.0` installed\n",
24-
"- `AWS_DEFAULT_REGION` and `SSL_CERT_FILE` are set"
24+
"- `AWS_DEFAULT_REGION` set if not using passing a region in function calls"
2525
]
2626
},
2727
{
@@ -31,11 +31,14 @@
3131
"source": [
3232
"## What This Notebook Does\n",
3333
"\n",
34-
"1. Creates a Feature Group with Iceberg table format and initial Iceberg properties\n",
35-
"2. Retrieves the Feature Group with its Iceberg properties\n",
36-
"3. Updates Iceberg properties on an existing Feature Group\n",
37-
"4. Verifies the updated properties\n",
38-
"5. Cleans up resources"
34+
"1. Prepare Sample Data and Feature Definitions\n",
35+
"2. Creates a Feature Group with Iceberg table format and initial Iceberg properties\n",
36+
" - Shows IcebergProperties validation\n",
37+
"3. Retrieves the Feature Group with its Iceberg properties\n",
38+
"4. Updates Iceberg properties on an existing Feature Group\n",
39+
"5. Verifies the updated properties\n",
40+
"6. Cleans up resources\n",
41+
"7. Allowed Iceberg Properties Reference"
3942
]
4043
},
4144
{
@@ -62,19 +65,6 @@
6265
"from sagemaker.mlops.feature_store.feature_utils import load_feature_definitions_from_dataframe"
6366
]
6467
},
65-
{
66-
"cell_type": "code",
67-
"execution_count": null,
68-
"id": "4b2118e8",
69-
"metadata": {},
70-
"outputs": [],
71-
"source": [
72-
"#REMOVE\n",
73-
"import os\n",
74-
"os.environ[\"AWS_DEFAULT_REGION\"] = os.environ.get(\"AWS_DEFAULT_REGION\", \"us-west-2\")\n",
75-
"os.environ[\"SSL_CERT_FILE\"] = \"/etc/pki/ca-trust/extracted/pem/tls-ca-bundle.pem\""
76-
]
77-
},
7868
{
7969
"cell_type": "code",
8070
"execution_count": null,
@@ -135,7 +125,7 @@
135125
"\n",
136126
"Create a Feature Group with `table_format='Iceberg'` in the `OfflineStoreConfig` and pass initial Iceberg properties via `IcebergProperties`. The properties are applied to the underlying Glue Iceberg table after the Feature Group reaches `Created` status. \n",
137127
"\n",
138-
"Objects of type `IcebergProperties` will validate on creation but not mutation so you can optionally validate your iceberg properties before passing them to the `create`, or `update` methods. If you do not validate and your keys are not properly formatted or approved, then you may recieve an error in your `create` or `update` call."
128+
"Objects of type `IcebergProperties` will validate on creation but not mutation so you can optionally validate your iceberg properties before passing them to the `create`, or `update` methods. If you do not validate and your keys are not properly formatted or allowed, then you may recieve an error in your `create` or `update` call."
139129
]
140130
},
141131
{
@@ -159,7 +149,7 @@
159149
" \"write.metadata.previous-versions-max\": \"5\",\n",
160150
" \"history.expire.max-snapshot-age-ms\": \"86400000\",\n",
161151
"})\n",
162-
"# Add invalid property\n",
152+
"# Add non-allowed property\n",
163153
"invalid_iceberg_props.properties.update({\"write.delete.isolation-level\": \"Snapshot\"})\n",
164154
"try:\n",
165155
" invalid_iceberg_props.validate_property_keys()\n",
@@ -291,43 +281,16 @@
291281
},
292282
{
293283
"cell_type": "markdown",
294-
"id": "c5d6e7f8",
284+
"id": "de15cb88",
295285
"metadata": {},
296286
"source": [
297-
"## 6. Approved Iceberg Properties Reference\n",
298-
"\n",
299-
"The following Iceberg properties can be configured on Feature Store offline stores:\n",
300-
"\n",
301-
"| Category | Property | Description |\n",
302-
"|----------|----------|-------------|\n",
303-
"| **Write** | `write.target-file-size-bytes` | Target size for data files |\n",
304-
"| **Write** | `write.delete.target-file-size-bytes` | Target size for delete files |\n",
305-
"| **Write** | `write.parquet.row-group-size-bytes` | Parquet row group size |\n",
306-
"| **Write** | `write.delete.mode` | Delete operation mode |\n",
307-
"| **Write** | `write.update.mode` | Update operation mode |\n",
308-
"| **Write** | `write.delete.granularity` | Delete granularity |\n",
309-
"| **Metadata** | `write.metadata.delete-after-commit.enabled` | Auto-delete old metadata files |\n",
310-
"| **Metadata** | `write.metadata.previous-versions-max` | Max previous metadata versions to keep |\n",
311-
"| **History** | `history.expire.max-snapshot-age-ms` | Max snapshot age before expiration |\n",
312-
"| **History** | `history.expire.min-snapshots-to-keep` | Min snapshots to retain |\n",
313-
"| **History** | `history.expire.max-ref-age-ms` | Max reference age |\n",
314-
"| **Read** | `read.split.target-size` | Target split size for reads |\n",
315-
"| **Read** | `read.split.metadata-target-size` | Metadata target split size |\n",
316-
"| **Read** | `read.split.open-file-cost` | Cost of opening a file for split planning |"
317-
]
318-
},
319-
{
320-
"cell_type": "markdown",
321-
"id": "d6e7f8a9",
322-
"metadata": {},
323-
"source": [
324-
"## 7. Cleanup"
287+
"## 6. Cleanup"
325288
]
326289
},
327290
{
328291
"cell_type": "code",
329292
"execution_count": null,
330-
"id": "e7f8a9b0",
293+
"id": "992e73e6",
331294
"metadata": {},
332295
"outputs": [],
333296
"source": [
@@ -337,6 +300,35 @@
337300
"except Exception as e:\n",
338301
" print(f\"Cleanup error: {e}\")"
339302
]
303+
},
304+
{
305+
"cell_type": "markdown",
306+
"id": "c5d6e7f8",
307+
"metadata": {},
308+
"source": [
309+
"## 7. Allowed Iceberg Properties Reference\n",
310+
"\n",
311+
"The following Iceberg properties are allowed and can be configured on Feature Store offline stores:\n",
312+
"\n",
313+
"| Property | Default Value | Documentation | \n",
314+
" |----------|---------------|---------------| \n",
315+
" | write.metadata.delete-after-commit.enabled | FALSE | Controls whether to delete the oldest *tracked* version metadata files after each table commit. | \n",
316+
" | write.metadata.previous-versions-max | 100 | The max number of previous version metadata files to track | \n",
317+
" | history.expire.max-snapshot-age-ms | 432000000 (5 days) | Default max age of snapshots to keep on the table and all of its branches while expiring snapshots | \n",
318+
" | history.expire.min-snapshots-to-keep | 1 | Default min number of snapshots to keep on the table and all of its branches while expiring snapshots | \n",
319+
" | write.parquet.row-group-size-bytes | 134217728 (128 MB) | Parquet row group size | \n",
320+
" | read.split.target-size | 134217728 (128 MB) | Target size when combining data input splits | \n",
321+
" | read.split.metadata-target-size | 33554432 (32 MB) | Target size when combining metadata input splits | \n",
322+
" | write.delete.target-file-size-bytes | 67108864 (64 MB) | Controls the size of delete files generated to target about this many bytes | \n",
323+
" | write.delete.mode | copy-on-write | Mode used for delete commands: copy-on-write or merge-on-read (v2 and above) | \n",
324+
" | write.update.mode | copy-on-write | Mode used for update commands: copy-on-write or merge-on-read (v2 and above) | \n",
325+
" | write.delete.granularity | partition | Controls the granularity of generated delete files: partition or file | \n",
326+
" | history.expire.max-ref-age-ms | Long.MAX_VALUE (forever) | For snapshot references except the main branch, default max age of snapshot references to keep while expiring snapshots. The main branch never expires. | \n",
327+
" | read.split.open-file-cost | 4194304 (4 MB) | The estimated cost to open a file, used as a minimum weight when combining splits. | \n",
328+
" | write.target-file-size-bytes | 536870912 (512 MB) | Controls the size of files generated to target about this many bytes | \n",
329+
"\n",
330+
" Documentation from https://iceberg.apache.org/docs/latest/configuration/#write-properties"
331+
]
340332
}
341333
],
342334
"metadata": {

0 commit comments

Comments
 (0)