diff --git a/sdk/storage/azure-storage-blob-changefeed/CHANGELOG.md b/sdk/storage/azure-storage-blob-changefeed/CHANGELOG.md index a4c79c82550c..7f7946025f93 100644 --- a/sdk/storage/azure-storage-blob-changefeed/CHANGELOG.md +++ b/sdk/storage/azure-storage-blob-changefeed/CHANGELOG.md @@ -6,6 +6,11 @@ This version and all future versions will require Python 3.9+. Python 3.8 is no ### Features Added +### Bugs Fixed +- Fixed an `IndexError` that occurred when listing change feed events on accounts where the +`$blobchangefeed/idx/segments/` hierarchy contains directory marker blobs (e.g. +`idx/segments/2026/02/20`). Such non-segment paths are now skipped instead of being parsed +as segment files. ## 12.0.0b5 (2024-04-16) diff --git a/sdk/storage/azure-storage-blob-changefeed/azure/storage/blob/changefeed/_models.py b/sdk/storage/azure-storage-blob-changefeed/azure/storage/blob/changefeed/_models.py index fe3fc1b4378e..1baacbbd99ae 100644 --- a/sdk/storage/azure-storage-blob-changefeed/azure/storage/blob/changefeed/_models.py +++ b/sdk/storage/azure-storage-blob-changefeed/azure/storage/blob/changefeed/_models.py @@ -282,7 +282,10 @@ def _get_segment_paths(self, start_year=""): while not start_year or start_year <= cur_year: paths = self.client.list_blobs(name_starts_with=SEGMENT_COMMON_PATH + str(start_year)) for path in paths: - yield path.name + # Skip directory marker blobs that do not conform to the expected segment path shape. + # Azure Storage can return zero-length directory markers that are not real segment files. 
@staticmethod
def _is_valid_segment_path(segment_path):
    """Tell whether ``segment_path`` names a segment file rather than a directory marker.

    A segment blob is expected to have the shape
    ``idx/segments/YYYY/MM/DD/HHMM/<file>``, for example
    ``idx/segments/2026/02/20/0000/meta.json``. The following are rejected:

    * paths with fewer than seven delimiter-separated tokens
      (e.g. ``idx/segments/2026/02/20``),
    * paths whose file token is empty (e.g. a trailing delimiter),
    * paths whose year, month, day or HHMM token is not made of decimal digits.

    :param str segment_path: Blob name returned while listing segments.
    :returns: ``True`` if the path has the expected segment shape, ``False`` otherwise.
        The result is always a real ``bool`` (never the matched token), so
        identity checks such as ``is True`` / ``is False`` are safe.
    :rtype: bool
    """
    path_tokens = segment_path.split(PATH_DELIMITER)

    # Expected tokens: idx / segments / YYYY / MM / DD / HHMM / <file>
    if len(path_tokens) < 7:
        return False

    # NOTE(review): the date/time tokens appear to be turned into a datetime by
    # _parse_datetime_from_segment_path; confirm it relies on int() so that
    # isdecimal() is the matching filter. An empty token is rejected as well,
    # because "".isdecimal() is False.
    if not all(token.isdecimal() for token in path_tokens[2:6]):
        return False

    # A trailing delimiter leaves an empty file token; coerce to a strict bool
    # instead of leaking the token itself to the caller.
    return bool(path_tokens[6])
def _build_change_feed(blob_names):
    """Build a ChangeFeed backed by a mock client, skipping the network-bound _initialize().

    :param blob_names: Blob names the mocked ``list_blobs`` call should return.
        Consumed in a single pass, so a one-shot iterable works as well as a list.
    :returns: A ``ChangeFeed`` whose client is a ``Mock`` and whose ``_initialize``
        was patched out during construction.
    """
    fake_blobs = []
    for blob_name in blob_names:
        fake_blob = Mock()
        # Assign ``name`` after construction: passing ``name=`` to Mock() names
        # the mock itself instead of creating a ``.name`` attribute.
        fake_blob.name = blob_name
        fake_blobs.append(fake_blob)

    fake_client = Mock()
    fake_client.list_blobs.return_value = fake_blobs

    # _initialize is patched only for the duration of the constructor call.
    with patch.object(ChangeFeed, "_initialize"):
        change_feed = ChangeFeed(fake_client, page_size=100)
    return change_feed
def test_get_segment_paths_does_not_raise_on_directory_markers(self):
    """Listing a day-level directory marker next to a real segment must not raise."""
    directory_marker = "idx/segments/2026/02/20"
    real_segment = "idx/segments/2026/02/20/0000/meta.json"
    feed = _build_change_feed([directory_marker, real_segment])

    # filter(None, ...) drops every falsy item the generator yields, which
    # includes its None end-of-listing value.
    listed = list(filter(None, feed._get_segment_paths(start_year="")))

    assert listed == ["idx/segments/2026/02/20/0000/meta.json"]