diff --git a/.asf.yaml b/.asf.yaml index 1c632570cb..814d2694fe 100644 --- a/.asf.yaml +++ b/.asf.yaml @@ -42,7 +42,9 @@ github: required_approving_review_count: 1 required_linear_history: true - del_branch_on_merge: true + pull_requests: + # auto-delete head branches after being merged + del_branch_on_merge: true features: wiki: true issues: true @@ -57,4 +59,5 @@ notifications: commits: commits@iceberg.apache.org issues: issues@iceberg.apache.org pullrequests: issues@iceberg.apache.org + jobs: ci-jobs@iceberg.apache.org jira_options: link label link label diff --git a/.github/workflows/check-md-link.yml b/.github/workflows/check-md-link.yml index ed912e5083..6bb71e1d94 100644 --- a/.github/workflows/check-md-link.yml +++ b/.github/workflows/check-md-link.yml @@ -30,10 +30,11 @@ on: paths: - '.github/workflows/check-md-link.yml' - 'mkdocs/**' + workflow_dispatch: jobs: markdown-link-check: runs-on: ubuntu-latest steps: - uses: actions/checkout@master - - uses: tcort/github-action-markdown-link-check@v1 + - uses: tcort/github-action-markdown-link-check@e7c7a18363c842693fadde5d41a3bd3573a7a225 diff --git a/.github/workflows/license_check.yml b/.github/workflows/license_check.yml index c7e20417f2..41a4cb8020 100644 --- a/.github/workflows/license_check.yml +++ b/.github/workflows/license_check.yml @@ -24,5 +24,5 @@ jobs: rat: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v5 + - uses: actions/checkout@v6 - run: dev/check-license diff --git a/.github/workflows/nightly-pypi-build.yml b/.github/workflows/nightly-pypi-build.yml index 1b0e122986..2195e84854 100644 --- a/.github/workflows/nightly-pypi-build.yml +++ b/.github/workflows/nightly-pypi-build.yml @@ -31,7 +31,7 @@ jobs: outputs: VERSION: ${{ steps.set-version.outputs.VERSION }} steps: - - uses: actions/checkout@v5 + - uses: actions/checkout@v6 with: fetch-depth: 1 @@ -78,8 +78,35 @@ jobs: - name: List downloaded artifacts run: ls -R dist/ - name: Publish to TestPyPI + id: publish-testpypi + 
continue-on-error: true uses: pypa/gh-action-pypi-publish@release/v1 with: repository-url: https://test.pypi.org/legacy/ skip-existing: true verbose: true + - name: Display error message on publish failure + if: steps.publish-testpypi.outcome == 'failure' + run: | + echo "::error::Failed to publish to TestPyPI" + echo "" + echo "⚠️ TestPyPI Publish Failed" + echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" + echo "" + echo "This may be due to TestPyPI storage limits." + echo "See: https://docs.pypi.org/project-management/storage-limits" + echo "" + echo "To resolve this issue, use the pypi-cleanup utility to clean up old TestPyPI artifacts:" + echo "https://pypi.org/project/pypi-cleanup/" + echo "" + echo " uvx pypi-cleanup --package pyiceberg --host https://test.pypi.org/ \\" + echo " --verbose -d 10 --do-it --username " + echo "" + echo "Requirements:" + echo " • Must be a maintainer for pyiceberg on TestPyPI" + echo " (https://test.pypi.org/project/pyiceberg)" + echo " • Requires TestPyPI password and 2FA" + echo " • ⚠️ ONLY do this for TestPyPI, NOT for production PyPI!" 
+ echo "" + echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" + exit 1 diff --git a/.github/workflows/pypi-build-artifacts.yml b/.github/workflows/pypi-build-artifacts.yml index 705ec711a5..a3f6006b44 100644 --- a/.github/workflows/pypi-build-artifacts.yml +++ b/.github/workflows/pypi-build-artifacts.yml @@ -35,7 +35,7 @@ jobs: os: [ ubuntu-latest, windows-latest, macos-latest ] steps: - - uses: actions/checkout@v5 + - uses: actions/checkout@v6 with: fetch-depth: 1 @@ -61,7 +61,7 @@ jobs: if: startsWith(matrix.os, 'ubuntu') - name: Build wheels - uses: pypa/cibuildwheel@v3.2.1 + uses: pypa/cibuildwheel@v3.3.0 with: output-dir: wheelhouse config-file: "pyproject.toml" diff --git a/.github/workflows/python-ci-docs.yml b/.github/workflows/python-ci-docs.yml index cb37233137..6b3fc7f4b2 100644 --- a/.github/workflows/python-ci-docs.yml +++ b/.github/workflows/python-ci-docs.yml @@ -35,7 +35,7 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v5 + - uses: actions/checkout@v6 - uses: actions/setup-python@v6 with: python-version: 3.12 diff --git a/.github/workflows/python-ci.yml b/.github/workflows/python-ci.yml index 93a05094c5..b7c44f922d 100644 --- a/.github/workflows/python-ci.yml +++ b/.github/workflows/python-ci.yml @@ -50,7 +50,7 @@ jobs: python: ['3.10', '3.11', '3.12'] steps: - - uses: actions/checkout@v5 + - uses: actions/checkout@v6 - uses: actions/setup-python@v6 with: python-version: ${{ matrix.python }} @@ -74,7 +74,7 @@ jobs: python: ['3.10', '3.11', '3.12'] steps: - - uses: actions/checkout@v5 + - uses: actions/checkout@v6 - uses: actions/setup-python@v6 with: python-version: ${{ matrix.python }} diff --git a/.github/workflows/python-release-docs.yml b/.github/workflows/python-release-docs.yml index d6c9ba55ce..1706ab96fe 100644 --- a/.github/workflows/python-release-docs.yml +++ b/.github/workflows/python-release-docs.yml @@ -30,7 +30,7 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v5 + - 
uses: actions/checkout@v6 - uses: actions/setup-python@v6 with: python-version: ${{ matrix.python }} diff --git a/.github/workflows/python-release.yml b/.github/workflows/python-release.yml index c90d818f16..d02de1fdd2 100644 --- a/.github/workflows/python-release.yml +++ b/.github/workflows/python-release.yml @@ -89,7 +89,7 @@ jobs: needs: - validate-inputs steps: - - uses: actions/checkout@v5 + - uses: actions/checkout@v6 with: fetch-depth: 1 diff --git a/.github/workflows/svn-build-artifacts.yml b/.github/workflows/svn-build-artifacts.yml index 18fd6419ae..62d379905a 100644 --- a/.github/workflows/svn-build-artifacts.yml +++ b/.github/workflows/svn-build-artifacts.yml @@ -35,7 +35,7 @@ jobs: os: [ ubuntu-latest, windows-latest, macos-latest ] steps: - - uses: actions/checkout@v5 + - uses: actions/checkout@v6 with: fetch-depth: 1 @@ -56,7 +56,7 @@ jobs: if: startsWith(matrix.os, 'ubuntu') - name: Build wheels - uses: pypa/cibuildwheel@v3.2.1 + uses: pypa/cibuildwheel@v3.3.0 with: output-dir: wheelhouse config-file: "pyproject.toml" diff --git a/Makefile b/Makefile index ba8d119045..3e92891fb1 100644 --- a/Makefile +++ b/Makefile @@ -18,6 +18,8 @@ # Configuration Variables # ======================== + + PYTHON ?= # Override with e.g. PYTHON=3.11 to use specific Python version PYTEST_ARGS ?= -v -x # Override with e.g. 
PYTEST_ARGS="-vv --tb=short" COVERAGE ?= 0 # Set COVERAGE=1 to enable coverage: make test COVERAGE=1 @@ -32,9 +34,9 @@ else endif ifeq ($(COVERAGE),1) - TEST_RUNNER = uv run python -m coverage run --parallel-mode --source=pyiceberg -m + TEST_RUNNER = uv run $(PYTHON_ARG) python -m coverage run --parallel-mode --source=pyiceberg -m else - TEST_RUNNER = uv run python -m + TEST_RUNNER = uv run $(PYTHON_ARG) python -m endif ifeq ($(KEEP_COMPOSE),1) @@ -70,7 +72,7 @@ setup-venv: ## Create virtual environment uv venv $(PYTHON_ARG) install-dependencies: setup-venv ## Install all dependencies including extras - uv sync --all-extras + uv sync $(PYTHON_ARG) --all-extras install: install-uv install-dependencies ## Install uv and dependencies @@ -84,7 +86,7 @@ check-license: ## Check license headers ./dev/check-license lint: ## Run code linters via prek (pre-commit hooks) - uv run prek run -a + uv run $(PYTHON_ARG) prek run -a # =============== # Testing Section @@ -101,7 +103,7 @@ test-integration-setup: ## Start Docker services for integration tests docker compose -f dev/docker-compose-integration.yml kill docker compose -f dev/docker-compose-integration.yml rm -f docker compose -f dev/docker-compose-integration.yml up -d --wait - uv run python dev/provision.py + uv run $(PYTHON_ARG) python dev/provision.py test-integration-exec: ## Run integration tests (excluding provision) $(TEST_RUNNER) pytest tests/ -m integration $(PYTEST_ARGS) @@ -133,10 +135,10 @@ test-coverage: COVERAGE=1 test-coverage: test test-integration test-s3 test-adls test-gcs coverage-report ## Run all tests with coverage and report coverage-report: ## Combine and report coverage - uv run coverage combine - uv run coverage report -m --fail-under=$(COVERAGE_FAIL_UNDER) - uv run coverage html - uv run coverage xml + uv run $(PYTHON_ARG) coverage combine + uv run $(PYTHON_ARG) coverage report -m --fail-under=$(COVERAGE_FAIL_UNDER) + uv run $(PYTHON_ARG) coverage html + uv run $(PYTHON_ARG) coverage xml # 
================ # Documentation @@ -145,13 +147,13 @@ coverage-report: ## Combine and report coverage ##@ Documentation docs-install: ## Install docs dependencies (included in default groups) - uv sync --group docs + uv sync $(PYTHON_ARG) --group docs docs-serve: ## Serve local docs preview (hot reload) - uv run mkdocs serve -f mkdocs/mkdocs.yml + uv run $(PYTHON_ARG) mkdocs serve -f mkdocs/mkdocs.yml docs-build: ## Build the static documentation site - uv run mkdocs build -f mkdocs/mkdocs.yml --strict + uv run $(PYTHON_ARG) mkdocs build -f mkdocs/mkdocs.yml --strict # =================== # Project Maintenance diff --git a/dev/.rat-excludes b/dev/.rat-excludes index 27c1dc31fc..16e996dba8 100644 --- a/dev/.rat-excludes +++ b/dev/.rat-excludes @@ -5,5 +5,3 @@ build .gitignore uv.lock mkdocs/* -setup.cfg -(^|.*/)[^/]*\.egg-info(/.*)?$ diff --git a/mkdocs/docs/configuration.md b/mkdocs/docs/configuration.md index 3eb6869d8e..5219010279 100644 --- a/mkdocs/docs/configuration.md +++ b/mkdocs/docs/configuration.md @@ -551,6 +551,28 @@ catalog: type: noop ``` +##### GCP BigLake Metastore Catalog REST + +```yaml +catalog: + biglake_catalog: + type: rest + uri: https://biglake.googleapis.com/iceberg/v1/restcatalog + warehouse: gs:// # Use bq://projects/ for federation option (see docs) + auth: + type: google + header.x-goog-user-project: + header.X-Iceberg-Access-Delegation: "" # For user-credentials authentication, set to empty string. +``` + + + +!!! Note "Metastore Authentication Models" + If your BigLake Metastore catalog is configured for "user credentials" authentication instead of "vendor credentials", set the `header.X-Iceberg-Access-Delegation` header to an empty string as shown above. Standard GCP Application Default Credentials (ADC) will be used to authenticate requests to the BigLake Metastore REST API. 
+ You can retrieve the configuration details for your BigLake Iceberg catalog at the [GCP Console BigLake Metastore page](https://console.cloud.google.com/biglake/metastore/catalogs). Select your catalog, then find the necessary parameters such as `uri`, `warehouse`, and authentication method (e.g. user-creds or vendor). + + + ### SQL Catalog The SQL catalog requires a database for its backend. PyIceberg supports PostgreSQL and SQLite through psycopg2. The database connection has to be configured using the `uri` property. The init_catalog_tables is optional and defaults to True. If it is set to False, the catalog tables will not be created when the SQLCatalog is initialized. See SQLAlchemy's [documentation for URL format](https://docs.sqlalchemy.org/en/20/core/engines.html#backend-specific-urls): diff --git a/pyiceberg/avro/codecs/__init__.py b/pyiceberg/avro/codecs/__init__.py index d5d3a7c4e5..96a299cc45 100644 --- a/pyiceberg/avro/codecs/__init__.py +++ b/pyiceberg/avro/codecs/__init__.py @@ -26,9 +26,7 @@ from __future__ import annotations -from typing import Dict, Literal, Type - -from typing_extensions import TypeAlias +from typing import Literal, TypeAlias from pyiceberg.avro.codecs.bzip2 import BZip2Codec from pyiceberg.avro.codecs.codec import Codec @@ -40,7 +38,7 @@ AVRO_CODEC_KEY = "avro.codec" -KNOWN_CODECS: Dict[AvroCompressionCodec, Type[Codec] | None] = { +KNOWN_CODECS: dict[AvroCompressionCodec, type[Codec] | None] = { "null": None, "bzip2": BZip2Codec, "snappy": SnappyCodec, @@ -49,4 +47,4 @@ } # Map to convert the naming from Iceberg to Avro -CODEC_MAPPING_ICEBERG_TO_AVRO: Dict[str, str] = {"gzip": "deflate", "zstd": "zstandard"} +CODEC_MAPPING_ICEBERG_TO_AVRO: dict[str, str] = {"gzip": "deflate", "zstd": "zstandard"} diff --git a/pyiceberg/avro/decoder.py b/pyiceberg/avro/decoder.py index 75b3209027..e971d52d48 100644 --- a/pyiceberg/avro/decoder.py +++ b/pyiceberg/avro/decoder.py @@ -18,9 +18,6 @@ from abc import ABC, abstractmethod from io 
import SEEK_CUR from typing import ( - Dict, - List, - Tuple, cast, ) @@ -67,11 +64,11 @@ def read_int(self) -> int: datum = (n >> 1) ^ -(n & 1) return datum - def read_ints(self, n: int) -> Tuple[int, ...]: + def read_ints(self, n: int) -> tuple[int, ...]: """Read a list of integers.""" return tuple(self.read_int() for _ in range(n)) - def read_int_bytes_dict(self, n: int, dest: Dict[int, bytes]) -> None: + def read_int_bytes_dict(self, n: int, dest: dict[int, bytes]) -> None: """Read a dictionary of integers for keys and bytes for values into a destination dictionary.""" for _ in range(n): k = self.read_int() @@ -85,7 +82,7 @@ def read_float(self) -> float: The float is converted into a 32-bit integer using a method equivalent to Java's floatToIntBits and then encoded in little-endian format. """ - return float(cast(Tuple[float, ...], STRUCT_FLOAT.unpack(self.read(4)))[0]) + return float(cast(tuple[float, ...], STRUCT_FLOAT.unpack(self.read(4)))[0]) def read_double(self) -> float: """Read a value from the stream as a double. @@ -94,7 +91,7 @@ def read_double(self) -> float: The double is converted into a 64-bit integer using a method equivalent to Java's doubleToLongBits and then encoded in little-endian format. 
""" - return float(cast(Tuple[float, ...], STRUCT_DOUBLE.unpack(self.read(8)))[0]) + return float(cast(tuple[float, ...], STRUCT_DOUBLE.unpack(self.read(8)))[0]) def read_bytes(self) -> bytes: """Bytes are encoded as a long followed by that many bytes of data.""" @@ -152,7 +149,7 @@ def read(self, n: int) -> bytes: """Read n bytes.""" if n < 0: raise ValueError(f"Requested {n} bytes to read, expected positive integer.") - data: List[bytes] = [] + data: list[bytes] = [] n_remaining = n while n_remaining > 0: @@ -181,6 +178,6 @@ def new_decoder(b: bytes) -> BinaryDecoder: except ModuleNotFoundError: import warnings - warnings.warn("Falling back to pure Python Avro decoder, missing Cython implementation") + warnings.warn("Falling back to pure Python Avro decoder, missing Cython implementation", stacklevel=2) return StreamingBinaryDecoder(b) diff --git a/pyiceberg/avro/file.py b/pyiceberg/avro/file.py index 3b91d70d85..8877e8bf80 100644 --- a/pyiceberg/avro/file.py +++ b/pyiceberg/avro/file.py @@ -22,15 +22,12 @@ import io import json import os +from collections.abc import Callable from dataclasses import dataclass from enum import Enum from types import TracebackType from typing import ( - Callable, - Dict, Generic, - List, - Type, TypeVar, ) @@ -77,14 +74,14 @@ def magic(self) -> bytes: return self._data[0] @property - def meta(self) -> Dict[str, str]: + def meta(self) -> dict[str, str]: return self._data[1] @property def sync(self) -> bytes: return self._data[2] - def compression_codec(self) -> Type[Codec] | None: + def compression_codec(self) -> type[Codec] | None: """Get the file's compression codec algorithm from the file's metadata. 
In the case of a null codec, we return a None indicating that we @@ -146,8 +143,8 @@ class AvroFile(Generic[D]): ) input_file: InputFile read_schema: Schema | None - read_types: Dict[int, Callable[..., StructProtocol]] - read_enums: Dict[int, Callable[..., Enum]] + read_types: dict[int, Callable[..., StructProtocol]] + read_enums: dict[int, Callable[..., Enum]] header: AvroFileHeader schema: Schema reader: Reader @@ -159,8 +156,8 @@ def __init__( self, input_file: InputFile, read_schema: Schema | None = None, - read_types: Dict[int, Callable[..., StructProtocol]] = EMPTY_DICT, - read_enums: Dict[int, Callable[..., Enum]] = EMPTY_DICT, + read_types: dict[int, Callable[..., StructProtocol]] = EMPTY_DICT, + read_enums: dict[int, Callable[..., Enum]] = EMPTY_DICT, ) -> None: self.input_file = input_file self.read_schema = read_schema @@ -185,7 +182,7 @@ def __enter__(self) -> AvroFile[D]: return self - def __exit__(self, exctype: Type[BaseException] | None, excinst: BaseException | None, exctb: TracebackType | None) -> None: + def __exit__(self, exctype: type[BaseException] | None, excinst: BaseException | None, exctb: TracebackType | None) -> None: """Perform cleanup when exiting the scope of a 'with' statement.""" def __iter__(self) -> AvroFile[D]: @@ -240,7 +237,7 @@ def __init__( file_schema: Schema, schema_name: str, record_schema: Schema | None = None, - metadata: Dict[str, str] = EMPTY_DICT, + metadata: dict[str, str] = EMPTY_DICT, ) -> None: self.output_file = output_file self.file_schema = file_schema @@ -267,7 +264,7 @@ def __enter__(self) -> AvroOutputFile[D]: return self - def __exit__(self, exctype: Type[BaseException] | None, excinst: BaseException | None, exctb: TracebackType | None) -> None: + def __exit__(self, exctype: type[BaseException] | None, excinst: BaseException | None, exctb: TracebackType | None) -> None: """Perform cleanup when exiting the scope of a 'with' statement.""" self.output_stream.close() @@ -284,7 +281,7 @@ def _write_header(self) 
-> None: header = AvroFileHeader(MAGIC, meta, self.sync_bytes) construct_writer(META_SCHEMA).write(self.encoder, header) - def compression_codec(self) -> Type[Codec] | None: + def compression_codec(self) -> type[Codec] | None: """Get the file's compression codec algorithm from the file's metadata. In the case of a null codec, we return a None indicating that we @@ -302,7 +299,7 @@ def compression_codec(self) -> Type[Codec] | None: return KNOWN_CODECS[codec_name] # type: ignore - def write_block(self, objects: List[D]) -> None: + def write_block(self, objects: list[D]) -> None: in_memory = io.BytesIO() block_content_encoder = BinaryEncoder(output_stream=in_memory) for obj in objects: diff --git a/pyiceberg/avro/reader.py b/pyiceberg/avro/reader.py index 97c41be473..cf8e5154e7 100644 --- a/pyiceberg/avro/reader.py +++ b/pyiceberg/avro/reader.py @@ -27,15 +27,12 @@ from __future__ import annotations from abc import abstractmethod +from collections.abc import Callable, Mapping from dataclasses import dataclass from dataclasses import field as dataclassfield from decimal import Decimal from typing import ( Any, - Callable, - List, - Mapping, - Tuple, ) from uuid import UUID @@ -319,14 +316,14 @@ class StructReader(Reader): "_hash", "_max_pos", ) - field_readers: Tuple[Tuple[int | None, Reader], ...] + field_readers: tuple[tuple[int | None, Reader], ...] create_struct: Callable[..., StructProtocol] struct: StructType - field_reader_functions = Tuple[Tuple[str | None, int, Callable[[BinaryDecoder], Any] | None], ...] + field_reader_functions = tuple[tuple[str | None, int, Callable[[BinaryDecoder], Any] | None], ...] 
def __init__( self, - field_readers: Tuple[Tuple[int | None, Reader], ...], + field_readers: tuple[tuple[int | None, Reader], ...], create_struct: Callable[..., StructProtocol], struct: StructType, ) -> None: @@ -338,7 +335,7 @@ def __init__( if not isinstance(self.create_struct(), StructProtocol): raise ValueError(f"Incompatible with StructProtocol: {self.create_struct}") - reading_callbacks: List[Tuple[int | None, Callable[[BinaryDecoder], Any]]] = [] + reading_callbacks: list[tuple[int | None, Callable[[BinaryDecoder], Any]]] = [] max_pos = -1 for pos, field in field_readers: if pos is not None: @@ -394,8 +391,8 @@ def __init__(self, element: Reader) -> None: self._hash = hash(self.element) self._is_int_list = isinstance(self.element, IntegerReader) - def read(self, decoder: BinaryDecoder) -> List[Any]: - read_items: List[Any] = [] + def read(self, decoder: BinaryDecoder) -> list[Any]: + read_items: list[Any] = [] block_count = decoder.read_int() while block_count != 0: if block_count < 0: @@ -461,7 +458,7 @@ def _read_int_int(self, decoder: BinaryDecoder) -> Mapping[int, int]: if block_count == 0: return EMPTY_DICT - contents_array: List[Tuple[int, ...]] = [] + contents_array: list[tuple[int, ...]] = [] while block_count != 0: if block_count < 0: diff --git a/pyiceberg/avro/resolver.py b/pyiceberg/avro/resolver.py index 84805640eb..65f06ca8b1 100644 --- a/pyiceberg/avro/resolver.py +++ b/pyiceberg/avro/resolver.py @@ -15,13 +15,8 @@ # specific language governing permissions and limitations # under the License. 
# pylint: disable=arguments-renamed,unused-argument +from collections.abc import Callable from enum import Enum -from typing import ( - Callable, - Dict, - List, - Tuple, -) from pyiceberg.avro.decoder import BinaryDecoder from pyiceberg.avro.reader import ( @@ -114,7 +109,7 @@ def construct_reader( - file_schema: Schema | IcebergType, read_types: Dict[int, Callable[..., StructProtocol]] = EMPTY_DICT + file_schema: Schema | IcebergType, read_types: dict[int, Callable[..., StructProtocol]] = EMPTY_DICT ) -> Reader: """Construct a reader from a file schema. @@ -146,7 +141,7 @@ class ConstructWriter(SchemaVisitorPerPrimitiveType[Writer]): def schema(self, schema: Schema, struct_result: Writer) -> Writer: return struct_result - def struct(self, struct: StructType, field_results: List[Writer]) -> Writer: + def struct(self, struct: StructType, field_results: list[Writer]) -> Writer: return StructWriter(tuple((pos, result) for pos, result in enumerate(field_results))) def field(self, field: NestedField, field_result: Writer) -> Writer: @@ -234,8 +229,8 @@ def resolve_writer( def resolve_reader( file_schema: Schema | IcebergType, read_schema: Schema | IcebergType, - read_types: Dict[int, Callable[..., StructProtocol]] = EMPTY_DICT, - read_enums: Dict[int, Callable[..., Enum]] = EMPTY_DICT, + read_types: dict[int, Callable[..., StructProtocol]] = EMPTY_DICT, + read_enums: dict[int, Callable[..., Enum]] = EMPTY_DICT, ) -> Reader: """Resolve the file and read schema to produce a reader. 
@@ -274,12 +269,12 @@ class WriteSchemaResolver(PrimitiveWithPartnerVisitor[IcebergType, Writer]): def schema(self, file_schema: Schema, record_schema: IcebergType | None, result: Writer) -> Writer: return result - def struct(self, file_schema: StructType, record_struct: IcebergType | None, file_writers: List[Writer]) -> Writer: + def struct(self, file_schema: StructType, record_struct: IcebergType | None, file_writers: list[Writer]) -> Writer: if not isinstance(record_struct, StructType): raise ResolveError(f"File/write schema are not aligned for struct, got {record_struct}") - record_struct_positions: Dict[int, int] = {field.field_id: pos for pos, field in enumerate(record_struct.fields)} - results: List[Tuple[int | None, Writer]] = [] + record_struct_positions: dict[int, int] = {field.field_id: pos for pos, field in enumerate(record_struct.fields)} + results: list[tuple[int | None, Writer]] = [] for writer, file_field in zip(file_writers, file_schema.fields, strict=True): if file_field.field_id in record_struct_positions: @@ -367,14 +362,14 @@ def visit_unknown(self, unknown_type: UnknownType, partner: IcebergType | None) class ReadSchemaResolver(PrimitiveWithPartnerVisitor[IcebergType, Reader]): __slots__ = ("read_types", "read_enums", "context") - read_types: Dict[int, Callable[..., StructProtocol]] - read_enums: Dict[int, Callable[..., Enum]] - context: List[int] + read_types: dict[int, Callable[..., StructProtocol]] + read_enums: dict[int, Callable[..., Enum]] + context: list[int] def __init__( self, - read_types: Dict[int, Callable[..., StructProtocol]] = EMPTY_DICT, - read_enums: Dict[int, Callable[..., Enum]] = EMPTY_DICT, + read_types: dict[int, Callable[..., StructProtocol]] = EMPTY_DICT, + read_enums: dict[int, Callable[..., Enum]] = EMPTY_DICT, ) -> None: self.read_types = read_types self.read_enums = read_enums @@ -389,7 +384,7 @@ def before_field(self, field: NestedField, field_partner: NestedField | None) -> def after_field(self, field: 
NestedField, field_partner: NestedField | None) -> None: self.context.pop() - def struct(self, struct: StructType, expected_struct: IcebergType | None, field_readers: List[Reader]) -> Reader: + def struct(self, struct: StructType, expected_struct: IcebergType | None, field_readers: list[Reader]) -> Reader: read_struct_id = self.context[STRUCT_ROOT] if len(self.context) > 0 else STRUCT_ROOT struct_callable = self.read_types.get(read_struct_id, Record) @@ -399,10 +394,10 @@ def struct(self, struct: StructType, expected_struct: IcebergType | None, field_ if not isinstance(expected_struct, StructType): raise ResolveError(f"File/read schema are not aligned for struct, got {expected_struct}") - expected_positions: Dict[int, int] = {field.field_id: pos for pos, field in enumerate(expected_struct.fields)} + expected_positions: dict[int, int] = {field.field_id: pos for pos, field in enumerate(expected_struct.fields)} # first, add readers for the file fields that must be in order - results: List[Tuple[int | None, Reader]] = [ + results: list[tuple[int | None, Reader]] = [ ( expected_positions.get(field.field_id), # Check if we need to convert it to an Enum diff --git a/pyiceberg/avro/writer.py b/pyiceberg/avro/writer.py index ba66d3003c..f78d1a486e 100644 --- a/pyiceberg/avro/writer.py +++ b/pyiceberg/avro/writer.py @@ -28,9 +28,6 @@ from dataclasses import field as dataclassfield from typing import ( Any, - Dict, - List, - Tuple, ) from uuid import UUID @@ -186,7 +183,7 @@ def write(self, encoder: BinaryEncoder, val: Any) -> None: @dataclass(frozen=True) class StructWriter(Writer): - field_writers: Tuple[Tuple[int | None, Writer], ...] = dataclassfield() + field_writers: tuple[tuple[int | None, Writer], ...] 
= dataclassfield() def write(self, encoder: BinaryEncoder, val: Record) -> None: for pos, writer in self.field_writers: @@ -210,7 +207,7 @@ def __hash__(self) -> int: class ListWriter(Writer): element_writer: Writer - def write(self, encoder: BinaryEncoder, val: List[Any]) -> None: + def write(self, encoder: BinaryEncoder, val: list[Any]) -> None: encoder.write_int(len(val)) for v in val: self.element_writer.write(encoder, v) @@ -223,7 +220,7 @@ class MapWriter(Writer): key_writer: Writer value_writer: Writer - def write(self, encoder: BinaryEncoder, val: Dict[Any, Any]) -> None: + def write(self, encoder: BinaryEncoder, val: dict[Any, Any]) -> None: encoder.write_int(len(val)) for k, v in val.items(): self.key_writer.write(encoder, k) diff --git a/pyiceberg/catalog/__init__.py b/pyiceberg/catalog/__init__.py index 5b39062948..a4f1d47bea 100644 --- a/pyiceberg/catalog/__init__.py +++ b/pyiceberg/catalog/__init__.py @@ -22,17 +22,12 @@ import re import uuid from abc import ABC, abstractmethod +from collections.abc import Callable from dataclasses import dataclass from enum import Enum from typing import ( TYPE_CHECKING, Any, - Callable, - Dict, - List, - Set, - Tuple, - Type, cast, ) @@ -268,16 +263,16 @@ def load_catalog(name: str | None = None, **properties: str | None) -> Catalog: catalog_type = infer_catalog_type(name, conf) if catalog_type: - return AVAILABLE_CATALOGS[catalog_type](name, cast(Dict[str, str], conf)) + return AVAILABLE_CATALOGS[catalog_type](name, cast(dict[str, str], conf)) raise ValueError(f"Could not initialize catalog with the following properties: {properties}") -def list_catalogs() -> List[str]: +def list_catalogs() -> list[str]: return _ENV_CONFIG.get_known_catalogs() -def delete_files(io: FileIO, files_to_delete: Set[str], file_type: str) -> None: +def delete_files(io: FileIO, files_to_delete: set[str], file_type: str) -> None: """Delete files. Log warnings if failing to delete any file. 
@@ -294,7 +289,7 @@ def delete_files(io: FileIO, files_to_delete: Set[str], file_type: str) -> None: logger.warning(msg=f"Failed to delete {file_type} file {file}", exc_info=exc) -def delete_data_files(io: FileIO, manifests_to_delete: List[ManifestFile]) -> None: +def delete_data_files(io: FileIO, manifests_to_delete: list[ManifestFile]) -> None: """Delete data files linked to given manifests. Log warnings if failing to delete any file. @@ -331,9 +326,9 @@ def _import_catalog(name: str, catalog_impl: str, properties: Properties) -> Cat @dataclass class PropertiesUpdateSummary: - removed: List[str] - updated: List[str] - missing: List[str] + removed: list[str] + updated: list[str] + missing: list[str] class Catalog(ABC): @@ -361,7 +356,7 @@ def __init__(self, name: str, **properties: str): def create_table( self, identifier: str | Identifier, - schema: Schema | "pa.Schema", + schema: Schema | pa.Schema, location: str | None = None, partition_spec: PartitionSpec = UNPARTITIONED_PARTITION_SPEC, sort_order: SortOrder = UNSORTED_SORT_ORDER, @@ -388,7 +383,7 @@ def create_table( def create_table_transaction( self, identifier: str | Identifier, - schema: Schema | "pa.Schema", + schema: Schema | pa.Schema, location: str | None = None, partition_spec: PartitionSpec = UNPARTITIONED_PARTITION_SPEC, sort_order: SortOrder = UNSORTED_SORT_ORDER, @@ -411,7 +406,7 @@ def create_table_transaction( def create_table_if_not_exists( self, identifier: str | Identifier, - schema: Schema | "pa.Schema", + schema: Schema | pa.Schema, location: str | None = None, partition_spec: PartitionSpec = UNPARTITIONED_PARTITION_SPEC, sort_order: SortOrder = UNSORTED_SORT_ORDER, @@ -531,7 +526,7 @@ def rename_table(self, from_identifier: str | Identifier, to_identifier: str | I @abstractmethod def commit_table( - self, table: Table, requirements: Tuple[TableRequirement, ...], updates: Tuple[TableUpdate, ...] 
+ self, table: Table, requirements: tuple[TableRequirement, ...], updates: tuple[TableUpdate, ...] ) -> CommitTableResponse: """Commit updates to a table. @@ -586,7 +581,7 @@ def drop_namespace(self, namespace: str | Identifier) -> None: """ @abstractmethod - def list_tables(self, namespace: str | Identifier) -> List[Identifier]: + def list_tables(self, namespace: str | Identifier) -> list[Identifier]: """List tables under the given namespace in the catalog. Args: @@ -600,7 +595,7 @@ def list_tables(self, namespace: str | Identifier) -> List[Identifier]: """ @abstractmethod - def list_namespaces(self, namespace: str | Identifier = ()) -> List[Identifier]: + def list_namespaces(self, namespace: str | Identifier = ()) -> list[Identifier]: """List namespaces from the given namespace. If not given, list top-level namespaces from the catalog. Args: @@ -614,7 +609,7 @@ def list_namespaces(self, namespace: str | Identifier = ()) -> List[Identifier]: """ @abstractmethod - def list_views(self, namespace: str | Identifier) -> List[Identifier]: + def list_views(self, namespace: str | Identifier) -> list[Identifier]: """List views under the given namespace in the catalog. Args: @@ -643,7 +638,7 @@ def load_namespace_properties(self, namespace: str | Identifier) -> Properties: @abstractmethod def update_namespace_properties( - self, namespace: str | Identifier, removals: Set[str] | None = None, updates: Properties = EMPTY_DICT + self, namespace: str | Identifier, removals: set[str] | None = None, updates: Properties = EMPTY_DICT ) -> PropertiesUpdateSummary: """Remove provided property keys and updates properties for a namespace. 
@@ -707,7 +702,7 @@ def namespace_from(identifier: str | Identifier) -> Identifier: return Catalog.identifier_to_tuple(identifier)[:-1] @staticmethod - def namespace_to_string(identifier: str | Identifier, err: Type[ValueError] | Type[NoSuchNamespaceError] = ValueError) -> str: + def namespace_to_string(identifier: str | Identifier, err: type[ValueError] | type[NoSuchNamespaceError] = ValueError) -> str: """Transform a namespace identifier into a string. Args: @@ -729,7 +724,7 @@ def namespace_to_string(identifier: str | Identifier, err: Type[ValueError] | Ty @staticmethod def identifier_to_database( - identifier: str | Identifier, err: Type[ValueError] | Type[NoSuchNamespaceError] = ValueError + identifier: str | Identifier, err: type[ValueError] | type[NoSuchNamespaceError] = ValueError ) -> str: tuple_identifier = Catalog.identifier_to_tuple(identifier) if len(tuple_identifier) != 1: @@ -740,8 +735,8 @@ def identifier_to_database( @staticmethod def identifier_to_database_and_table( identifier: str | Identifier, - err: Type[ValueError] | Type[NoSuchTableError] | Type[NoSuchNamespaceError] = ValueError, - ) -> Tuple[str, str]: + err: type[ValueError] | type[NoSuchTableError] | type[NoSuchNamespaceError] = ValueError, + ) -> tuple[str, str]: tuple_identifier = Catalog.identifier_to_tuple(identifier) if len(tuple_identifier) != 2: raise err(f"Invalid path, hierarchical namespaces are not supported: {identifier}") @@ -753,7 +748,7 @@ def _load_file_io(self, properties: Properties = EMPTY_DICT, location: str | Non @staticmethod def _convert_schema_if_needed( - schema: Schema | "pa.Schema", format_version: TableVersion = TableProperties.DEFAULT_FORMAT_VERSION + schema: Schema | pa.Schema, format_version: TableVersion = TableProperties.DEFAULT_FORMAT_VERSION ) -> Schema: if isinstance(schema, Schema): return schema @@ -799,7 +794,7 @@ def close(self) -> None: # noqa: B027 Default implementation does nothing. Override in subclasses that need cleanup. 
""" - def __enter__(self) -> "Catalog": + def __enter__(self) -> Catalog: """Enter the context manager. Returns: @@ -829,7 +824,7 @@ def __init__(self, name: str, **properties: str): def create_table_transaction( self, identifier: str | Identifier, - schema: Schema | "pa.Schema", + schema: Schema | pa.Schema, location: str | None = None, partition_spec: PartitionSpec = UNPARTITIONED_PARTITION_SPEC, sort_order: SortOrder = UNSORTED_SORT_ORDER, @@ -852,7 +847,7 @@ def purge_table(self, identifier: str | Identifier) -> None: io = load_file_io(self.properties, table.metadata_location) metadata = table.metadata manifest_lists_to_delete = set() - manifests_to_delete: List[ManifestFile] = [] + manifests_to_delete: list[ManifestFile] = [] for snapshot in metadata.snapshots: manifests_to_delete += snapshot.manifests(io) manifest_lists_to_delete.add(snapshot.manifest_list) @@ -869,7 +864,7 @@ def purge_table(self, identifier: str | Identifier) -> None: def _create_staged_table( self, identifier: str | Identifier, - schema: Schema | "pa.Schema", + schema: Schema | pa.Schema, location: str | None = None, partition_spec: PartitionSpec = UNPARTITIONED_PARTITION_SPEC, sort_order: SortOrder = UNSORTED_SORT_ORDER, @@ -914,8 +909,8 @@ def _update_and_stage_table( self, current_table: Table | None, table_identifier: Identifier, - requirements: Tuple[TableRequirement, ...], - updates: Tuple[TableUpdate, ...], + requirements: tuple[TableRequirement, ...], + updates: tuple[TableUpdate, ...], ) -> StagedTable: for requirement in requirements: requirement.validate(current_table.metadata if current_table else None) @@ -940,13 +935,13 @@ def _update_and_stage_table( ) def _get_updated_props_and_update_summary( - self, current_properties: Properties, removals: Set[str] | None, updates: Properties - ) -> Tuple[PropertiesUpdateSummary, Properties]: + self, current_properties: Properties, removals: set[str] | None, updates: Properties + ) -> tuple[PropertiesUpdateSummary, Properties]: 
self._check_for_overlap(updates=updates, removals=removals) updated_properties = dict(current_properties) - removed: Set[str] = set() - updated: Set[str] = set() + removed: set[str] = set() + updated: set[str] = set() if removals: for key in removals: @@ -1028,7 +1023,7 @@ def _parse_metadata_version(metadata_location: str) -> int: return -1 @staticmethod - def _check_for_overlap(removals: Set[str] | None, updates: Properties) -> None: + def _check_for_overlap(removals: set[str] | None, updates: Properties) -> None: if updates and removals: overlap = set(removals) & set(updates.keys()) if overlap: diff --git a/pyiceberg/catalog/bigquery_metastore.py b/pyiceberg/catalog/bigquery_metastore.py index 4b1b922b41..b762c1047c 100644 --- a/pyiceberg/catalog/bigquery_metastore.py +++ b/pyiceberg/catalog/bigquery_metastore.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. import json -from typing import TYPE_CHECKING, Any, List, Set, Tuple, Union +from typing import TYPE_CHECKING, Any, Union from google.api_core.exceptions import NotFound from google.cloud.bigquery import Client, Dataset, DatasetReference, TableReference @@ -227,7 +227,7 @@ def drop_table(self, identifier: str | Identifier) -> None: raise NoSuchTableError(f"Table does not exist: {dataset_name}.{table_name}") from e def commit_table( - self, table: Table, requirements: Tuple[TableRequirement, ...], updates: Tuple[TableUpdate, ...] + self, table: Table, requirements: tuple[TableRequirement, ...], updates: tuple[TableUpdate, ...] 
) -> CommitTableResponse: raise NotImplementedError @@ -244,9 +244,9 @@ def drop_namespace(self, namespace: str | Identifier) -> None: except NotFound as e: raise NoSuchNamespaceError(f"Namespace {namespace} does not exist.") from e - def list_tables(self, namespace: str | Identifier) -> List[Identifier]: + def list_tables(self, namespace: str | Identifier) -> list[Identifier]: database_name = self.identifier_to_database(namespace) - iceberg_tables: List[Identifier] = [] + iceberg_tables: list[Identifier] = [] try: dataset_ref = DatasetReference(project=self.project_id, dataset_id=database_name) # The list_tables method returns an iterator of TableListItem @@ -258,7 +258,7 @@ def list_tables(self, namespace: str | Identifier) -> List[Identifier]: raise NoSuchNamespaceError(f"Namespace (dataset) '{database_name}' not found.") from None return iceberg_tables - def list_namespaces(self, namespace: str | Identifier = ()) -> List[Identifier]: + def list_namespaces(self, namespace: str | Identifier = ()) -> list[Identifier]: # Since this catalog only supports one-level namespaces, it always returns an empty list unless # passed an empty namespace to list all namespaces within the catalog. 
if namespace: @@ -299,7 +299,7 @@ def register_table(self, identifier: str | Identifier, metadata_location: str) - return self.load_table(identifier=identifier) - def list_views(self, namespace: str | Identifier) -> List[Identifier]: + def list_views(self, namespace: str | Identifier) -> list[Identifier]: raise NotImplementedError def drop_view(self, identifier: str | Identifier) -> None: @@ -321,7 +321,7 @@ def load_namespace_properties(self, namespace: str | Identifier) -> Properties: return {} def update_namespace_properties( - self, namespace: str | Identifier, removals: Set[str] | None = None, updates: Properties = EMPTY_DICT + self, namespace: str | Identifier, removals: set[str] | None = None, updates: Properties = EMPTY_DICT ) -> PropertiesUpdateSummary: raise NotImplementedError diff --git a/pyiceberg/catalog/dynamodb.py b/pyiceberg/catalog/dynamodb.py index 59ce9f1b13..2d35b2c5e2 100644 --- a/pyiceberg/catalog/dynamodb.py +++ b/pyiceberg/catalog/dynamodb.py @@ -19,11 +19,7 @@ from typing import ( TYPE_CHECKING, Any, - Dict, - List, Optional, - Set, - Tuple, Union, ) @@ -228,7 +224,7 @@ def register_table(self, identifier: str | Identifier, metadata_location: str) - raise NotImplementedError def commit_table( - self, table: Table, requirements: Tuple[TableRequirement, ...], updates: Tuple[TableUpdate, ...] + self, table: Table, requirements: tuple[TableRequirement, ...], updates: tuple[TableUpdate, ...] ) -> CommitTableResponse: """Commit updates to a table. @@ -400,7 +396,7 @@ def drop_namespace(self, namespace: str | Identifier) -> None: except ConditionalCheckFailedException as e: raise NoSuchNamespaceError(f"Database does not exist: {database_name}") from e - def list_tables(self, namespace: str | Identifier) -> List[Identifier]: + def list_tables(self, namespace: str | Identifier) -> list[Identifier]: """List Iceberg tables under the given namespace in the catalog. 
Args: @@ -444,7 +440,7 @@ def list_tables(self, namespace: str | Identifier) -> List[Identifier]: return table_identifiers - def list_namespaces(self, namespace: str | Identifier = ()) -> List[Identifier]: + def list_namespaces(self, namespace: str | Identifier = ()) -> list[Identifier]: """List top-level namespaces from the catalog. We do not support hierarchical namespace. @@ -505,7 +501,7 @@ def load_namespace_properties(self, namespace: str | Identifier) -> Properties: return _get_namespace_properties(namespace_dict=namespace_dict) def update_namespace_properties( - self, namespace: str | Identifier, removals: Set[str] | None = None, updates: Properties = EMPTY_DICT + self, namespace: str | Identifier, removals: set[str] | None = None, updates: Properties = EMPTY_DICT ) -> PropertiesUpdateSummary: """ Remove or update provided property keys for a namespace. @@ -541,7 +537,7 @@ def update_namespace_properties( return properties_update_summary - def list_views(self, namespace: str | Identifier) -> List[Identifier]: + def list_views(self, namespace: str | Identifier) -> list[Identifier]: raise NotImplementedError def drop_view(self, identifier: str | Identifier) -> None: @@ -550,22 +546,22 @@ def drop_view(self, identifier: str | Identifier) -> None: def view_exists(self, identifier: str | Identifier) -> bool: raise NotImplementedError - def _get_iceberg_table_item(self, database_name: str, table_name: str) -> Dict[str, Any]: + def _get_iceberg_table_item(self, database_name: str, table_name: str) -> dict[str, Any]: try: return self._get_dynamo_item(identifier=f"{database_name}.{table_name}", namespace=database_name) except ValueError as e: raise NoSuchTableError(f"Table does not exist: {database_name}.{table_name}") from e - def _get_iceberg_namespace_item(self, database_name: str) -> Dict[str, Any]: + def _get_iceberg_namespace_item(self, database_name: str) -> dict[str, Any]: try: return self._get_dynamo_item(identifier=DYNAMODB_NAMESPACE, 
namespace=database_name) except ValueError as e: raise NoSuchNamespaceError(f"Namespace does not exist: {database_name}") from e - def _ensure_namespace_exists(self, database_name: str) -> Dict[str, Any]: + def _ensure_namespace_exists(self, database_name: str) -> dict[str, Any]: return self._get_iceberg_namespace_item(database_name) - def _get_dynamo_item(self, identifier: str, namespace: str) -> Dict[str, Any]: + def _get_dynamo_item(self, identifier: str, namespace: str) -> dict[str, Any]: try: response = self.dynamodb.get_item( TableName=self.dynamodb_table_name, @@ -592,7 +588,7 @@ def _get_dynamo_item(self, identifier: str, namespace: str) -> Dict[str, Any]: ) as e: raise GenericDynamoDbError(e.message) from e - def _put_dynamo_item(self, item: Dict[str, Any], condition_expression: str) -> None: + def _put_dynamo_item(self, item: dict[str, Any], condition_expression: str) -> None: try: self.dynamodb.put_item(TableName=self.dynamodb_table_name, Item=item, ConditionExpression=condition_expression) except self.dynamodb.exceptions.ConditionalCheckFailedException as e: @@ -635,7 +631,7 @@ def _delete_dynamo_item(self, namespace: str, identifier: str, condition_express ) as e: raise GenericDynamoDbError(e.message) from e - def _convert_dynamo_table_item_to_iceberg_table(self, dynamo_table_item: Dict[str, Any]) -> Table: + def _convert_dynamo_table_item_to_iceberg_table(self, dynamo_table_item: dict[str, Any]) -> Table: table_dict = _convert_dynamo_item_to_regular_dict(dynamo_table_item) for prop in [_add_property_prefix(prop) for prop in (TABLE_TYPE, METADATA_LOCATION)] + [ @@ -672,7 +668,7 @@ def _get_default_warehouse_location(self, database_name: str, table_name: str) - return self._get_hive_style_warehouse_location(database_name, table_name) -def _get_create_table_item(database_name: str, table_name: str, properties: Properties, metadata_location: str) -> Dict[str, Any]: +def _get_create_table_item(database_name: str, table_name: str, properties: Properties, 
metadata_location: str) -> dict[str, Any]: current_timestamp_ms = str(round(time() * 1000)) _dict = { DYNAMODB_COL_IDENTIFIER: { @@ -702,7 +698,7 @@ def _get_create_table_item(database_name: str, table_name: str, properties: Prop return _dict -def _get_rename_table_item(from_dynamo_table_item: Dict[str, Any], to_database_name: str, to_table_name: str) -> Dict[str, Any]: +def _get_rename_table_item(from_dynamo_table_item: dict[str, Any], to_database_name: str, to_table_name: str) -> dict[str, Any]: _dict = from_dynamo_table_item current_timestamp_ms = str(round(time() * 1000)) _dict[DYNAMODB_COL_IDENTIFIER]["S"] = f"{to_database_name}.{to_table_name}" @@ -712,7 +708,7 @@ def _get_rename_table_item(from_dynamo_table_item: Dict[str, Any], to_database_n return _dict -def _get_create_database_item(database_name: str, properties: Properties) -> Dict[str, Any]: +def _get_create_database_item(database_name: str, properties: Properties) -> dict[str, Any]: current_timestamp_ms = str(round(time() * 1000)) _dict = { DYNAMODB_COL_IDENTIFIER: { @@ -738,7 +734,7 @@ def _get_create_database_item(database_name: str, properties: Properties) -> Dic return _dict -def _get_update_database_item(namespace_item: Dict[str, Any], updated_properties: Properties) -> Dict[str, Any]: +def _get_update_database_item(namespace_item: dict[str, Any], updated_properties: Properties) -> dict[str, Any]: current_timestamp_ms = str(round(time() * 1000)) _dict = { @@ -802,11 +798,11 @@ def _get_update_database_item(namespace_item: Dict[str, Any], updated_properties ] -def _get_namespace_properties(namespace_dict: Dict[str, str]) -> Properties: +def _get_namespace_properties(namespace_dict: dict[str, str]) -> Properties: return {_remove_property_prefix(key): val for key, val in namespace_dict.items() if key.startswith(PROPERTY_KEY_PREFIX)} -def _convert_dynamo_item_to_regular_dict(dynamo_json: Dict[str, Any]) -> Dict[str, str]: +def _convert_dynamo_item_to_regular_dict(dynamo_json: dict[str, Any]) -> 
dict[str, str]: """Convert a dynamo json to a regular json. Example of a dynamo json: diff --git a/pyiceberg/catalog/glue.py b/pyiceberg/catalog/glue.py index f19cb6dec0..7260b29447 100644 --- a/pyiceberg/catalog/glue.py +++ b/pyiceberg/catalog/glue.py @@ -19,11 +19,7 @@ from typing import ( TYPE_CHECKING, Any, - Dict, - List, Optional, - Set, - Tuple, Union, cast, ) @@ -178,7 +174,7 @@ class _IcebergSchemaToGlueType(SchemaVisitor[str]): def schema(self, schema: Schema, struct_result: str) -> str: return struct_result - def struct(self, struct: StructType, field_results: List[str]) -> str: + def struct(self, struct: StructType, field_results: list[str]) -> str: return f"struct<{','.join(field_results)}>" def field(self, field: NestedField, field_result: str) -> str: @@ -198,8 +194,8 @@ def primitive(self, primitive: PrimitiveType) -> str: return GLUE_PRIMITIVE_TYPES[primitive_type] -def _to_columns(metadata: TableMetadata) -> List["ColumnTypeDef"]: - results: Dict[str, "ColumnTypeDef"] = {} +def _to_columns(metadata: TableMetadata) -> list["ColumnTypeDef"]: + results: dict[str, ColumnTypeDef] = {} def _append_to_results(field: NestedField, is_current: bool) -> None: if field.name in results: @@ -241,7 +237,7 @@ def _construct_table_input( glue_table: Optional["TableTypeDef"] = None, prev_metadata_location: str | None = None, ) -> "TableInputTypeDef": - table_input: "TableInputTypeDef" = { + table_input: TableInputTypeDef = { "Name": table_name, "TableType": EXTERNAL_TABLE, "Parameters": _construct_parameters(metadata_location, glue_table, prev_metadata_location, properties), @@ -258,7 +254,7 @@ def _construct_table_input( def _construct_rename_table_input(to_table_name: str, glue_table: "TableTypeDef") -> "TableInputTypeDef": - rename_table_input: "TableInputTypeDef" = {"Name": to_table_name} + rename_table_input: TableInputTypeDef = {"Name": to_table_name} # use the same Glue info to create the new table, pointing to the old metadata if not 
glue_table["TableType"]: raise ValueError("Glue table type is missing, cannot rename table") @@ -283,7 +279,7 @@ def _construct_rename_table_input(to_table_name: str, glue_table: "TableTypeDef" def _construct_database_input(database_name: str, properties: Properties) -> "DatabaseInputTypeDef": - database_input: "DatabaseInputTypeDef" = {"Name": database_name} + database_input: DatabaseInputTypeDef = {"Name": database_name} parameters = {} for k, v in properties.items(): if k == "Description": @@ -305,7 +301,7 @@ def _register_glue_catalog_id_with_glue_client(glue: "GlueClient", glue_catalog_ """ event_system = glue.meta.events - def add_glue_catalog_id(params: Dict[str, str], **kwargs: Any) -> None: + def add_glue_catalog_id(params: dict[str, str], **kwargs: Any) -> None: if "CatalogId" not in params: params["CatalogId"] = glue_catalog_id @@ -487,7 +483,7 @@ def register_table(self, identifier: str | Identifier, metadata_location: str) - return self.load_table(identifier=identifier) def commit_table( - self, table: Table, requirements: Tuple[TableRequirement, ...], updates: Tuple[TableUpdate, ...] + self, table: Table, requirements: tuple[TableRequirement, ...], updates: tuple[TableUpdate, ...] ) -> CommitTableResponse: """Commit updates to a table. @@ -506,7 +502,7 @@ def commit_table( table_identifier = table.name() database_name, table_name = self.identifier_to_database_and_table(table_identifier, NoSuchTableError) - current_glue_table: "TableTypeDef" | None + current_glue_table: TableTypeDef | None glue_table_version_id: str | None current_table: Table | None try: @@ -705,7 +701,7 @@ def drop_namespace(self, namespace: str | Identifier) -> None: ) self.glue.delete_database(Name=database_name) - def list_tables(self, namespace: str | Identifier) -> List[Identifier]: + def list_tables(self, namespace: str | Identifier) -> list[Identifier]: """List Iceberg tables under the given namespace in the catalog. 
Args: @@ -718,7 +714,7 @@ def list_tables(self, namespace: str | Identifier) -> List[Identifier]: NoSuchNamespaceError: If a namespace with the given name does not exist, or the identifier is invalid. """ database_name = self.identifier_to_database(namespace, NoSuchNamespaceError) - table_list: List["TableTypeDef"] = [] + table_list: list[TableTypeDef] = [] next_token: str | None = None try: while True: @@ -736,7 +732,7 @@ def list_tables(self, namespace: str | Identifier) -> List[Identifier]: raise NoSuchNamespaceError(f"Database does not exist: {database_name}") from e return [(database_name, table["Name"]) for table in table_list if self.__is_iceberg_table(table)] - def list_namespaces(self, namespace: str | Identifier = ()) -> List[Identifier]: + def list_namespaces(self, namespace: str | Identifier = ()) -> list[Identifier]: """List namespaces from the given namespace. If not given, list top-level namespaces from the catalog. Returns: @@ -746,7 +742,7 @@ def list_namespaces(self, namespace: str | Identifier = ()) -> List[Identifier]: if namespace: return [] - database_list: List["DatabaseTypeDef"] = [] + database_list: list[DatabaseTypeDef] = [] next_token: str | None = None while True: @@ -789,7 +785,7 @@ def load_namespace_properties(self, namespace: str | Identifier) -> Properties: return properties def update_namespace_properties( - self, namespace: str | Identifier, removals: Set[str] | None = None, updates: Properties = EMPTY_DICT + self, namespace: str | Identifier, removals: set[str] | None = None, updates: Properties = EMPTY_DICT ) -> PropertiesUpdateSummary: """Remove provided property keys and updates properties for a namespace. 
@@ -812,7 +808,7 @@ def update_namespace_properties( return properties_update_summary - def list_views(self, namespace: str | Identifier) -> List[Identifier]: + def list_views(self, namespace: str | Identifier) -> list[Identifier]: raise NotImplementedError def drop_view(self, identifier: str | Identifier) -> None: diff --git a/pyiceberg/catalog/hive.py b/pyiceberg/catalog/hive.py index a6f7131b06..e096470451 100644 --- a/pyiceberg/catalog/hive.py +++ b/pyiceberg/catalog/hive.py @@ -22,11 +22,6 @@ from typing import ( TYPE_CHECKING, Any, - Dict, - List, - Set, - Tuple, - Type, Union, ) from urllib.parse import urlparse @@ -148,7 +143,7 @@ class _HiveClient: """Helper class to nicely open and close the transport.""" _transport: TTransport - _ugi: List[str] | None + _ugi: list[str] | None def __init__( self, @@ -194,7 +189,7 @@ def __enter__(self) -> Client: self._transport.open() return self._client() # recreate the client - def __exit__(self, exctype: Type[BaseException] | None, excinst: BaseException | None, exctb: TracebackType | None) -> None: + def __exit__(self, exctype: type[BaseException] | None, excinst: BaseException | None, exctb: TracebackType | None) -> None: """Close transport if it was opened.""" if self._transport.isOpen(): self._transport.close() @@ -223,7 +218,7 @@ def _construct_hive_storage_descriptor(schema: Schema, location: str | None, hiv def _construct_parameters( metadata_location: str, previous_metadata_location: str | None = None, metadata_properties: Properties | None = None -) -> Dict[str, Any]: +) -> dict[str, Any]: properties = {PROP_EXTERNAL: "TRUE", PROP_TABLE_TYPE: "ICEBERG", PROP_METADATA_LOCATION: metadata_location} if previous_metadata_location: properties[PROP_PREVIOUS_METADATA_LOCATION] = previous_metadata_location @@ -276,7 +271,7 @@ def __init__(self, hive2_compatible: bool): def schema(self, schema: Schema, struct_result: str) -> str: return struct_result - def struct(self, struct: StructType, field_results: List[str]) -> 
str: + def struct(self, struct: StructType, field_results: list[str]) -> str: return f"struct<{','.join(field_results)}>" def field(self, field: NestedField, field_result: str) -> str: @@ -315,7 +310,7 @@ def __init__(self, name: str, **properties: str): ) @staticmethod - def _create_hive_client(properties: Dict[str, str]) -> _HiveClient: + def _create_hive_client(properties: dict[str, str]) -> _HiveClient: last_exception = None for uri in properties[URI].split(","): try: @@ -333,7 +328,7 @@ def _create_hive_client(properties: Dict[str, str]) -> _HiveClient: raise ValueError(f"Unable to connect to hive using uri: {properties[URI]}") def _convert_hive_into_iceberg(self, table: HiveTable) -> Table: - properties: Dict[str, str] = table.parameters + properties: dict[str, str] = table.parameters if TABLE_TYPE not in properties: raise NoSuchPropertyException( f"Property table_type missing, could not determine type: {table.dbName}.{table.tableName}" @@ -469,7 +464,7 @@ def register_table(self, identifier: str | Identifier, metadata_location: str) - return self._convert_hive_into_iceberg(hive_table) - def list_views(self, namespace: str | Identifier) -> List[Identifier]: + def list_views(self, namespace: str | Identifier) -> list[Identifier]: raise NotImplementedError def view_exists(self, identifier: str | Identifier) -> bool: @@ -505,7 +500,7 @@ def _do_wait_for_lock() -> LockResponse: return _do_wait_for_lock() def commit_table( - self, table: Table, requirements: Tuple[TableRequirement, ...], updates: Tuple[TableUpdate, ...] + self, table: Table, requirements: tuple[TableRequirement, ...], updates: tuple[TableUpdate, ...] ) -> CommitTableResponse: """Commit updates to a table. 
@@ -715,7 +710,7 @@ def drop_namespace(self, namespace: str | Identifier) -> None: except MetaException as e: raise NoSuchNamespaceError(f"Database does not exists: {database_name}") from e - def list_tables(self, namespace: str | Identifier) -> List[Identifier]: + def list_tables(self, namespace: str | Identifier) -> list[Identifier]: """List Iceberg tables under the given namespace in the catalog. When the database doesn't exist, it will just return an empty list. @@ -739,7 +734,7 @@ def list_tables(self, namespace: str | Identifier) -> List[Identifier]: if table.parameters.get(TABLE_TYPE, "").lower() == ICEBERG ] - def list_namespaces(self, namespace: str | Identifier = ()) -> List[Identifier]: + def list_namespaces(self, namespace: str | Identifier = ()) -> list[Identifier]: """List namespaces from the given namespace. If not given, list top-level namespaces from the catalog. Returns: @@ -777,7 +772,7 @@ def load_namespace_properties(self, namespace: str | Identifier) -> Properties: raise NoSuchNamespaceError(f"Database does not exists: {database_name}") from e def update_namespace_properties( - self, namespace: str | Identifier, removals: Set[str] | None = None, updates: Properties = EMPTY_DICT + self, namespace: str | Identifier, removals: set[str] | None = None, updates: Properties = EMPTY_DICT ) -> PropertiesUpdateSummary: """Remove provided property keys and update properties for a namespace. @@ -799,8 +794,8 @@ def update_namespace_properties( except NoSuchObjectException as e: raise NoSuchNamespaceError(f"Database does not exists: {database_name}") from e - removed: Set[str] = set() - updated: Set[str] = set() + removed: set[str] = set() + updated: set[str] = set() if removals: for key in removals: diff --git a/pyiceberg/catalog/noop.py b/pyiceberg/catalog/noop.py index 08b71d90af..ac2423c198 100644 --- a/pyiceberg/catalog/noop.py +++ b/pyiceberg/catalog/noop.py @@ -16,9 +16,6 @@ # under the License. 
from typing import ( TYPE_CHECKING, - List, - Set, - Tuple, Union, ) @@ -95,7 +92,7 @@ def rename_table(self, from_identifier: str | Identifier, to_identifier: str | I raise NotImplementedError def commit_table( - self, table: Table, requirements: Tuple[TableRequirement, ...], updates: Tuple[TableUpdate, ...] + self, table: Table, requirements: tuple[TableRequirement, ...], updates: tuple[TableUpdate, ...] ) -> CommitTableResponse: raise NotImplementedError @@ -105,21 +102,21 @@ def create_namespace(self, namespace: str | Identifier, properties: Properties = def drop_namespace(self, namespace: str | Identifier) -> None: raise NotImplementedError - def list_tables(self, namespace: str | Identifier) -> List[Identifier]: + def list_tables(self, namespace: str | Identifier) -> list[Identifier]: raise NotImplementedError - def list_namespaces(self, namespace: str | Identifier = ()) -> List[Identifier]: + def list_namespaces(self, namespace: str | Identifier = ()) -> list[Identifier]: raise NotImplementedError def load_namespace_properties(self, namespace: str | Identifier) -> Properties: raise NotImplementedError def update_namespace_properties( - self, namespace: str | Identifier, removals: Set[str] | None = None, updates: Properties = EMPTY_DICT + self, namespace: str | Identifier, removals: set[str] | None = None, updates: Properties = EMPTY_DICT ) -> PropertiesUpdateSummary: raise NotImplementedError - def list_views(self, namespace: str | Identifier) -> List[Identifier]: + def list_views(self, namespace: str | Identifier) -> list[Identifier]: raise NotImplementedError def view_exists(self, identifier: str | Identifier) -> bool: diff --git a/pyiceberg/catalog/rest/__init__.py b/pyiceberg/catalog/rest/__init__.py index e9571aa491..3b77fd47f0 100644 --- a/pyiceberg/catalog/rest/__init__.py +++ b/pyiceberg/catalog/rest/__init__.py @@ -18,10 +18,6 @@ from typing import ( TYPE_CHECKING, Any, - Dict, - List, - Set, - Tuple, Union, ) @@ -164,11 +160,11 @@ class 
CreateTableRequest(IcebergBaseModel): partition_spec: PartitionSpec | None = Field(alias="partition-spec") write_order: SortOrder | None = Field(alias="write-order") stage_create: bool = Field(alias="stage-create", default=False) - properties: Dict[str, str] = Field(default_factory=dict) + properties: dict[str, str] = Field(default_factory=dict) # validators @field_validator("properties", mode="before") - def transform_properties_dict_value_to_str(cls, properties: Properties) -> Dict[str, str]: + def transform_properties_dict_value_to_str(cls, properties: Properties) -> dict[str, str]: return transform_dict_value_to_str(properties) @@ -183,7 +179,7 @@ class ConfigResponse(IcebergBaseModel): class ListNamespaceResponse(IcebergBaseModel): - namespaces: List[Identifier] = Field() + namespaces: list[Identifier] = Field() class NamespaceResponse(IcebergBaseModel): @@ -192,9 +188,9 @@ class NamespaceResponse(IcebergBaseModel): class UpdateNamespacePropertiesResponse(IcebergBaseModel): - removed: List[str] = Field() - updated: List[str] = Field() - missing: List[str] = Field() + removed: list[str] = Field() + updated: list[str] = Field() + missing: list[str] = Field() class ListTableResponseEntry(IcebergBaseModel): @@ -208,11 +204,11 @@ class ListViewResponseEntry(IcebergBaseModel): class ListTablesResponse(IcebergBaseModel): - identifiers: List[ListTableResponseEntry] = Field() + identifiers: list[ListTableResponseEntry] = Field() class ListViewsResponse(IcebergBaseModel): - identifiers: List[ListViewResponseEntry] = Field() + identifiers: list[ListViewResponseEntry] = Field() class RestCatalog(Catalog): @@ -346,7 +342,7 @@ def _warn_oauth_tokens_deprecation(self) -> None: "endpoint is explicitly configured. 
See https://github.com/apache/iceberg/issues/10537", ) - def _extract_optional_oauth_params(self) -> Dict[str, str]: + def _extract_optional_oauth_params(self) -> dict[str, str]: optional_oauth_param = {SCOPE: self.properties.get(SCOPE) or CATALOG_SCOPE} set_of_optional_params = {AUDIENCE, RESOURCE} for param in set_of_optional_params: @@ -391,7 +387,7 @@ def _split_identifier_for_path( return {"namespace": NAMESPACE_SEPARATOR.join(identifier_tuple[:-1]), kind.value: identifier_tuple[-1]} - def _split_identifier_for_json(self, identifier: str | Identifier) -> Dict[str, Identifier | str]: + def _split_identifier_for_json(self, identifier: str | Identifier) -> dict[str, Identifier | str]: identifier_tuple = self._identifier_to_validated_tuple(identifier) return {"namespace": identifier_tuple[:-1], "name": identifier_tuple[-1]} @@ -447,7 +443,7 @@ def add_headers(self, request: PreparedRequest, **kwargs: Any) -> None: # pylin session.mount(self.uri, SigV4Adapter(**self.properties)) - def _response_to_table(self, identifier_tuple: Tuple[str, ...], table_response: TableResponse) -> Table: + def _response_to_table(self, identifier_tuple: tuple[str, ...], table_response: TableResponse) -> Table: return Table( identifier=identifier_tuple, metadata_location=table_response.metadata_location, # type: ignore @@ -459,7 +455,7 @@ def _response_to_table(self, identifier_tuple: Tuple[str, ...], table_response: config=table_response.config, ) - def _response_to_staged_table(self, identifier_tuple: Tuple[str, ...], table_response: TableResponse) -> StagedTable: + def _response_to_staged_table(self, identifier_tuple: tuple[str, ...], table_response: TableResponse) -> StagedTable: return StagedTable( identifier=identifier_tuple, metadata_location=table_response.metadata_location, # type: ignore @@ -602,7 +598,7 @@ def register_table(self, identifier: str | Identifier, metadata_location: str) - return self._response_to_table(self.identifier_to_tuple(identifier), table_response) 
@retry(**_RETRY_ARGS) - def list_tables(self, namespace: str | Identifier) -> List[Identifier]: + def list_tables(self, namespace: str | Identifier) -> list[Identifier]: namespace_tuple = self._check_valid_namespace_identifier(namespace) namespace_concat = NAMESPACE_SEPARATOR.join(namespace_tuple) response = self._session.get(self.url(Endpoints.list_tables, namespace=namespace_concat)) @@ -683,7 +679,7 @@ def _remove_catalog_name_from_table_request_identifier(self, table_request: Comm return table_request @retry(**_RETRY_ARGS) - def list_views(self, namespace: str | Identifier) -> List[Identifier]: + def list_views(self, namespace: str | Identifier) -> list[Identifier]: namespace_tuple = self._check_valid_namespace_identifier(namespace) namespace_concat = NAMESPACE_SEPARATOR.join(namespace_tuple) response = self._session.get(self.url(Endpoints.list_views, namespace=namespace_concat)) @@ -695,7 +691,7 @@ def list_views(self, namespace: str | Identifier) -> List[Identifier]: @retry(**_RETRY_ARGS) def commit_table( - self, table: Table, requirements: Tuple[TableRequirement, ...], updates: Tuple[TableUpdate, ...] + self, table: Table, requirements: tuple[TableRequirement, ...], updates: tuple[TableUpdate, ...] ) -> CommitTableResponse: """Commit updates to a table. 
@@ -760,7 +756,7 @@ def drop_namespace(self, namespace: str | Identifier) -> None: _handle_non_200_response(exc, {404: NoSuchNamespaceError, 409: NamespaceNotEmptyError}) @retry(**_RETRY_ARGS) - def list_namespaces(self, namespace: str | Identifier = ()) -> List[Identifier]: + def list_namespaces(self, namespace: str | Identifier = ()) -> list[Identifier]: namespace_tuple = self.identifier_to_tuple(namespace) response = self._session.get( self.url( @@ -790,7 +786,7 @@ def load_namespace_properties(self, namespace: str | Identifier) -> Properties: @retry(**_RETRY_ARGS) def update_namespace_properties( - self, namespace: str | Identifier, removals: Set[str] | None = None, updates: Properties = EMPTY_DICT + self, namespace: str | Identifier, removals: set[str] | None = None, updates: Properties = EMPTY_DICT ) -> PropertiesUpdateSummary: namespace_tuple = self._check_valid_namespace_identifier(namespace) namespace = NAMESPACE_SEPARATOR.join(namespace_tuple) diff --git a/pyiceberg/catalog/rest/auth.py b/pyiceberg/catalog/rest/auth.py index 7f56f6300b..3fdc837c19 100644 --- a/pyiceberg/catalog/rest/auth.py +++ b/pyiceberg/catalog/rest/auth.py @@ -22,7 +22,7 @@ import time from abc import ABC, abstractmethod from functools import cached_property -from typing import Any, Dict, List, Type +from typing import Any import requests from requests import HTTPError, PreparedRequest, Session @@ -76,7 +76,7 @@ class LegacyOAuth2AuthManager(AuthManager): _auth_url: str | None _token: str | None _credential: str | None - _optional_oauth_params: Dict[str, str] | None + _optional_oauth_params: dict[str, str] | None def __init__( self, @@ -84,7 +84,7 @@ def __init__( auth_url: str | None = None, credential: str | None = None, initial_token: str | None = None, - optional_oauth_params: Dict[str, str] | None = None, + optional_oauth_params: dict[str, str] | None = None, ): self._session = session self._auth_url = auth_url @@ -220,7 +220,7 @@ def auth_header(self) -> str: class 
GoogleAuthManager(AuthManager): """An auth manager that is responsible for handling Google credentials.""" - def __init__(self, credentials_path: str | None = None, scopes: List[str] | None = None): + def __init__(self, credentials_path: str | None = None, scopes: list[str] | None = None): """ Initialize GoogleAuthManager. @@ -280,10 +280,10 @@ def __call__(self, request: PreparedRequest) -> PreparedRequest: class AuthManagerFactory: - _registry: Dict[str, Type["AuthManager"]] = {} + _registry: dict[str, type["AuthManager"]] = {} @classmethod - def register(cls, name: str, auth_manager_class: Type["AuthManager"]) -> None: + def register(cls, name: str, auth_manager_class: type["AuthManager"]) -> None: """ Register a string name to a known AuthManager class. @@ -297,7 +297,7 @@ def register(cls, name: str, auth_manager_class: Type["AuthManager"]) -> None: cls._registry[name] = auth_manager_class @classmethod - def create(cls, class_or_name: str, config: Dict[str, Any]) -> AuthManager: + def create(cls, class_or_name: str, config: dict[str, Any]) -> AuthManager: """ Create an AuthManager by name or fully-qualified class path. diff --git a/pyiceberg/catalog/rest/response.py b/pyiceberg/catalog/rest/response.py index d28a7c3f71..157e4bfa16 100644 --- a/pyiceberg/catalog/rest/response.py +++ b/pyiceberg/catalog/rest/response.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. 
from json import JSONDecodeError -from typing import Dict, Literal, Type +from typing import Literal from pydantic import Field, ValidationError from requests import HTTPError @@ -60,8 +60,8 @@ class OAuthErrorResponse(IcebergBaseModel): error_uri: str | None = None -def _handle_non_200_response(exc: HTTPError, error_handler: Dict[int, Type[Exception]]) -> None: - exception: Type[Exception] +def _handle_non_200_response(exc: HTTPError, error_handler: dict[int, type[Exception]]) -> None: + exception: type[Exception] if exc.response is None: raise ValueError("Did not receive a response") diff --git a/pyiceberg/catalog/sql.py b/pyiceberg/catalog/sql.py index cefb22b95b..2b6fa74517 100644 --- a/pyiceberg/catalog/sql.py +++ b/pyiceberg/catalog/sql.py @@ -17,9 +17,6 @@ from typing import ( TYPE_CHECKING, - List, - Set, - Tuple, Union, ) @@ -402,7 +399,7 @@ def rename_table(self, from_identifier: str | Identifier, to_identifier: str | I return self.load_table(to_identifier) def commit_table( - self, table: Table, requirements: Tuple[TableRequirement, ...], updates: Tuple[TableUpdate, ...] + self, table: Table, requirements: tuple[TableRequirement, ...], updates: tuple[TableUpdate, ...] ) -> CommitTableResponse: """Commit updates to a table. @@ -583,7 +580,7 @@ def drop_namespace(self, namespace: str | Identifier) -> None: ) session.commit() - def list_tables(self, namespace: str | Identifier) -> List[Identifier]: + def list_tables(self, namespace: str | Identifier) -> list[Identifier]: """List tables under the given namespace in the catalog. 
Args: @@ -604,7 +601,7 @@ def list_tables(self, namespace: str | Identifier) -> List[Identifier]: result = session.scalars(stmt) return [(Catalog.identifier_to_tuple(table.table_namespace) + (table.table_name,)) for table in result] - def list_namespaces(self, namespace: str | Identifier = ()) -> List[Identifier]: + def list_namespaces(self, namespace: str | Identifier = ()) -> list[Identifier]: """List namespaces from the given namespace. If not given, list top-level namespaces from the catalog. Args: @@ -669,7 +666,7 @@ def load_namespace_properties(self, namespace: str | Identifier) -> Properties: return {props.property_key: props.property_value for props in result} def update_namespace_properties( - self, namespace: str | Identifier, removals: Set[str] | None = None, updates: Properties = EMPTY_DICT + self, namespace: str | Identifier, removals: set[str] | None = None, updates: Properties = EMPTY_DICT ) -> PropertiesUpdateSummary: """Remove provided property keys and update properties for a namespace. @@ -724,7 +721,7 @@ def update_namespace_properties( session.commit() return properties_update_summary - def list_views(self, namespace: str | Identifier) -> List[Identifier]: + def list_views(self, namespace: str | Identifier) -> list[Identifier]: raise NotImplementedError def view_exists(self, identifier: str | Identifier) -> bool: diff --git a/pyiceberg/cli/console.py b/pyiceberg/cli/console.py index f3adf830b2..9baa813eff 100644 --- a/pyiceberg/cli/console.py +++ b/pyiceberg/cli/console.py @@ -15,13 +15,11 @@ # specific language governing permissions and limitations # under the License. 
# pylint: disable=broad-except,redefined-builtin,redefined-outer-name +from collections.abc import Callable from functools import wraps from typing import ( Any, - Callable, - Dict, Literal, - Tuple, ) import click @@ -97,7 +95,7 @@ def run( ctx.exit(1) -def _catalog_and_output(ctx: Context) -> Tuple[Catalog, Output]: +def _catalog_and_output(ctx: Context) -> tuple[Catalog, Output]: """Small helper to set the types.""" return ctx.obj["catalog"], ctx.obj["output"] @@ -430,7 +428,7 @@ def list_refs(ctx: Context, identifier: str, type: str, verbose: bool) -> None: output.describe_refs(relevant_refs) -def _retention_properties(ref: SnapshotRef, table_properties: Dict[str, str]) -> Dict[str, str]: +def _retention_properties(ref: SnapshotRef, table_properties: dict[str, str]) -> dict[str, str]: retention_properties = {} if ref.snapshot_ref_type == SnapshotRefType.BRANCH: default_min_snapshots_to_keep = property_as_int( diff --git a/pyiceberg/cli/output.py b/pyiceberg/cli/output.py index b546877fac..332221008c 100644 --- a/pyiceberg/cli/output.py +++ b/pyiceberg/cli/output.py @@ -18,9 +18,6 @@ from abc import ABC, abstractmethod from typing import ( Any, - Dict, - List, - Tuple, ) from uuid import UUID @@ -43,7 +40,7 @@ class Output(ABC): def exception(self, ex: Exception) -> None: ... @abstractmethod - def identifiers(self, identifiers: List[Identifier]) -> None: ... + def identifiers(self, identifiers: list[Identifier]) -> None: ... @abstractmethod def describe_table(self, table: Table) -> None: ... @@ -70,7 +67,7 @@ def uuid(self, uuid: UUID | None) -> None: ... def version(self, version: str) -> None: ... @abstractmethod - def describe_refs(self, refs: List[Tuple[str, SnapshotRefType, Dict[str, str]]]) -> None: ... + def describe_refs(self, refs: list[tuple[str, SnapshotRefType, dict[str, str]]]) -> None: ... 
class ConsoleOutput(Output): @@ -91,7 +88,7 @@ def exception(self, ex: Exception) -> None: else: Console(stderr=True).print(ex) - def identifiers(self, identifiers: List[Identifier]) -> None: + def identifiers(self, identifiers: list[Identifier]) -> None: table = self._table for identifier in identifiers: table.add_row(".".join(identifier)) @@ -174,7 +171,7 @@ def uuid(self, uuid: UUID | None) -> None: def version(self, version: str) -> None: Console().print(version) - def describe_refs(self, ref_details: List[Tuple[str, SnapshotRefType, Dict[str, str]]]) -> None: + def describe_refs(self, ref_details: list[tuple[str, SnapshotRefType, dict[str, str]]]) -> None: refs_table = RichTable(title="Snapshot Refs") refs_table.add_column("Ref") refs_table.add_column("Type") @@ -202,7 +199,7 @@ def _out(self, d: Any) -> None: def exception(self, ex: Exception) -> None: self._out({"type": ex.__class__.__name__, "message": str(ex)}) - def identifiers(self, identifiers: List[Identifier]) -> None: + def identifiers(self, identifiers: list[Identifier]) -> None: self._out([".".join(identifier) for identifier in identifiers]) def describe_table(self, table: Table) -> None: @@ -240,7 +237,7 @@ def uuid(self, uuid: UUID | None) -> None: def version(self, version: str) -> None: self._out({"version": version}) - def describe_refs(self, refs: List[Tuple[str, SnapshotRefType, Dict[str, str]]]) -> None: + def describe_refs(self, refs: list[tuple[str, SnapshotRefType, dict[str, str]]]) -> None: self._out( [ {"name": name, "type": type, detail_key: detail_val} diff --git a/pyiceberg/conversions.py b/pyiceberg/conversions.py index b4eaea1b8e..8739a1ab08 100644 --- a/pyiceberg/conversions.py +++ b/pyiceberg/conversions.py @@ -31,13 +31,13 @@ import codecs import uuid +from collections.abc import Callable from datetime import date, datetime, time from decimal import Decimal from functools import singledispatch from struct import Struct from typing import ( Any, - Callable, ) from 
pyiceberg.typedef import UTF8, L diff --git a/pyiceberg/expressions/__init__.py b/pyiceberg/expressions/__init__.py index 330d22b1a4..14ee9328c1 100644 --- a/pyiceberg/expressions/__init__.py +++ b/pyiceberg/expressions/__init__.py @@ -18,44 +18,26 @@ from __future__ import annotations from abc import ABC, abstractmethod +from collections.abc import Callable, Iterable, Sequence from functools import cached_property -from typing import ( - Any, - Callable, - Generic, - Iterable, - Sequence, - Set, - Tuple, - Type, - TypeVar, -) +from typing import Any, TypeAlias from typing import Literal as TypingLiteral -from pydantic import ConfigDict, Field +from pydantic import ConfigDict, Field, SerializeAsAny, model_validator +from pydantic_core.core_schema import ValidatorFunctionWrapHandler -from pyiceberg.expressions.literals import ( - AboveMax, - BelowMin, - Literal, - literal, -) +from pyiceberg.expressions.literals import AboveMax, BelowMin, Literal, literal from pyiceberg.schema import Accessor, Schema -from pyiceberg.typedef import IcebergBaseModel, IcebergRootModel, L, StructProtocol +from pyiceberg.typedef import IcebergBaseModel, IcebergRootModel, L, LiteralValue, StructProtocol from pyiceberg.types import DoubleType, FloatType, NestedField from pyiceberg.utils.singleton import Singleton -try: - from pydantic import ConfigDict -except ImportError: - ConfigDict = dict - -def _to_unbound_term(term: str | UnboundTerm[Any]) -> UnboundTerm[Any]: +def _to_unbound_term(term: str | UnboundTerm) -> UnboundTerm: return Reference(term) if isinstance(term, str) else term -def _to_literal_set(values: Iterable[L] | Iterable[Literal[L]]) -> Set[Literal[L]]: +def _to_literal_set(values: Iterable[L] | Iterable[Literal[L]]) -> set[Literal[L]]: return {_to_literal(v) for v in values} @@ -66,7 +48,7 @@ def _to_literal(value: L | Literal[L]) -> Literal[L]: return literal(value) -class BooleanExpression(ABC): +class BooleanExpression(IcebergBaseModel, ABC): """An expression that 
evaluates to a boolean.""" @abstractmethod @@ -87,6 +69,66 @@ def __or__(self, other: BooleanExpression) -> BooleanExpression: return Or(self, other) + @model_validator(mode="wrap") + @classmethod + def handle_primitive_type(cls, v: Any, handler: ValidatorFunctionWrapHandler) -> BooleanExpression: + """Apply custom deserialization logic before validation.""" + # Already a BooleanExpression? return as-is so we keep the concrete subclass. + if isinstance(v, BooleanExpression): + return v + + # Handle different input formats + if isinstance(v, bool): + return AlwaysTrue() if v is True else AlwaysFalse() + + if isinstance(v, dict) and (field_type := v.get("type")): + # Unary + if field_type == "is-null": + return IsNull(**v) + elif field_type == "not-null": + return NotNull(**v) + elif field_type == "is-nan": + return IsNaN(**v) + elif field_type == "not-nan": + return NotNaN(**v) + + # Literal + elif field_type == "lt": + return LessThan(**v) + elif field_type == "lt-eq": + return LessThanOrEqual(**v) + elif field_type == "gt": + return GreaterThan(**v) + elif field_type == "gt-eq": + return GreaterThanOrEqual(**v) + elif field_type == "eq": + return EqualTo(**v) + elif field_type == "not-eq": + return NotEqualTo(**v) + elif field_type == "starts-with": + return StartsWith(**v) + elif field_type == "not-starts-with": + return NotStartsWith(**v) + + # Set + elif field_type == "in": + return In(**v) + elif field_type == "not-in": + return NotIn(**v) + + # Other + elif field_type == "and": + return And(**v) + elif field_type == "or": + return Or(**v) + elif field_type == "not": + return Not(**v) + + return handler(v) + + +SerializableBooleanExpression: TypeAlias = SerializeAsAny["BooleanExpression"] + def _build_balanced_tree( operator_: Callable[[BooleanExpression, BooleanExpression], BooleanExpression], items: Sequence[BooleanExpression] @@ -127,41 +169,38 @@ def _build_balanced_tree( return operator_(left, right) -class Term(Generic[L], ABC): +class Term: """A simple 
expression that evaluates to a value.""" -class Bound(ABC): +class Bound: """Represents a bound value expression.""" -B = TypeVar("B") - - -class Unbound(Generic[B], ABC): +class Unbound(ABC): """Represents an unbound value expression.""" @abstractmethod - def bind(self, schema: Schema, case_sensitive: bool = True) -> B: ... + def bind(self, schema: Schema, case_sensitive: bool = True) -> Bound | BooleanExpression: ... @property @abstractmethod - def as_bound(self) -> Type[Bound]: ... + def as_bound(self) -> type[Bound]: ... -class BoundTerm(Term[L], Bound, ABC): +class BoundTerm(Term, Bound, ABC): """Represents a bound term.""" @abstractmethod - def ref(self) -> BoundReference[L]: + def ref(self) -> BoundReference: """Return the bound reference.""" @abstractmethod - def eval(self, struct: StructProtocol) -> L: # pylint: disable=W0613 + def eval(self, struct: StructProtocol) -> Any: # pylint: disable=W0613 """Return the value at the referenced field's position in an object that abides by the StructProtocol.""" -class BoundReference(BoundTerm[L]): +class BoundReference(BoundTerm): """A reference bound to a field in a schema. Args: @@ -176,7 +215,7 @@ def __init__(self, field: NestedField, accessor: Accessor): self.field = field self.accessor = accessor - def eval(self, struct: StructProtocol) -> L: + def eval(self, struct: StructProtocol) -> Any: """Return the value at the referenced field's position in an object that abides by the StructProtocol. 
Args: @@ -194,7 +233,7 @@ def __repr__(self) -> str: """Return the string representation of the BoundReference class.""" return f"BoundReference(field={repr(self.field)}, accessor={repr(self.accessor)})" - def ref(self) -> BoundReference[L]: + def ref(self) -> BoundReference: return self def __hash__(self) -> int: @@ -202,14 +241,14 @@ def __hash__(self) -> int: return hash(str(self)) -class UnboundTerm(Term[Any], Unbound[BoundTerm[L]], ABC): +class UnboundTerm(Term, Unbound, ABC): """Represents an unbound term.""" @abstractmethod - def bind(self, schema: Schema, case_sensitive: bool = True) -> BoundTerm[L]: ... + def bind(self, schema: Schema, case_sensitive: bool = True) -> BoundTerm: ... -class Reference(UnboundTerm[Any], IcebergRootModel[str]): +class Reference(UnboundTerm, IcebergRootModel[str]): """A reference not yet bound to a field in a schema. Args: @@ -232,7 +271,7 @@ def __str__(self) -> str: """Return the string representation of the Reference class.""" return f"Reference(name={repr(self.root)})" - def bind(self, schema: Schema, case_sensitive: bool = True) -> BoundReference[L]: + def bind(self, schema: Schema, case_sensitive: bool = True) -> BoundReference: """Bind the reference to an Iceberg schema. 
Args: @@ -247,24 +286,31 @@ def bind(self, schema: Schema, case_sensitive: bool = True) -> BoundReference[L] """ field = schema.find_field(name_or_id=self.name, case_sensitive=case_sensitive) accessor = schema.accessor_for_field(field.field_id) - return self.as_bound(field=field, accessor=accessor) # type: ignore + return self.as_bound(field=field, accessor=accessor) @property def name(self) -> str: return self.root @property - def as_bound(self) -> Type[BoundReference[L]]: - return BoundReference[L] + def as_bound(self) -> type[BoundReference]: + return BoundReference class And(BooleanExpression): """AND operation expression - logical conjunction.""" - left: BooleanExpression - right: BooleanExpression + model_config = ConfigDict(arbitrary_types_allowed=True) + + type: TypingLiteral["and"] = Field(default="and", alias="type") + left: SerializableBooleanExpression = Field() + right: SerializableBooleanExpression = Field() + + def __init__(self, left: BooleanExpression, right: BooleanExpression, *rest: BooleanExpression, **_: Any) -> None: + if isinstance(self, And) and not hasattr(self, "left") and not hasattr(self, "right"): + super().__init__(left=left, right=right) - def __new__(cls, left: BooleanExpression, right: BooleanExpression, *rest: BooleanExpression) -> BooleanExpression: # type: ignore + def __new__(cls, left: BooleanExpression, right: BooleanExpression, *rest: BooleanExpression, **_: Any) -> BooleanExpression: if rest: return _build_balanced_tree(And, (left, right, *rest)) if left is AlwaysFalse() or right is AlwaysFalse(): @@ -274,10 +320,7 @@ def __new__(cls, left: BooleanExpression, right: BooleanExpression, *rest: Boole elif right is AlwaysTrue(): return left else: - obj = super().__new__(cls) - obj.left = left - obj.right = right - return obj + return super().__new__(cls) def __eq__(self, other: Any) -> bool: """Return the equality of two instances of the And class.""" @@ -296,25 +339,25 @@ def __invert__(self) -> BooleanExpression: # De Morgan's 
law: not (A and B) = (not A) or (not B) return Or(~self.left, ~self.right) - def __getnewargs__(self) -> Tuple[BooleanExpression, BooleanExpression]: + def __getnewargs__(self) -> tuple[BooleanExpression, BooleanExpression]: """Pickle the And class.""" return (self.left, self.right) -class Or(IcebergBaseModel, BooleanExpression): +class Or(BooleanExpression): """OR operation expression - logical disjunction.""" model_config = ConfigDict(arbitrary_types_allowed=True) type: TypingLiteral["or"] = Field(default="or", alias="type") - left: BooleanExpression - right: BooleanExpression + left: SerializableBooleanExpression = Field() + right: SerializableBooleanExpression = Field() - def __init__(self, left: BooleanExpression, right: BooleanExpression, *rest: BooleanExpression) -> None: + def __init__(self, left: BooleanExpression, right: BooleanExpression, *rest: BooleanExpression, **_: Any) -> None: if isinstance(self, Or) and not hasattr(self, "left") and not hasattr(self, "right"): super().__init__(left=left, right=right) - def __new__(cls, left: BooleanExpression, right: BooleanExpression, *rest: BooleanExpression) -> BooleanExpression: # type: ignore + def __new__(cls, left: BooleanExpression, right: BooleanExpression, *rest: BooleanExpression, **_: Any) -> BooleanExpression: if rest: return _build_balanced_tree(Or, (left, right, *rest)) if left is AlwaysTrue() or right is AlwaysTrue(): @@ -324,8 +367,7 @@ def __new__(cls, left: BooleanExpression, right: BooleanExpression, *rest: Boole elif right is AlwaysFalse(): return left else: - obj = super().__new__(cls) - return obj + return super().__new__(cls) def __str__(self) -> str: """Return the string representation of the Or class.""" @@ -344,31 +386,31 @@ def __invert__(self) -> BooleanExpression: # De Morgan's law: not (A or B) = (not A) and (not B) return And(~self.left, ~self.right) - def __getnewargs__(self) -> Tuple[BooleanExpression, BooleanExpression]: + def __getnewargs__(self) -> tuple[BooleanExpression, 
BooleanExpression]: """Pickle the Or class.""" return (self.left, self.right) -class Not(IcebergBaseModel, BooleanExpression): +class Not(BooleanExpression): """NOT operation expression - logical negation.""" model_config = ConfigDict(arbitrary_types_allowed=True) type: TypingLiteral["not"] = Field(default="not") - child: BooleanExpression = Field() + child: SerializableBooleanExpression = Field() def __init__(self, child: BooleanExpression, **_: Any) -> None: super().__init__(child=child) - def __new__(cls, child: BooleanExpression, **_: Any) -> BooleanExpression: # type: ignore + def __new__(cls, child: BooleanExpression, **_: Any) -> BooleanExpression: if child is AlwaysTrue(): return AlwaysFalse() elif child is AlwaysFalse(): return AlwaysTrue() elif isinstance(child, Not): return child.child - obj = super().__new__(cls) - return obj + else: + return super().__new__(cls) def __str__(self) -> str: """Return the string representation of the Not class.""" @@ -386,15 +428,15 @@ def __invert__(self) -> BooleanExpression: """Transform the Expression into its negated version.""" return self.child - def __getnewargs__(self) -> Tuple[BooleanExpression]: + def __getnewargs__(self) -> tuple[BooleanExpression]: """Pickle the Not class.""" return (self.child,) -class AlwaysTrue(BooleanExpression, Singleton, IcebergRootModel[str]): +class AlwaysTrue(BooleanExpression, Singleton, IcebergRootModel[bool]): """TRUE expression.""" - root: str = "true" + root: bool = True def __invert__(self) -> AlwaysFalse: """Transform the Expression into its negated version.""" @@ -409,10 +451,10 @@ def __repr__(self) -> str: return "AlwaysTrue()" -class AlwaysFalse(BooleanExpression, Singleton, IcebergRootModel[str]): +class AlwaysFalse(BooleanExpression, Singleton, IcebergRootModel[bool]): """FALSE expression.""" - root: str = "false" + root: bool = False def __invert__(self) -> AlwaysTrue: """Transform the Expression into its negated version.""" @@ -427,11 +469,13 @@ def __repr__(self) -> 
str: return "AlwaysFalse()" -class BoundPredicate(Generic[L], Bound, BooleanExpression, ABC): - term: BoundTerm[L] +class BoundPredicate(Bound, BooleanExpression, ABC): + model_config = ConfigDict(arbitrary_types_allowed=True) + + term: BoundTerm - def __init__(self, term: BoundTerm[L]): - self.term = term + def __init__(self, term: BoundTerm, **kwargs: Any) -> None: + super().__init__(term=term, **kwargs) def __eq__(self, other: Any) -> bool: """Return the equality of two instances of the BoundPredicate class.""" @@ -439,16 +483,22 @@ def __eq__(self, other: Any) -> bool: return self.term == other.term return False + def __str__(self) -> str: + """Return the string representation of the BoundPredicate class.""" + return f"{self.__class__.__name__}(term={str(self.term)})" + @property @abstractmethod - def as_unbound(self) -> Type[UnboundPredicate[Any]]: ... + def as_unbound(self) -> type[UnboundPredicate]: ... + +class UnboundPredicate(Unbound, BooleanExpression, ABC): + model_config = ConfigDict(arbitrary_types_allowed=True) -class UnboundPredicate(Generic[L], Unbound[BooleanExpression], BooleanExpression, ABC): - term: UnboundTerm[Any] + term: UnboundTerm - def __init__(self, term: str | UnboundTerm[Any]): - self.term = _to_unbound_term(term) + def __init__(self, term: str | UnboundTerm, **kwargs: Any) -> None: + super().__init__(term=_to_unbound_term(term), **kwargs) def __eq__(self, other: Any) -> bool: """Return the equality of two instances of the UnboundPredicate class.""" @@ -459,15 +509,15 @@ def bind(self, schema: Schema, case_sensitive: bool = True) -> BooleanExpression @property @abstractmethod - def as_bound(self) -> Type[BoundPredicate[L]]: ... + def as_bound(self) -> type[BoundPredicate]: ... 
-class UnaryPredicate(IcebergBaseModel, UnboundPredicate[Any], ABC): - type: str +class UnaryPredicate(UnboundPredicate, ABC): + type: TypingLiteral["is-null", "not-null", "is-nan", "not-nan"] = Field() model_config = {"arbitrary_types_allowed": True} - def __init__(self, term: str | UnboundTerm[Any]): + def __init__(self, term: str | UnboundTerm, **_: Any) -> None: unbound = _to_unbound_term(term) super().__init__(term=unbound) @@ -476,9 +526,10 @@ def __str__(self) -> str: # Sort to make it deterministic return f"{str(self.__class__.__name__)}(term={str(self.term)})" - def bind(self, schema: Schema, case_sensitive: bool = True) -> BoundUnaryPredicate[Any]: + def bind(self, schema: Schema, case_sensitive: bool = True) -> BoundUnaryPredicate: bound_term = self.term.bind(schema, case_sensitive) - return self.as_bound(bound_term) + bound_type = self.as_bound + return bound_type(bound_term) # type: ignore[misc] def __repr__(self) -> str: """Return the string representation of the UnaryPredicate class.""" @@ -486,145 +537,155 @@ def __repr__(self) -> str: @property @abstractmethod - def as_bound(self) -> Type[BoundUnaryPredicate[Any]]: ... + def as_bound(self) -> type[BoundUnaryPredicate]: ... # type: ignore -class BoundUnaryPredicate(BoundPredicate[L], ABC): +class BoundUnaryPredicate(BoundPredicate, ABC): def __repr__(self) -> str: """Return the string representation of the BoundUnaryPredicate class.""" return f"{str(self.__class__.__name__)}(term={repr(self.term)})" @property @abstractmethod - def as_unbound(self) -> Type[UnaryPredicate]: ... + def as_unbound(self) -> type[UnaryPredicate]: ... 
- def __getnewargs__(self) -> Tuple[BoundTerm[L]]: + def __getnewargs__(self) -> tuple[BoundTerm]: """Pickle the BoundUnaryPredicate class.""" return (self.term,) -class BoundIsNull(BoundUnaryPredicate[L]): - def __new__(cls, term: BoundTerm[L]) -> BooleanExpression: # type: ignore # pylint: disable=W0221 +class BoundIsNull(BoundUnaryPredicate): + def __new__(cls, term: BoundTerm) -> BooleanExpression: # pylint: disable=W0221 if term.ref().field.required: return AlwaysFalse() return super().__new__(cls) - def __invert__(self) -> BoundNotNull[L]: + def __invert__(self) -> BoundNotNull: """Transform the Expression into its negated version.""" return BoundNotNull(self.term) @property - def as_unbound(self) -> Type[IsNull]: + def as_unbound(self) -> type[IsNull]: return IsNull -class BoundNotNull(BoundUnaryPredicate[L]): - def __new__(cls, term: BoundTerm[L]): # type: ignore # pylint: disable=W0221 +class BoundNotNull(BoundUnaryPredicate): + def __new__(cls, term: BoundTerm) -> BooleanExpression: # pylint: disable=W0221 if term.ref().field.required: return AlwaysTrue() return super().__new__(cls) - def __invert__(self) -> BoundIsNull[L]: + def __invert__(self) -> BoundIsNull: """Transform the Expression into its negated version.""" return BoundIsNull(self.term) @property - def as_unbound(self) -> Type[NotNull]: + def as_unbound(self) -> type[NotNull]: return NotNull class IsNull(UnaryPredicate): - type: str = "is-null" + type: TypingLiteral["is-null"] = Field(default="is-null") def __invert__(self) -> NotNull: """Transform the Expression into its negated version.""" return NotNull(self.term) @property - def as_bound(self) -> Type[BoundIsNull[L]]: - return BoundIsNull[L] + def as_bound(self) -> type[BoundIsNull]: # type: ignore + return BoundIsNull class NotNull(UnaryPredicate): - type: str = "not-null" + type: TypingLiteral["not-null"] = Field(default="not-null") def __invert__(self) -> IsNull: """Transform the Expression into its negated version.""" return 
IsNull(self.term) @property - def as_bound(self) -> Type[BoundNotNull[L]]: - return BoundNotNull[L] + def as_bound(self) -> type[BoundNotNull]: # type: ignore + return BoundNotNull -class BoundIsNaN(BoundUnaryPredicate[L]): - def __new__(cls, term: BoundTerm[L]) -> BooleanExpression: # type: ignore # pylint: disable=W0221 +class BoundIsNaN(BoundUnaryPredicate): + def __new__(cls, term: BoundTerm) -> BooleanExpression: # pylint: disable=W0221 bound_type = term.ref().field.field_type if isinstance(bound_type, (FloatType, DoubleType)): return super().__new__(cls) return AlwaysFalse() - def __invert__(self) -> BoundNotNaN[L]: + def __invert__(self) -> BoundNotNaN: """Transform the Expression into its negated version.""" return BoundNotNaN(self.term) @property - def as_unbound(self) -> Type[IsNaN]: + def as_unbound(self) -> type[IsNaN]: return IsNaN -class BoundNotNaN(BoundUnaryPredicate[L]): - def __new__(cls, term: BoundTerm[L]) -> BooleanExpression: # type: ignore # pylint: disable=W0221 +class BoundNotNaN(BoundUnaryPredicate): + def __new__(cls, term: BoundTerm) -> BooleanExpression: # pylint: disable=W0221 bound_type = term.ref().field.field_type if isinstance(bound_type, (FloatType, DoubleType)): return super().__new__(cls) return AlwaysTrue() - def __invert__(self) -> BoundIsNaN[L]: + def __invert__(self) -> BoundIsNaN: """Transform the Expression into its negated version.""" return BoundIsNaN(self.term) @property - def as_unbound(self) -> Type[NotNaN]: + def as_unbound(self) -> type[NotNaN]: return NotNaN class IsNaN(UnaryPredicate): - type: str = "is-nan" + type: TypingLiteral["is-nan"] = Field(default="is-nan") def __invert__(self) -> NotNaN: """Transform the Expression into its negated version.""" return NotNaN(self.term) @property - def as_bound(self) -> Type[BoundIsNaN[L]]: - return BoundIsNaN[L] + def as_bound(self) -> type[BoundIsNaN]: # type: ignore + return BoundIsNaN class NotNaN(UnaryPredicate): - type: str = "not-nan" + type: TypingLiteral["not-nan"] 
= Field(default="not-nan") def __invert__(self) -> IsNaN: """Transform the Expression into its negated version.""" return IsNaN(self.term) @property - def as_bound(self) -> Type[BoundNotNaN[L]]: - return BoundNotNaN[L] + def as_bound(self) -> type[BoundNotNaN]: # type: ignore + return BoundNotNaN -class SetPredicate(IcebergBaseModel, UnboundPredicate[L], ABC): +class SetPredicate(UnboundPredicate, ABC): model_config = ConfigDict(arbitrary_types_allowed=True) type: TypingLiteral["in", "not-in"] = Field(default="in") - literals: Set[Literal[L]] = Field(alias="items") + literals: set[LiteralValue] = Field(alias="values") + + def __init__( + self, term: str | UnboundTerm, literals: Iterable[Any] | Iterable[LiteralValue] | None = None, **kwargs: Any + ) -> None: + if literals is None and "values" in kwargs: + literals = kwargs["values"] - def __init__(self, term: str | UnboundTerm[Any], literals: Iterable[L] | Iterable[Literal[L]]): - super().__init__(term=_to_unbound_term(term), items=_to_literal_set(literals)) # type: ignore + if literals is None: + literal_set: set[LiteralValue] = set() + else: + literal_set = _to_literal_set(literals) + super().__init__(term=_to_unbound_term(term), values=literal_set) - def bind(self, schema: Schema, case_sensitive: bool = True) -> BoundSetPredicate[L]: + def bind(self, schema: Schema, case_sensitive: bool = True) -> BoundSetPredicate: bound_term = self.term.bind(schema, case_sensitive) - return self.as_bound(bound_term, {lit.to(bound_term.ref().field.field_type) for lit in self.literals}) + literal_set = self.literals + return self.as_bound(bound_term, {lit.to(bound_term.ref().field.field_type) for lit in literal_set}) # type: ignore def __str__(self) -> str: """Return the string representation of the SetPredicate class.""" @@ -640,26 +701,25 @@ def __eq__(self, other: Any) -> bool: """Return the equality of two instances of the SetPredicate class.""" return self.term == other.term and self.literals == other.literals if 
isinstance(other, self.__class__) else False - def __getnewargs__(self) -> Tuple[UnboundTerm[L], Set[Literal[L]]]: + def __getnewargs__(self) -> tuple[UnboundTerm, set[Any]]: """Pickle the SetPredicate class.""" return (self.term, self.literals) @property @abstractmethod - def as_bound(self) -> Type[BoundSetPredicate[L]]: - return BoundSetPredicate[L] + def as_bound(self) -> type[BoundSetPredicate]: # type: ignore + return BoundSetPredicate -class BoundSetPredicate(BoundPredicate[L], ABC): - literals: Set[Literal[L]] +class BoundSetPredicate(BoundPredicate, ABC): + literals: set[LiteralValue] - def __init__(self, term: BoundTerm[L], literals: Set[Literal[L]]): - # Since we don't know the type of BoundPredicate[L], we have to ignore this one - super().__init__(term) # type: ignore - self.literals = _to_literal_set(literals) # pylint: disable=W0621 + def __init__(self, term: BoundTerm, literals: set[LiteralValue]) -> None: + literal_set = _to_literal_set(literals) + super().__init__(term=term, literals=literal_set) @cached_property - def value_set(self) -> Set[L]: + def value_set(self) -> set[Any]: return {lit.value for lit in self.literals} def __str__(self) -> str: @@ -676,17 +736,17 @@ def __eq__(self, other: Any) -> bool: """Return the equality of two instances of the BoundSetPredicate class.""" return self.term == other.term and self.literals == other.literals if isinstance(other, self.__class__) else False - def __getnewargs__(self) -> Tuple[BoundTerm[L], Set[Literal[L]]]: + def __getnewargs__(self) -> tuple[BoundTerm, set[LiteralValue]]: """Pickle the BoundSetPredicate class.""" return (self.term, self.literals) @property @abstractmethod - def as_unbound(self) -> Type[SetPredicate[L]]: ... + def as_unbound(self) -> type[SetPredicate]: ... 
-class BoundIn(BoundSetPredicate[L]): - def __new__(cls, term: BoundTerm[L], literals: Set[Literal[L]]) -> BooleanExpression: # type: ignore # pylint: disable=W0221 +class BoundIn(BoundSetPredicate): + def __new__(cls, term: BoundTerm, literals: set[LiteralValue]) -> BooleanExpression: # pylint: disable=W0221 count = len(literals) if count == 0: return AlwaysFalse() @@ -695,7 +755,7 @@ def __new__(cls, term: BoundTerm[L], literals: Set[Literal[L]]) -> BooleanExpres else: return super().__new__(cls) - def __invert__(self) -> BoundNotIn[L]: + def __invert__(self) -> BoundNotIn: """Transform the Expression into its negated version.""" return BoundNotIn(self.term, self.literals) @@ -704,15 +764,15 @@ def __eq__(self, other: Any) -> bool: return self.term == other.term and self.literals == other.literals if isinstance(other, self.__class__) else False @property - def as_unbound(self) -> Type[In[L]]: + def as_unbound(self) -> type[In]: return In -class BoundNotIn(BoundSetPredicate[L]): - def __new__( # type: ignore # pylint: disable=W0221 +class BoundNotIn(BoundSetPredicate): + def __new__( # pylint: disable=W0221 cls, - term: BoundTerm[L], - literals: Set[Literal[L]], + term: BoundTerm, + literals: set[LiteralValue], ) -> BooleanExpression: count = len(literals) if count == 0: @@ -722,46 +782,58 @@ def __new__( # type: ignore # pylint: disable=W0221 else: return super().__new__(cls) - def __invert__(self) -> BoundIn[L]: + def __invert__(self) -> BoundIn: """Transform the Expression into its negated version.""" return BoundIn(self.term, self.literals) @property - def as_unbound(self) -> Type[NotIn[L]]: + def as_unbound(self) -> type[NotIn]: return NotIn -class In(SetPredicate[L]): +class In(SetPredicate): type: TypingLiteral["in"] = Field(default="in", alias="type") - def __new__( # type: ignore # pylint: disable=W0221 - cls, term: str | UnboundTerm[Any], literals: Iterable[L] | Iterable[Literal[L]] - ) -> BooleanExpression: - literals_set: Set[Literal[L]] = 
_to_literal_set(literals) + def __new__( # pylint: disable=W0221 + cls, term: str | UnboundTerm, literals: Iterable[Any] | Iterable[LiteralValue] | None = None, **kwargs: Any + ) -> In: + if literals is None and "values" in kwargs: + literals = kwargs["values"] + + if literals is None: + literals_set: set[LiteralValue] = set() + else: + literals_set = _to_literal_set(literals) count = len(literals_set) if count == 0: return AlwaysFalse() elif count == 1: - return EqualTo(term, next(iter(literals))) + return EqualTo(term, next(iter(literals_set))) else: return super().__new__(cls) - def __invert__(self) -> NotIn[L]: + def __invert__(self) -> NotIn: """Transform the Expression into its negated version.""" - return NotIn[L](self.term, self.literals) + return NotIn(self.term, self.literals) @property - def as_bound(self) -> Type[BoundIn[L]]: - return BoundIn[L] + def as_bound(self) -> type[BoundIn]: # type: ignore + return BoundIn -class NotIn(SetPredicate[L], ABC): +class NotIn(SetPredicate, ABC): type: TypingLiteral["not-in"] = Field(default="not-in", alias="type") - def __new__( # type: ignore # pylint: disable=W0221 - cls, term: str | UnboundTerm[Any], literals: Iterable[L] | Iterable[Literal[L]] - ) -> BooleanExpression: - literals_set: Set[Literal[L]] = _to_literal_set(literals) + def __new__( # pylint: disable=W0221 + cls, term: str | UnboundTerm, literals: Iterable[Any] | Iterable[LiteralValue] | None = None, **kwargs: Any + ) -> NotIn: + if literals is None and "values" in kwargs: + literals = kwargs["values"] + + if literals is None: + literals_set: set[LiteralValue] = set() + else: + literals_set = _to_literal_set(literals) count = len(literals_set) if count == 0: return AlwaysTrue() @@ -770,29 +842,32 @@ def __new__( # type: ignore # pylint: disable=W0221 else: return super().__new__(cls) - def __invert__(self) -> In[L]: + def __invert__(self) -> In: """Transform the Expression into its negated version.""" - return In[L](self.term, self.literals) + return 
In(self.term, self.literals) @property - def as_bound(self) -> Type[BoundNotIn[L]]: - return BoundNotIn[L] + def as_bound(self) -> type[BoundNotIn]: # type: ignore + return BoundNotIn -class LiteralPredicate(IcebergBaseModel, UnboundPredicate[L], ABC): +class LiteralPredicate(UnboundPredicate, ABC): type: TypingLiteral["lt", "lt-eq", "gt", "gt-eq", "eq", "not-eq", "starts-with", "not-starts-with"] = Field(alias="type") - term: UnboundTerm[Any] - value: Literal[L] = Field() + term: UnboundTerm + value: LiteralValue = Field() model_config = ConfigDict(populate_by_name=True, frozen=True, arbitrary_types_allowed=True) - def __init__(self, term: str | UnboundTerm[Any], literal: L | Literal[L]): - super().__init__(term=_to_unbound_term(term), value=_to_literal(literal)) # type: ignore[call-arg] + def __init__(self, term: str | UnboundTerm, literal: Any | None = None, **kwargs: Any) -> None: + if literal is None and "value" in kwargs: + literal = kwargs["value"] + + super().__init__(term=_to_unbound_term(term), value=_to_literal(literal)) @property - def literal(self) -> Literal[L]: + def literal(self) -> LiteralValue: return self.value - def bind(self, schema: Schema, case_sensitive: bool = True) -> BoundLiteralPredicate[L]: + def bind(self, schema: Schema, case_sensitive: bool = True) -> BoundLiteralPredicate: bound_term = self.term.bind(schema, case_sensitive) lit = self.literal.to(bound_term.ref().field.field_type) @@ -807,7 +882,7 @@ def bind(self, schema: Schema, case_sensitive: bool = True) -> BoundLiteralPredi elif isinstance(self, (LessThan, LessThanOrEqual, EqualTo)): return AlwaysFalse() - return self.as_bound(bound_term, lit) + return self.as_bound(bound_term, lit) # type: ignore def __eq__(self, other: Any) -> bool: """Return the equality of two instances of the LiteralPredicate class.""" @@ -825,16 +900,14 @@ def __repr__(self) -> str: @property @abstractmethod - def as_bound(self) -> Type[BoundLiteralPredicate[L]]: ... 
+ def as_bound(self) -> type[BoundLiteralPredicate]: ... # type: ignore -class BoundLiteralPredicate(BoundPredicate[L], ABC): - literal: Literal[L] +class BoundLiteralPredicate(BoundPredicate, ABC): + literal: LiteralValue - def __init__(self, term: BoundTerm[L], literal: Literal[L]): # pylint: disable=W0621 - # Since we don't know the type of BoundPredicate[L], we have to ignore this one - super().__init__(term) # type: ignore - self.literal = literal # pylint: disable=W0621 + def __init__(self, term: BoundTerm, literal: LiteralValue): # pylint: disable=W0621 + super().__init__(term=term, literal=literal) def __eq__(self, other: Any) -> bool: """Return the equality of two instances of the BoundLiteralPredicate class.""" @@ -842,186 +915,190 @@ def __eq__(self, other: Any) -> bool: return self.term == other.term and self.literal == other.literal return False + def __str__(self) -> str: + """Return the string representation of the BoundLiteralPredicate class.""" + return f"{self.__class__.__name__}(term={str(self.term)}, literal={repr(self.literal)})" + def __repr__(self) -> str: """Return the string representation of the BoundLiteralPredicate class.""" return f"{str(self.__class__.__name__)}(term={repr(self.term)}, literal={repr(self.literal)})" @property @abstractmethod - def as_unbound(self) -> Type[LiteralPredicate[L]]: ... + def as_unbound(self) -> type[LiteralPredicate]: ... 
-class BoundEqualTo(BoundLiteralPredicate[L]): - def __invert__(self) -> BoundNotEqualTo[L]: +class BoundEqualTo(BoundLiteralPredicate): + def __invert__(self) -> BoundNotEqualTo: """Transform the Expression into its negated version.""" - return BoundNotEqualTo[L](self.term, self.literal) + return BoundNotEqualTo(self.term, self.literal) @property - def as_unbound(self) -> Type[EqualTo[L]]: + def as_unbound(self) -> type[EqualTo]: return EqualTo -class BoundNotEqualTo(BoundLiteralPredicate[L]): - def __invert__(self) -> BoundEqualTo[L]: +class BoundNotEqualTo(BoundLiteralPredicate): + def __invert__(self) -> BoundEqualTo: """Transform the Expression into its negated version.""" - return BoundEqualTo[L](self.term, self.literal) + return BoundEqualTo(self.term, self.literal) @property - def as_unbound(self) -> Type[NotEqualTo[L]]: + def as_unbound(self) -> type[NotEqualTo]: return NotEqualTo -class BoundGreaterThanOrEqual(BoundLiteralPredicate[L]): - def __invert__(self) -> BoundLessThan[L]: +class BoundGreaterThanOrEqual(BoundLiteralPredicate): + def __invert__(self) -> BoundLessThan: """Transform the Expression into its negated version.""" - return BoundLessThan[L](self.term, self.literal) + return BoundLessThan(self.term, self.literal) @property - def as_unbound(self) -> Type[GreaterThanOrEqual[L]]: - return GreaterThanOrEqual[L] + def as_unbound(self) -> type[GreaterThanOrEqual]: + return GreaterThanOrEqual -class BoundGreaterThan(BoundLiteralPredicate[L]): - def __invert__(self) -> BoundLessThanOrEqual[L]: +class BoundGreaterThan(BoundLiteralPredicate): + def __invert__(self) -> BoundLessThanOrEqual: """Transform the Expression into its negated version.""" return BoundLessThanOrEqual(self.term, self.literal) @property - def as_unbound(self) -> Type[GreaterThan[L]]: - return GreaterThan[L] + def as_unbound(self) -> type[GreaterThan]: + return GreaterThan -class BoundLessThan(BoundLiteralPredicate[L]): - def __invert__(self) -> BoundGreaterThanOrEqual[L]: +class 
BoundLessThan(BoundLiteralPredicate): + def __invert__(self) -> BoundGreaterThanOrEqual: """Transform the Expression into its negated version.""" - return BoundGreaterThanOrEqual[L](self.term, self.literal) + return BoundGreaterThanOrEqual(self.term, self.literal) @property - def as_unbound(self) -> Type[LessThan[L]]: - return LessThan[L] + def as_unbound(self) -> type[LessThan]: + return LessThan -class BoundLessThanOrEqual(BoundLiteralPredicate[L]): - def __invert__(self) -> BoundGreaterThan[L]: +class BoundLessThanOrEqual(BoundLiteralPredicate): + def __invert__(self) -> BoundGreaterThan: """Transform the Expression into its negated version.""" - return BoundGreaterThan[L](self.term, self.literal) + return BoundGreaterThan(self.term, self.literal) @property - def as_unbound(self) -> Type[LessThanOrEqual[L]]: - return LessThanOrEqual[L] + def as_unbound(self) -> type[LessThanOrEqual]: + return LessThanOrEqual -class BoundStartsWith(BoundLiteralPredicate[L]): - def __invert__(self) -> BoundNotStartsWith[L]: +class BoundStartsWith(BoundLiteralPredicate): + def __invert__(self) -> BoundNotStartsWith: """Transform the Expression into its negated version.""" - return BoundNotStartsWith[L](self.term, self.literal) + return BoundNotStartsWith(self.term, self.literal) @property - def as_unbound(self) -> Type[StartsWith[L]]: - return StartsWith[L] + def as_unbound(self) -> type[StartsWith]: + return StartsWith -class BoundNotStartsWith(BoundLiteralPredicate[L]): - def __invert__(self) -> BoundStartsWith[L]: +class BoundNotStartsWith(BoundLiteralPredicate): + def __invert__(self) -> BoundStartsWith: """Transform the Expression into its negated version.""" - return BoundStartsWith[L](self.term, self.literal) + return BoundStartsWith(self.term, self.literal) @property - def as_unbound(self) -> Type[NotStartsWith[L]]: - return NotStartsWith[L] + def as_unbound(self) -> type[NotStartsWith]: + return NotStartsWith -class EqualTo(LiteralPredicate[L]): +class 
EqualTo(LiteralPredicate): type: TypingLiteral["eq"] = Field(default="eq", alias="type") - def __invert__(self) -> NotEqualTo[L]: + def __invert__(self) -> NotEqualTo: """Transform the Expression into its negated version.""" - return NotEqualTo[L](self.term, self.literal) + return NotEqualTo(self.term, self.literal) @property - def as_bound(self) -> Type[BoundEqualTo[L]]: - return BoundEqualTo[L] + def as_bound(self) -> type[BoundEqualTo]: # type: ignore + return BoundEqualTo -class NotEqualTo(LiteralPredicate[L]): +class NotEqualTo(LiteralPredicate): type: TypingLiteral["not-eq"] = Field(default="not-eq", alias="type") - def __invert__(self) -> EqualTo[L]: + def __invert__(self) -> EqualTo: """Transform the Expression into its negated version.""" - return EqualTo[L](self.term, self.literal) + return EqualTo(self.term, self.literal) @property - def as_bound(self) -> Type[BoundNotEqualTo[L]]: - return BoundNotEqualTo[L] + def as_bound(self) -> type[BoundNotEqualTo]: # type: ignore + return BoundNotEqualTo -class LessThan(LiteralPredicate[L]): +class LessThan(LiteralPredicate): type: TypingLiteral["lt"] = Field(default="lt", alias="type") - def __invert__(self) -> GreaterThanOrEqual[L]: + def __invert__(self) -> GreaterThanOrEqual: """Transform the Expression into its negated version.""" - return GreaterThanOrEqual[L](self.term, self.literal) + return GreaterThanOrEqual(self.term, self.literal) @property - def as_bound(self) -> Type[BoundLessThan[L]]: - return BoundLessThan[L] + def as_bound(self) -> type[BoundLessThan]: # type: ignore + return BoundLessThan -class GreaterThanOrEqual(LiteralPredicate[L]): +class GreaterThanOrEqual(LiteralPredicate): type: TypingLiteral["gt-eq"] = Field(default="gt-eq", alias="type") - def __invert__(self) -> LessThan[L]: + def __invert__(self) -> LessThan: """Transform the Expression into its negated version.""" - return LessThan[L](self.term, self.literal) + return LessThan(self.term, self.literal) @property - def as_bound(self) -> 
Type[BoundGreaterThanOrEqual[L]]: - return BoundGreaterThanOrEqual[L] + def as_bound(self) -> type[BoundGreaterThanOrEqual]: # type: ignore + return BoundGreaterThanOrEqual -class GreaterThan(LiteralPredicate[L]): +class GreaterThan(LiteralPredicate): type: TypingLiteral["gt"] = Field(default="gt", alias="type") - def __invert__(self) -> LessThanOrEqual[L]: + def __invert__(self) -> LessThanOrEqual: """Transform the Expression into its negated version.""" - return LessThanOrEqual[L](self.term, self.literal) + return LessThanOrEqual(self.term, self.literal) @property - def as_bound(self) -> Type[BoundGreaterThan[L]]: - return BoundGreaterThan[L] + def as_bound(self) -> type[BoundGreaterThan]: # type: ignore + return BoundGreaterThan -class LessThanOrEqual(LiteralPredicate[L]): +class LessThanOrEqual(LiteralPredicate): type: TypingLiteral["lt-eq"] = Field(default="lt-eq", alias="type") - def __invert__(self) -> GreaterThan[L]: + def __invert__(self) -> GreaterThan: """Transform the Expression into its negated version.""" - return GreaterThan[L](self.term, self.literal) + return GreaterThan(self.term, self.literal) @property - def as_bound(self) -> Type[BoundLessThanOrEqual[L]]: - return BoundLessThanOrEqual[L] + def as_bound(self) -> type[BoundLessThanOrEqual]: # type: ignore + return BoundLessThanOrEqual -class StartsWith(LiteralPredicate[L]): +class StartsWith(LiteralPredicate): type: TypingLiteral["starts-with"] = Field(default="starts-with", alias="type") - def __invert__(self) -> NotStartsWith[L]: + def __invert__(self) -> NotStartsWith: """Transform the Expression into its negated version.""" - return NotStartsWith[L](self.term, self.literal) + return NotStartsWith(self.term, self.literal) @property - def as_bound(self) -> Type[BoundStartsWith[L]]: - return BoundStartsWith[L] + def as_bound(self) -> type[BoundStartsWith]: # type: ignore + return BoundStartsWith -class NotStartsWith(LiteralPredicate[L]): +class NotStartsWith(LiteralPredicate): type: 
TypingLiteral["not-starts-with"] = Field(default="not-starts-with", alias="type") - def __invert__(self) -> StartsWith[L]: + def __invert__(self) -> StartsWith: """Transform the Expression into its negated version.""" - return StartsWith[L](self.term, self.literal) + return StartsWith(self.term, self.literal) @property - def as_bound(self) -> Type[BoundNotStartsWith[L]]: - return BoundNotStartsWith[L] + def as_bound(self) -> type[BoundNotStartsWith]: # type: ignore + return BoundNotStartsWith diff --git a/pyiceberg/expressions/literals.py b/pyiceberg/expressions/literals.py index 0847f19c84..5bf70990b9 100644 --- a/pyiceberg/expressions/literals.py +++ b/pyiceberg/expressions/literals.py @@ -27,7 +27,7 @@ from decimal import ROUND_HALF_UP, Decimal from functools import singledispatchmethod from math import isnan -from typing import Any, Generic, Type +from typing import Any, Generic from uuid import UUID from pydantic import Field, model_serializer @@ -73,7 +73,7 @@ class Literal(IcebergRootModel[L], Generic[L], ABC): # type: ignore root: L = Field() - def __init__(self, value: L, value_type: Type[L], /, **data): # type: ignore + def __init__(self, value: L, value_type: type[L], /, **data): # type: ignore if value is None: raise TypeError("Invalid literal value: None") diff --git a/pyiceberg/expressions/visitors.py b/pyiceberg/expressions/visitors.py index ee8d1e930a..e4ab3befa3 100644 --- a/pyiceberg/expressions/visitors.py +++ b/pyiceberg/expressions/visitors.py @@ -16,16 +16,12 @@ # under the License. 
import math from abc import ABC, abstractmethod +from collections.abc import Callable from functools import singledispatch from typing import ( Any, - Callable, - Dict, Generic, - List, - Set, SupportsFloat, - Tuple, TypeVar, ) @@ -58,11 +54,10 @@ Or, UnboundPredicate, ) -from pyiceberg.expressions.literals import Literal from pyiceberg.manifest import DataFile, ManifestFile, PartitionFieldSummary from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionSpec from pyiceberg.schema import Schema -from pyiceberg.typedef import EMPTY_DICT, L, Record, StructProtocol +from pyiceberg.typedef import EMPTY_DICT, L, LiteralValue, Record, StructProtocol from pyiceberg.types import ( DoubleType, FloatType, @@ -120,19 +115,19 @@ def visit_or(self, left_result: T, right_result: T) -> T: """ @abstractmethod - def visit_unbound_predicate(self, predicate: UnboundPredicate[L]) -> T: + def visit_unbound_predicate(self, predicate: UnboundPredicate) -> T: """Visit method for an unbound predicate in an expression tree. Args: - predicate (UnboundPredicate[L): An instance of an UnboundPredicate. + predicate (UnboundPredicate): An instance of an UnboundPredicate. """ @abstractmethod - def visit_bound_predicate(self, predicate: BoundPredicate[L]) -> T: + def visit_bound_predicate(self, predicate: BoundPredicate) -> T: """Visit method for a bound predicate in an expression tree. Args: - predicate (BoundPredicate[L]): An instance of a BoundPredicate. + predicate (BoundPredicate): An instance of a BoundPredicate. 
""" @@ -180,13 +175,13 @@ def _(obj: And, visitor: BooleanExpressionVisitor[T]) -> T: @visit.register(UnboundPredicate) -def _(obj: UnboundPredicate[L], visitor: BooleanExpressionVisitor[T]) -> T: +def _(obj: UnboundPredicate, visitor: BooleanExpressionVisitor[T]) -> T: """Visit an unbound boolean expression with a concrete BooleanExpressionVisitor.""" return visitor.visit_unbound_predicate(predicate=obj) @visit.register(BoundPredicate) -def _(obj: BoundPredicate[L], visitor: BooleanExpressionVisitor[T]) -> T: +def _(obj: BoundPredicate, visitor: BooleanExpressionVisitor[T]) -> T: """Visit a bound boolean expression with a concrete BooleanExpressionVisitor.""" return visitor.visit_bound_predicate(predicate=obj) @@ -246,60 +241,60 @@ def visit_and(self, left_result: BooleanExpression, right_result: BooleanExpress def visit_or(self, left_result: BooleanExpression, right_result: BooleanExpression) -> BooleanExpression: return Or(left=left_result, right=right_result) - def visit_unbound_predicate(self, predicate: UnboundPredicate[L]) -> BooleanExpression: + def visit_unbound_predicate(self, predicate: UnboundPredicate) -> BooleanExpression: return predicate.bind(self.schema, case_sensitive=self.case_sensitive) - def visit_bound_predicate(self, predicate: BoundPredicate[L]) -> BooleanExpression: + def visit_bound_predicate(self, predicate: BoundPredicate) -> BooleanExpression: raise TypeError(f"Found already bound predicate: {predicate}") class BoundBooleanExpressionVisitor(BooleanExpressionVisitor[T], ABC): @abstractmethod - def visit_in(self, term: BoundTerm[L], literals: Set[L]) -> T: + def visit_in(self, term: BoundTerm, literals: set[L]) -> T: """Visit a bound In predicate.""" @abstractmethod - def visit_not_in(self, term: BoundTerm[L], literals: Set[L]) -> T: + def visit_not_in(self, term: BoundTerm, literals: set[L]) -> T: """Visit a bound NotIn predicate.""" @abstractmethod - def visit_is_nan(self, term: BoundTerm[L]) -> T: + def visit_is_nan(self, term: 
BoundTerm) -> T: """Visit a bound IsNan predicate.""" @abstractmethod - def visit_not_nan(self, term: BoundTerm[L]) -> T: + def visit_not_nan(self, term: BoundTerm) -> T: """Visit a bound NotNan predicate.""" @abstractmethod - def visit_is_null(self, term: BoundTerm[L]) -> T: + def visit_is_null(self, term: BoundTerm) -> T: """Visit a bound IsNull predicate.""" @abstractmethod - def visit_not_null(self, term: BoundTerm[L]) -> T: + def visit_not_null(self, term: BoundTerm) -> T: """Visit a bound NotNull predicate.""" @abstractmethod - def visit_equal(self, term: BoundTerm[L], literal: Literal[L]) -> T: + def visit_equal(self, term: BoundTerm, literal: LiteralValue) -> T: """Visit a bound Equal predicate.""" @abstractmethod - def visit_not_equal(self, term: BoundTerm[L], literal: Literal[L]) -> T: + def visit_not_equal(self, term: BoundTerm, literal: LiteralValue) -> T: """Visit a bound NotEqual predicate.""" @abstractmethod - def visit_greater_than_or_equal(self, term: BoundTerm[L], literal: Literal[L]) -> T: + def visit_greater_than_or_equal(self, term: BoundTerm, literal: LiteralValue) -> T: """Visit a bound GreaterThanOrEqual predicate.""" @abstractmethod - def visit_greater_than(self, term: BoundTerm[L], literal: Literal[L]) -> T: + def visit_greater_than(self, term: BoundTerm, literal: LiteralValue) -> T: """Visit a bound GreaterThan predicate.""" @abstractmethod - def visit_less_than(self, term: BoundTerm[L], literal: Literal[L]) -> T: + def visit_less_than(self, term: BoundTerm, literal: LiteralValue) -> T: """Visit a bound LessThan predicate.""" @abstractmethod - def visit_less_than_or_equal(self, term: BoundTerm[L], literal: Literal[L]) -> T: + def visit_less_than_or_equal(self, term: BoundTerm, literal: LiteralValue) -> T: """Visit a bound LessThanOrEqual predicate.""" @abstractmethod @@ -323,105 +318,105 @@ def visit_or(self, left_result: T, right_result: T) -> T: """Visit a bound Or predicate.""" @abstractmethod - def visit_starts_with(self, term: 
BoundTerm[L], literal: Literal[L]) -> T: + def visit_starts_with(self, term: BoundTerm, literal: LiteralValue) -> T: """Visit bound StartsWith predicate.""" @abstractmethod - def visit_not_starts_with(self, term: BoundTerm[L], literal: Literal[L]) -> T: + def visit_not_starts_with(self, term: BoundTerm, literal: LiteralValue) -> T: """Visit bound NotStartsWith predicate.""" - def visit_unbound_predicate(self, predicate: UnboundPredicate[L]) -> T: + def visit_unbound_predicate(self, predicate: UnboundPredicate) -> T: """Visit an unbound predicate. Args: - predicate (UnboundPredicate[L]): An unbound predicate. + predicate (UnboundPredicate): An unbound predicate. Raises: TypeError: This always raises since an unbound predicate is not expected in a bound boolean expression. """ raise TypeError(f"Not a bound predicate: {predicate}") - def visit_bound_predicate(self, predicate: BoundPredicate[L]) -> T: + def visit_bound_predicate(self, predicate: BoundPredicate) -> T: """Visit a bound predicate. Args: - predicate (BoundPredicate[L]): A bound predicate. + predicate (BoundPredicate): A bound predicate. 
""" return visit_bound_predicate(predicate, self) @singledispatch -def visit_bound_predicate(expr: BoundPredicate[L], _: BooleanExpressionVisitor[T]) -> T: +def visit_bound_predicate(expr: BoundPredicate, _: BooleanExpressionVisitor[T]) -> T: raise TypeError(f"Unknown predicate: {expr}") @visit_bound_predicate.register(BoundIn) -def _(expr: BoundIn[L], visitor: BoundBooleanExpressionVisitor[T]) -> T: +def _(expr: BoundIn, visitor: BoundBooleanExpressionVisitor[T]) -> T: return visitor.visit_in(term=expr.term, literals=expr.value_set) @visit_bound_predicate.register(BoundNotIn) -def _(expr: BoundNotIn[L], visitor: BoundBooleanExpressionVisitor[T]) -> T: +def _(expr: BoundNotIn, visitor: BoundBooleanExpressionVisitor[T]) -> T: return visitor.visit_not_in(term=expr.term, literals=expr.value_set) @visit_bound_predicate.register(BoundIsNaN) -def _(expr: BoundIsNaN[L], visitor: BoundBooleanExpressionVisitor[T]) -> T: +def _(expr: BoundIsNaN, visitor: BoundBooleanExpressionVisitor[T]) -> T: return visitor.visit_is_nan(term=expr.term) @visit_bound_predicate.register(BoundNotNaN) -def _(expr: BoundNotNaN[L], visitor: BoundBooleanExpressionVisitor[T]) -> T: +def _(expr: BoundNotNaN, visitor: BoundBooleanExpressionVisitor[T]) -> T: return visitor.visit_not_nan(term=expr.term) @visit_bound_predicate.register(BoundIsNull) -def _(expr: BoundIsNull[L], visitor: BoundBooleanExpressionVisitor[T]) -> T: +def _(expr: BoundIsNull, visitor: BoundBooleanExpressionVisitor[T]) -> T: return visitor.visit_is_null(term=expr.term) @visit_bound_predicate.register(BoundNotNull) -def _(expr: BoundNotNull[L], visitor: BoundBooleanExpressionVisitor[T]) -> T: +def _(expr: BoundNotNull, visitor: BoundBooleanExpressionVisitor[T]) -> T: return visitor.visit_not_null(term=expr.term) @visit_bound_predicate.register(BoundEqualTo) -def _(expr: BoundEqualTo[L], visitor: BoundBooleanExpressionVisitor[T]) -> T: +def _(expr: BoundEqualTo, visitor: BoundBooleanExpressionVisitor[T]) -> T: return 
visitor.visit_equal(term=expr.term, literal=expr.literal) @visit_bound_predicate.register(BoundNotEqualTo) -def _(expr: BoundNotEqualTo[L], visitor: BoundBooleanExpressionVisitor[T]) -> T: +def _(expr: BoundNotEqualTo, visitor: BoundBooleanExpressionVisitor[T]) -> T: return visitor.visit_not_equal(term=expr.term, literal=expr.literal) @visit_bound_predicate.register(BoundGreaterThanOrEqual) -def _(expr: BoundGreaterThanOrEqual[L], visitor: BoundBooleanExpressionVisitor[T]) -> T: +def _(expr: BoundGreaterThanOrEqual, visitor: BoundBooleanExpressionVisitor[T]) -> T: """Visit a bound GreaterThanOrEqual predicate.""" return visitor.visit_greater_than_or_equal(term=expr.term, literal=expr.literal) @visit_bound_predicate.register(BoundGreaterThan) -def _(expr: BoundGreaterThan[L], visitor: BoundBooleanExpressionVisitor[T]) -> T: +def _(expr: BoundGreaterThan, visitor: BoundBooleanExpressionVisitor[T]) -> T: return visitor.visit_greater_than(term=expr.term, literal=expr.literal) @visit_bound_predicate.register(BoundLessThan) -def _(expr: BoundLessThan[L], visitor: BoundBooleanExpressionVisitor[T]) -> T: +def _(expr: BoundLessThan, visitor: BoundBooleanExpressionVisitor[T]) -> T: return visitor.visit_less_than(term=expr.term, literal=expr.literal) @visit_bound_predicate.register(BoundLessThanOrEqual) -def _(expr: BoundLessThanOrEqual[L], visitor: BoundBooleanExpressionVisitor[T]) -> T: +def _(expr: BoundLessThanOrEqual, visitor: BoundBooleanExpressionVisitor[T]) -> T: return visitor.visit_less_than_or_equal(term=expr.term, literal=expr.literal) @visit_bound_predicate.register(BoundStartsWith) -def _(expr: BoundStartsWith[L], visitor: BoundBooleanExpressionVisitor[T]) -> T: +def _(expr: BoundStartsWith, visitor: BoundBooleanExpressionVisitor[T]) -> T: return visitor.visit_starts_with(term=expr.term, literal=expr.literal) @visit_bound_predicate.register(BoundNotStartsWith) -def _(expr: BoundNotStartsWith[L], visitor: BoundBooleanExpressionVisitor[T]) -> T: +def _(expr: 
BoundNotStartsWith, visitor: BoundBooleanExpressionVisitor[T]) -> T: return visitor.visit_not_starts_with(term=expr.term, literal=expr.literal) @@ -447,10 +442,10 @@ def visit_and(self, left_result: BooleanExpression, right_result: BooleanExpress def visit_or(self, left_result: BooleanExpression, right_result: BooleanExpression) -> BooleanExpression: return Or(left=left_result, right=right_result) - def visit_unbound_predicate(self, predicate: UnboundPredicate[L]) -> BooleanExpression: + def visit_unbound_predicate(self, predicate: UnboundPredicate) -> BooleanExpression: return predicate - def visit_bound_predicate(self, predicate: BoundPredicate[L]) -> BooleanExpression: + def visit_bound_predicate(self, predicate: BoundPredicate) -> BooleanExpression: return predicate @@ -469,53 +464,53 @@ def eval(self, struct: StructProtocol) -> bool: self.struct = struct return visit(self.bound, self) - def visit_in(self, term: BoundTerm[L], literals: Set[L]) -> bool: + def visit_in(self, term: BoundTerm, literals: set[L]) -> bool: return term.eval(self.struct) in literals - def visit_not_in(self, term: BoundTerm[L], literals: Set[L]) -> bool: + def visit_not_in(self, term: BoundTerm, literals: set[L]) -> bool: return term.eval(self.struct) not in literals - def visit_is_nan(self, term: BoundTerm[L]) -> bool: + def visit_is_nan(self, term: BoundTerm) -> bool: val = term.eval(self.struct) return val != val - def visit_not_nan(self, term: BoundTerm[L]) -> bool: + def visit_not_nan(self, term: BoundTerm) -> bool: val = term.eval(self.struct) return val == val - def visit_is_null(self, term: BoundTerm[L]) -> bool: + def visit_is_null(self, term: BoundTerm) -> bool: return term.eval(self.struct) is None - def visit_not_null(self, term: BoundTerm[L]) -> bool: + def visit_not_null(self, term: BoundTerm) -> bool: return term.eval(self.struct) is not None - def visit_equal(self, term: BoundTerm[L], literal: Literal[L]) -> bool: + def visit_equal(self, term: BoundTerm, literal: 
LiteralValue) -> bool: return term.eval(self.struct) == literal.value - def visit_not_equal(self, term: BoundTerm[L], literal: Literal[L]) -> bool: + def visit_not_equal(self, term: BoundTerm, literal: LiteralValue) -> bool: return term.eval(self.struct) != literal.value - def visit_greater_than_or_equal(self, term: BoundTerm[L], literal: Literal[L]) -> bool: + def visit_greater_than_or_equal(self, term: BoundTerm, literal: LiteralValue) -> bool: value = term.eval(self.struct) return value is not None and value >= literal.value - def visit_greater_than(self, term: BoundTerm[L], literal: Literal[L]) -> bool: + def visit_greater_than(self, term: BoundTerm, literal: LiteralValue) -> bool: value = term.eval(self.struct) return value is not None and value > literal.value - def visit_less_than(self, term: BoundTerm[L], literal: Literal[L]) -> bool: + def visit_less_than(self, term: BoundTerm, literal: LiteralValue) -> bool: value = term.eval(self.struct) return value is not None and value < literal.value - def visit_less_than_or_equal(self, term: BoundTerm[L], literal: Literal[L]) -> bool: + def visit_less_than_or_equal(self, term: BoundTerm, literal: LiteralValue) -> bool: value = term.eval(self.struct) return value is not None and value <= literal.value - def visit_starts_with(self, term: BoundTerm[L], literal: Literal[L]) -> bool: + def visit_starts_with(self, term: BoundTerm, literal: LiteralValue) -> bool: eval_res = term.eval(self.struct) return eval_res is not None and str(eval_res).startswith(str(literal.value)) - def visit_not_starts_with(self, term: BoundTerm[L], literal: Literal[L]) -> bool: + def visit_not_starts_with(self, term: BoundTerm, literal: LiteralValue) -> bool: return not self.visit_starts_with(term, literal) def visit_true(self) -> bool: @@ -548,7 +543,7 @@ def _from_byte_buffer(field_type: IcebergType, val: bytes) -> Any: class _ManifestEvalVisitor(BoundBooleanExpressionVisitor[bool]): - partition_fields: List[PartitionFieldSummary] + 
partition_fields: list[PartitionFieldSummary] partition_filter: BooleanExpression def __init__(self, partition_struct_schema: Schema, partition_filter: BooleanExpression, case_sensitive: bool) -> None: @@ -562,7 +557,7 @@ def eval(self, manifest: ManifestFile) -> bool: # No partition information return ROWS_MIGHT_MATCH - def visit_in(self, term: BoundTerm[L], literals: Set[L]) -> bool: + def visit_in(self, term: BoundTerm, literals: set[L]) -> bool: pos = term.ref().accessor.position field = self.partition_fields[pos] @@ -584,12 +579,12 @@ def visit_in(self, term: BoundTerm[L], literals: Set[L]) -> bool: return ROWS_MIGHT_MATCH - def visit_not_in(self, term: BoundTerm[L], literals: Set[L]) -> bool: + def visit_not_in(self, term: BoundTerm, literals: set[L]) -> bool: # because the bounds are not necessarily a min or max value, this cannot be answered using # them. notIn(col, {X, ...}) with (X, Y) doesn't guarantee that X is a value in col. return ROWS_MIGHT_MATCH - def visit_is_nan(self, term: BoundTerm[L]) -> bool: + def visit_is_nan(self, term: BoundTerm) -> bool: pos = term.ref().accessor.position field = self.partition_fields[pos] @@ -598,7 +593,7 @@ def visit_is_nan(self, term: BoundTerm[L]) -> bool: return ROWS_MIGHT_MATCH - def visit_not_nan(self, term: BoundTerm[L]) -> bool: + def visit_not_nan(self, term: BoundTerm) -> bool: pos = term.ref().accessor.position field = self.partition_fields[pos] @@ -607,7 +602,7 @@ def visit_not_nan(self, term: BoundTerm[L]) -> bool: return ROWS_MIGHT_MATCH - def visit_is_null(self, term: BoundTerm[L]) -> bool: + def visit_is_null(self, term: BoundTerm) -> bool: pos = term.ref().accessor.position if self.partition_fields[pos].contains_null is False: @@ -615,7 +610,7 @@ def visit_is_null(self, term: BoundTerm[L]) -> bool: return ROWS_MIGHT_MATCH - def visit_not_null(self, term: BoundTerm[L]) -> bool: + def visit_not_null(self, term: BoundTerm) -> bool: pos = term.ref().accessor.position # contains_null encodes whether at least 
one partition value is null, @@ -632,7 +627,7 @@ def visit_not_null(self, term: BoundTerm[L]) -> bool: return ROWS_MIGHT_MATCH - def visit_equal(self, term: BoundTerm[L], literal: Literal[L]) -> bool: + def visit_equal(self, term: BoundTerm, literal: LiteralValue) -> bool: pos = term.ref().accessor.position field = self.partition_fields[pos] @@ -652,12 +647,12 @@ def visit_equal(self, term: BoundTerm[L], literal: Literal[L]) -> bool: return ROWS_MIGHT_MATCH - def visit_not_equal(self, term: BoundTerm[L], literal: Literal[L]) -> bool: + def visit_not_equal(self, term: BoundTerm, literal: LiteralValue) -> bool: # because the bounds are not necessarily a min or max value, this cannot be answered using # them. notEq(col, X) with (X, Y) doesn't guarantee that X is a value in col. return ROWS_MIGHT_MATCH - def visit_greater_than_or_equal(self, term: BoundTerm[L], literal: Literal[L]) -> bool: + def visit_greater_than_or_equal(self, term: BoundTerm, literal: LiteralValue) -> bool: pos = term.ref().accessor.position field = self.partition_fields[pos] @@ -671,7 +666,7 @@ def visit_greater_than_or_equal(self, term: BoundTerm[L], literal: Literal[L]) - return ROWS_MIGHT_MATCH - def visit_greater_than(self, term: BoundTerm[L], literal: Literal[L]) -> bool: + def visit_greater_than(self, term: BoundTerm, literal: LiteralValue) -> bool: pos = term.ref().accessor.position field = self.partition_fields[pos] @@ -685,7 +680,7 @@ def visit_greater_than(self, term: BoundTerm[L], literal: Literal[L]) -> bool: return ROWS_MIGHT_MATCH - def visit_less_than(self, term: BoundTerm[L], literal: Literal[L]) -> bool: + def visit_less_than(self, term: BoundTerm, literal: LiteralValue) -> bool: pos = term.ref().accessor.position field = self.partition_fields[pos] @@ -699,7 +694,7 @@ def visit_less_than(self, term: BoundTerm[L], literal: Literal[L]) -> bool: return ROWS_MIGHT_MATCH - def visit_less_than_or_equal(self, term: BoundTerm[L], literal: Literal[L]) -> bool: + def 
visit_less_than_or_equal(self, term: BoundTerm, literal: LiteralValue) -> bool: pos = term.ref().accessor.position field = self.partition_fields[pos] @@ -713,7 +708,7 @@ def visit_less_than_or_equal(self, term: BoundTerm[L], literal: Literal[L]) -> b return ROWS_MIGHT_MATCH - def visit_starts_with(self, term: BoundTerm[L], literal: Literal[L]) -> bool: + def visit_starts_with(self, term: BoundTerm, literal: LiteralValue) -> bool: pos = term.ref().accessor.position field = self.partition_fields[pos] prefix = str(literal.value) @@ -737,7 +732,7 @@ def visit_starts_with(self, term: BoundTerm[L], literal: Literal[L]) -> bool: return ROWS_MIGHT_MATCH - def visit_not_starts_with(self, term: BoundTerm[L], literal: Literal[L]) -> bool: + def visit_not_starts_with(self, term: BoundTerm, literal: LiteralValue) -> bool: pos = term.ref().accessor.position field = self.partition_fields[pos] prefix = str(literal.value) @@ -824,12 +819,12 @@ def visit_and(self, left_result: BooleanExpression, right_result: BooleanExpress def visit_or(self, left_result: BooleanExpression, right_result: BooleanExpression) -> BooleanExpression: return Or(left_result, right_result) - def visit_unbound_predicate(self, predicate: UnboundPredicate[L]) -> BooleanExpression: + def visit_unbound_predicate(self, predicate: UnboundPredicate) -> BooleanExpression: raise ValueError(f"Cannot project unbound predicate: {predicate}") class InclusiveProjection(ProjectionEvaluator): - def visit_bound_predicate(self, predicate: BoundPredicate[Any]) -> BooleanExpression: + def visit_bound_predicate(self, predicate: BoundPredicate) -> BooleanExpression: parts = self.spec.fields_by_source_id(predicate.term.ref().field.field_id) result: BooleanExpression = AlwaysTrue() @@ -869,9 +864,9 @@ class _ColumnNameTranslator(BooleanExpressionVisitor[BooleanExpression]): file_schema: Schema case_sensitive: bool - projected_field_values: Dict[int, Any] + projected_field_values: dict[int, Any] - def __init__(self, file_schema: 
Schema, case_sensitive: bool, projected_field_values: Dict[int, Any] = EMPTY_DICT) -> None: + def __init__(self, file_schema: Schema, case_sensitive: bool, projected_field_values: dict[int, Any] = EMPTY_DICT) -> None: self.file_schema = file_schema self.case_sensitive = case_sensitive self.projected_field_values = projected_field_values @@ -891,10 +886,10 @@ def visit_and(self, left_result: BooleanExpression, right_result: BooleanExpress def visit_or(self, left_result: BooleanExpression, right_result: BooleanExpression) -> BooleanExpression: return Or(left=left_result, right=right_result) - def visit_unbound_predicate(self, predicate: UnboundPredicate[L]) -> BooleanExpression: + def visit_unbound_predicate(self, predicate: UnboundPredicate) -> BooleanExpression: raise TypeError(f"Expected Bound Predicate, got: {predicate.term}") - def visit_bound_predicate(self, predicate: BoundPredicate[L]) -> BooleanExpression: + def visit_bound_predicate(self, predicate: BoundPredicate) -> BooleanExpression: field = predicate.term.ref().field field_id = field.field_id file_column_name = self.file_schema.find_column_name(field_id) @@ -935,53 +930,53 @@ def visit_bound_predicate(self, predicate: BoundPredicate[L]) -> BooleanExpressi def translate_column_names( - expr: BooleanExpression, file_schema: Schema, case_sensitive: bool = True, projected_field_values: Dict[int, Any] = EMPTY_DICT + expr: BooleanExpression, file_schema: Schema, case_sensitive: bool = True, projected_field_values: dict[int, Any] = EMPTY_DICT ) -> BooleanExpression: return visit(expr, _ColumnNameTranslator(file_schema, case_sensitive, projected_field_values)) -class _ExpressionFieldIDs(BooleanExpressionVisitor[Set[int]]): +class _ExpressionFieldIDs(BooleanExpressionVisitor[set[int]]): """Extracts the field IDs used in the BooleanExpression.""" - def visit_true(self) -> Set[int]: + def visit_true(self) -> set[int]: return set() - def visit_false(self) -> Set[int]: + def visit_false(self) -> set[int]: return 
set() - def visit_not(self, child_result: Set[int]) -> Set[int]: + def visit_not(self, child_result: set[int]) -> set[int]: return child_result - def visit_and(self, left_result: Set[int], right_result: Set[int]) -> Set[int]: + def visit_and(self, left_result: set[int], right_result: set[int]) -> set[int]: return left_result.union(right_result) - def visit_or(self, left_result: Set[int], right_result: Set[int]) -> Set[int]: + def visit_or(self, left_result: set[int], right_result: set[int]) -> set[int]: return left_result.union(right_result) - def visit_unbound_predicate(self, predicate: UnboundPredicate[L]) -> Set[int]: + def visit_unbound_predicate(self, predicate: UnboundPredicate) -> set[int]: raise ValueError("Only works on bound records") - def visit_bound_predicate(self, predicate: BoundPredicate[L]) -> Set[int]: + def visit_bound_predicate(self, predicate: BoundPredicate) -> set[int]: return {predicate.term.ref().field.field_id} -def extract_field_ids(expr: BooleanExpression) -> Set[int]: +def extract_field_ids(expr: BooleanExpression) -> set[int]: return visit(expr, _ExpressionFieldIDs()) -class _RewriteToDNF(BooleanExpressionVisitor[Tuple[BooleanExpression, ...]]): - def visit_true(self) -> Tuple[BooleanExpression, ...]: +class _RewriteToDNF(BooleanExpressionVisitor[tuple[BooleanExpression, ...]]): + def visit_true(self) -> tuple[BooleanExpression, ...]: return (AlwaysTrue(),) - def visit_false(self) -> Tuple[BooleanExpression, ...]: + def visit_false(self) -> tuple[BooleanExpression, ...]: return (AlwaysFalse(),) - def visit_not(self, child_result: Tuple[BooleanExpression, ...]) -> Tuple[BooleanExpression, ...]: + def visit_not(self, child_result: tuple[BooleanExpression, ...]) -> tuple[BooleanExpression, ...]: raise ValueError(f"Not expressions are not allowed: {child_result}") def visit_and( - self, left_result: Tuple[BooleanExpression, ...], right_result: Tuple[BooleanExpression, ...] 
- ) -> Tuple[BooleanExpression, ...]: + self, left_result: tuple[BooleanExpression, ...], right_result: tuple[BooleanExpression, ...] + ) -> tuple[BooleanExpression, ...]: # Distributive law: # ((P OR Q) AND (R OR S)) AND (((P AND R) OR (P AND S)) OR ((Q AND R) OR ((Q AND S))) # A AND (B OR C) = (A AND B) OR (A AND C) @@ -989,31 +984,31 @@ def visit_and( return tuple(And(le, re) for le in left_result for re in right_result) def visit_or( - self, left_result: Tuple[BooleanExpression, ...], right_result: Tuple[BooleanExpression, ...] - ) -> Tuple[BooleanExpression, ...]: + self, left_result: tuple[BooleanExpression, ...], right_result: tuple[BooleanExpression, ...] + ) -> tuple[BooleanExpression, ...]: return left_result + right_result - def visit_unbound_predicate(self, predicate: UnboundPredicate[L]) -> Tuple[BooleanExpression, ...]: + def visit_unbound_predicate(self, predicate: UnboundPredicate) -> tuple[BooleanExpression, ...]: return (predicate,) - def visit_bound_predicate(self, predicate: BoundPredicate[L]) -> Tuple[BooleanExpression, ...]: + def visit_bound_predicate(self, predicate: BoundPredicate) -> tuple[BooleanExpression, ...]: return (predicate,) -def rewrite_to_dnf(expr: BooleanExpression) -> Tuple[BooleanExpression, ...]: +def rewrite_to_dnf(expr: BooleanExpression) -> tuple[BooleanExpression, ...]: # Rewrites an arbitrary boolean expression to disjunctive normal form (DNF): # (A AND NOT(B) AND C) OR (NOT(D) AND E AND F) OR (G) expr_without_not = rewrite_not(expr) return visit(expr_without_not, _RewriteToDNF()) -class ExpressionToPlainFormat(BoundBooleanExpressionVisitor[List[Tuple[str, str, Any]]]): +class ExpressionToPlainFormat(BoundBooleanExpressionVisitor[list[tuple[str, str, Any]]]): cast_int_to_date: bool def __init__(self, cast_int_to_date: bool = False) -> None: self.cast_int_to_date = cast_int_to_date - def _cast_if_necessary(self, iceberg_type: IcebergType, literal: L | Set[L]) -> L | Set[L]: + def _cast_if_necessary(self, iceberg_type: 
IcebergType, literal: L | set[L]) -> L | set[L]: if self.cast_int_to_date: iceberg_type_class = type(iceberg_type) conversions = {TimestampType: micros_to_timestamp, TimestamptzType: micros_to_timestamptz} @@ -1025,73 +1020,73 @@ def _cast_if_necessary(self, iceberg_type: IcebergType, literal: L | Set[L]) -> return conversion_function(literal) # type: ignore return literal - def visit_in(self, term: BoundTerm[L], literals: Set[L]) -> List[Tuple[str, str, Any]]: + def visit_in(self, term: BoundTerm, literals: set[L]) -> list[tuple[str, str, Any]]: field = term.ref().field return [(term.ref().field.name, "in", self._cast_if_necessary(field.field_type, literals))] - def visit_not_in(self, term: BoundTerm[L], literals: Set[L]) -> List[Tuple[str, str, Any]]: + def visit_not_in(self, term: BoundTerm, literals: set[L]) -> list[tuple[str, str, Any]]: field = term.ref().field return [(field.name, "not in", self._cast_if_necessary(field.field_type, literals))] - def visit_is_nan(self, term: BoundTerm[L]) -> List[Tuple[str, str, Any]]: + def visit_is_nan(self, term: BoundTerm) -> list[tuple[str, str, Any]]: return [(term.ref().field.name, "==", float("nan"))] - def visit_not_nan(self, term: BoundTerm[L]) -> List[Tuple[str, str, Any]]: + def visit_not_nan(self, term: BoundTerm) -> list[tuple[str, str, Any]]: return [(term.ref().field.name, "!=", float("nan"))] - def visit_is_null(self, term: BoundTerm[L]) -> List[Tuple[str, str, Any]]: + def visit_is_null(self, term: BoundTerm) -> list[tuple[str, str, Any]]: return [(term.ref().field.name, "==", None)] - def visit_not_null(self, term: BoundTerm[L]) -> List[Tuple[str, str, Any]]: + def visit_not_null(self, term: BoundTerm) -> list[tuple[str, str, Any]]: return [(term.ref().field.name, "!=", None)] - def visit_equal(self, term: BoundTerm[L], literal: Literal[L]) -> List[Tuple[str, str, Any]]: + def visit_equal(self, term: BoundTerm, literal: LiteralValue) -> list[tuple[str, str, Any]]: return [(term.ref().field.name, "==", 
self._cast_if_necessary(term.ref().field.field_type, literal.value))] - def visit_not_equal(self, term: BoundTerm[L], literal: Literal[L]) -> List[Tuple[str, str, Any]]: + def visit_not_equal(self, term: BoundTerm, literal: LiteralValue) -> list[tuple[str, str, Any]]: return [(term.ref().field.name, "!=", self._cast_if_necessary(term.ref().field.field_type, literal.value))] - def visit_greater_than_or_equal(self, term: BoundTerm[L], literal: Literal[L]) -> List[Tuple[str, str, Any]]: + def visit_greater_than_or_equal(self, term: BoundTerm, literal: LiteralValue) -> list[tuple[str, str, Any]]: return [(term.ref().field.name, ">=", self._cast_if_necessary(term.ref().field.field_type, literal.value))] - def visit_greater_than(self, term: BoundTerm[L], literal: Literal[L]) -> List[Tuple[str, str, Any]]: + def visit_greater_than(self, term: BoundTerm, literal: LiteralValue) -> list[tuple[str, str, Any]]: return [(term.ref().field.name, ">", self._cast_if_necessary(term.ref().field.field_type, literal.value))] - def visit_less_than(self, term: BoundTerm[L], literal: Literal[L]) -> List[Tuple[str, str, Any]]: + def visit_less_than(self, term: BoundTerm, literal: LiteralValue) -> list[tuple[str, str, Any]]: return [(term.ref().field.name, "<", self._cast_if_necessary(term.ref().field.field_type, literal.value))] - def visit_less_than_or_equal(self, term: BoundTerm[L], literal: Literal[L]) -> List[Tuple[str, str, Any]]: + def visit_less_than_or_equal(self, term: BoundTerm, literal: LiteralValue) -> list[tuple[str, str, Any]]: return [(term.ref().field.name, "<=", self._cast_if_necessary(term.ref().field.field_type, literal.value))] - def visit_starts_with(self, term: BoundTerm[L], literal: Literal[L]) -> List[Tuple[str, str, Any]]: + def visit_starts_with(self, term: BoundTerm, literal: LiteralValue) -> list[tuple[str, str, Any]]: return [] - def visit_not_starts_with(self, term: BoundTerm[L], literal: Literal[L]) -> List[Tuple[str, str, Any]]: + def 
visit_not_starts_with(self, term: BoundTerm, literal: LiteralValue) -> list[tuple[str, str, Any]]: return [] - def visit_true(self) -> List[Tuple[str, str, Any]]: + def visit_true(self) -> list[tuple[str, str, Any]]: return [] # Not supported - def visit_false(self) -> List[Tuple[str, str, Any]]: + def visit_false(self) -> list[tuple[str, str, Any]]: raise ValueError("Not supported: AlwaysFalse") - def visit_not(self, child_result: List[Tuple[str, str, Any]]) -> List[Tuple[str, str, Any]]: + def visit_not(self, child_result: list[tuple[str, str, Any]]) -> list[tuple[str, str, Any]]: raise ValueError(f"Not allowed: {child_result}") def visit_and( - self, left_result: List[Tuple[str, str, Any]], right_result: List[Tuple[str, str, Any]] - ) -> List[Tuple[str, str, Any]]: + self, left_result: list[tuple[str, str, Any]], right_result: list[tuple[str, str, Any]] + ) -> list[tuple[str, str, Any]]: return left_result + right_result def visit_or( - self, left_result: List[Tuple[str, str, Any]], right_result: List[Tuple[str, str, Any]] - ) -> List[Tuple[str, str, Any]]: + self, left_result: list[tuple[str, str, Any]], right_result: list[tuple[str, str, Any]] + ) -> list[tuple[str, str, Any]]: raise ValueError(f"Not allowed: {left_result} || {right_result}") def expression_to_plain_format( - expressions: Tuple[BooleanExpression, ...], cast_int_to_datetime: bool = False -) -> List[List[Tuple[str, str, Any]]]: + expressions: tuple[BooleanExpression, ...], cast_int_to_datetime: bool = False +) -> list[list[tuple[str, str, Any]]]: """Format a Disjunctive Normal Form expression. 
These are the formats that the expression can be fed into: @@ -1117,11 +1112,11 @@ def expression_to_plain_format( class _MetricsEvaluator(BoundBooleanExpressionVisitor[bool], ABC): - value_counts: Dict[int, int] - null_counts: Dict[int, int] - nan_counts: Dict[int, int] - lower_bounds: Dict[int, bytes] - upper_bounds: Dict[int, bytes] + value_counts: dict[int, int] + null_counts: dict[int, int] + nan_counts: dict[int, int] + lower_bounds: dict[int, bytes] + upper_bounds: dict[int, bytes] def visit_true(self) -> bool: # all rows match @@ -1196,7 +1191,7 @@ def _contains_nans_only(self, field_id: int) -> bool: return nan_count == value_count return False - def visit_is_null(self, term: BoundTerm[L]) -> bool: + def visit_is_null(self, term: BoundTerm) -> bool: field_id = term.ref().field.field_id if self.null_counts.get(field_id) == 0: @@ -1204,7 +1199,7 @@ def visit_is_null(self, term: BoundTerm[L]) -> bool: return ROWS_MIGHT_MATCH - def visit_not_null(self, term: BoundTerm[L]) -> bool: + def visit_not_null(self, term: BoundTerm) -> bool: # no need to check whether the field is required because binding evaluates that case # if the column has no non-null values, the expression cannot match field_id = term.ref().field.field_id @@ -1214,7 +1209,7 @@ def visit_not_null(self, term: BoundTerm[L]) -> bool: return ROWS_MIGHT_MATCH - def visit_is_nan(self, term: BoundTerm[L]) -> bool: + def visit_is_nan(self, term: BoundTerm) -> bool: field_id = term.ref().field.field_id if self.nan_counts.get(field_id) == 0: @@ -1227,7 +1222,7 @@ def visit_is_nan(self, term: BoundTerm[L]) -> bool: return ROWS_MIGHT_MATCH - def visit_not_nan(self, term: BoundTerm[L]) -> bool: + def visit_not_nan(self, term: BoundTerm) -> bool: field_id = term.ref().field.field_id if self._contains_nans_only(field_id): @@ -1235,7 +1230,7 @@ def visit_not_nan(self, term: BoundTerm[L]) -> bool: return ROWS_MIGHT_MATCH - def visit_less_than(self, term: BoundTerm[L], literal: Literal[L]) -> bool: + def 
visit_less_than(self, term: BoundTerm, literal: LiteralValue) -> bool: field = term.ref().field field_id = field.field_id @@ -1252,12 +1247,12 @@ def visit_less_than(self, term: BoundTerm[L], literal: Literal[L]) -> bool: # NaN indicates unreliable bounds. See the InclusiveMetricsEvaluator docs for more. return ROWS_MIGHT_MATCH - if lower_bound >= literal.value: # type: ignore[operator] + if lower_bound >= literal.value: return ROWS_CANNOT_MATCH return ROWS_MIGHT_MATCH - def visit_less_than_or_equal(self, term: BoundTerm[L], literal: Literal[L]) -> bool: + def visit_less_than_or_equal(self, term: BoundTerm, literal: LiteralValue) -> bool: field = term.ref().field field_id = field.field_id @@ -1273,12 +1268,12 @@ def visit_less_than_or_equal(self, term: BoundTerm[L], literal: Literal[L]) -> b # NaN indicates unreliable bounds. See the InclusiveMetricsEvaluator docs for more. return ROWS_MIGHT_MATCH - if lower_bound > literal.value: # type: ignore[operator] + if lower_bound > literal.value: return ROWS_CANNOT_MATCH return ROWS_MIGHT_MATCH - def visit_greater_than(self, term: BoundTerm[L], literal: Literal[L]) -> bool: + def visit_greater_than(self, term: BoundTerm, literal: LiteralValue) -> bool: field = term.ref().field field_id = field.field_id @@ -1290,7 +1285,7 @@ def visit_greater_than(self, term: BoundTerm[L], literal: Literal[L]) -> bool: if upper_bound_bytes := self.upper_bounds.get(field_id): upper_bound = from_bytes(field.field_type, upper_bound_bytes) - if upper_bound <= literal.value: # type: ignore[operator] + if upper_bound <= literal.value: if self._is_nan(upper_bound): # NaN indicates unreliable bounds. See the InclusiveMetricsEvaluator docs for more. 
return ROWS_MIGHT_MATCH @@ -1299,7 +1294,7 @@ def visit_greater_than(self, term: BoundTerm[L], literal: Literal[L]) -> bool: return ROWS_MIGHT_MATCH - def visit_greater_than_or_equal(self, term: BoundTerm[L], literal: Literal[L]) -> bool: + def visit_greater_than_or_equal(self, term: BoundTerm, literal: LiteralValue) -> bool: field = term.ref().field field_id = field.field_id @@ -1311,7 +1306,7 @@ def visit_greater_than_or_equal(self, term: BoundTerm[L], literal: Literal[L]) - if upper_bound_bytes := self.upper_bounds.get(field_id): upper_bound = from_bytes(field.field_type, upper_bound_bytes) - if upper_bound < literal.value: # type: ignore[operator] + if upper_bound < literal.value: if self._is_nan(upper_bound): # NaN indicates unreliable bounds. See the InclusiveMetricsEvaluator docs for more. return ROWS_MIGHT_MATCH @@ -1320,7 +1315,7 @@ def visit_greater_than_or_equal(self, term: BoundTerm[L], literal: Literal[L]) - return ROWS_MIGHT_MATCH - def visit_equal(self, term: BoundTerm[L], literal: Literal[L]) -> bool: + def visit_equal(self, term: BoundTerm, literal: LiteralValue) -> bool: field = term.ref().field field_id = field.field_id @@ -1336,7 +1331,7 @@ def visit_equal(self, term: BoundTerm[L], literal: Literal[L]) -> bool: # NaN indicates unreliable bounds. See the InclusiveMetricsEvaluator docs for more. return ROWS_MIGHT_MATCH - if lower_bound > literal.value: # type: ignore[operator] + if lower_bound > literal.value: return ROWS_CANNOT_MATCH if upper_bound_bytes := self.upper_bounds.get(field_id): @@ -1345,15 +1340,15 @@ def visit_equal(self, term: BoundTerm[L], literal: Literal[L]) -> bool: # NaN indicates unreliable bounds. See the InclusiveMetricsEvaluator docs for more. 
return ROWS_MIGHT_MATCH - if upper_bound < literal.value: # type: ignore[operator] + if upper_bound < literal.value: return ROWS_CANNOT_MATCH return ROWS_MIGHT_MATCH - def visit_not_equal(self, term: BoundTerm[L], literal: Literal[L]) -> bool: + def visit_not_equal(self, term: BoundTerm, literal: LiteralValue) -> bool: return ROWS_MIGHT_MATCH - def visit_in(self, term: BoundTerm[L], literals: Set[L]) -> bool: + def visit_in(self, term: BoundTerm, literals: set[L]) -> bool: field = term.ref().field field_id = field.field_id @@ -1389,12 +1384,12 @@ def visit_in(self, term: BoundTerm[L], literals: Set[L]) -> bool: return ROWS_MIGHT_MATCH - def visit_not_in(self, term: BoundTerm[L], literals: Set[L]) -> bool: + def visit_not_in(self, term: BoundTerm, literals: set[L]) -> bool: # because the bounds are not necessarily a min or max value, this cannot be answered using # them. notIn(col, {X, ...}) with (X, Y) doesn't guarantee that X is a value in col. return ROWS_MIGHT_MATCH - def visit_starts_with(self, term: BoundTerm[L], literal: Literal[L]) -> bool: + def visit_starts_with(self, term: BoundTerm, literal: LiteralValue) -> bool: field = term.ref().field field_id: int = field.field_id @@ -1423,7 +1418,7 @@ def visit_starts_with(self, term: BoundTerm[L], literal: Literal[L]) -> bool: return ROWS_MIGHT_MATCH - def visit_not_starts_with(self, term: BoundTerm[L], literal: Literal[L]) -> bool: + def visit_not_starts_with(self, term: BoundTerm, literal: LiteralValue) -> bool: field = term.ref().field field_id: int = field.field_id @@ -1464,7 +1459,7 @@ def strict_projection( class StrictProjection(ProjectionEvaluator): - def visit_bound_predicate(self, predicate: BoundPredicate[Any]) -> BooleanExpression: + def visit_bound_predicate(self, predicate: BoundPredicate) -> BooleanExpression: parts = self.spec.fields_by_source_id(predicate.term.ref().field.field_id) result: BooleanExpression = AlwaysFalse() @@ -1515,7 +1510,7 @@ def eval(self, file: DataFile) -> bool: return 
visit(self.expr, self) - def visit_is_null(self, term: BoundTerm[L]) -> bool: + def visit_is_null(self, term: BoundTerm) -> bool: # no need to check whether the field is required because binding evaluates that case # if the column has any non-null values, the expression does not match field_id = term.ref().field.field_id @@ -1525,7 +1520,7 @@ def visit_is_null(self, term: BoundTerm[L]) -> bool: else: return ROWS_MIGHT_NOT_MATCH - def visit_not_null(self, term: BoundTerm[L]) -> bool: + def visit_not_null(self, term: BoundTerm) -> bool: # no need to check whether the field is required because binding evaluates that case # if the column has any non-null values, the expression does not match field_id = term.ref().field.field_id @@ -1535,7 +1530,7 @@ def visit_not_null(self, term: BoundTerm[L]) -> bool: else: return ROWS_MIGHT_NOT_MATCH - def visit_is_nan(self, term: BoundTerm[L]) -> bool: + def visit_is_nan(self, term: BoundTerm) -> bool: field_id = term.ref().field.field_id if self._contains_nans_only(field_id): @@ -1543,7 +1538,7 @@ def visit_is_nan(self, term: BoundTerm[L]) -> bool: else: return ROWS_MIGHT_NOT_MATCH - def visit_not_nan(self, term: BoundTerm[L]) -> bool: + def visit_not_nan(self, term: BoundTerm) -> bool: field_id = term.ref().field.field_id if (nan_count := self.nan_counts.get(field_id)) is not None and nan_count == 0: @@ -1554,7 +1549,7 @@ def visit_not_nan(self, term: BoundTerm[L]) -> bool: return ROWS_MIGHT_NOT_MATCH - def visit_less_than(self, term: BoundTerm[L], literal: Literal[L]) -> bool: + def visit_less_than(self, term: BoundTerm, literal: LiteralValue) -> bool: # Rows must match when: <----------Min----Max---X-------> field_id = term.ref().field.field_id @@ -1571,7 +1566,7 @@ def visit_less_than(self, term: BoundTerm[L], literal: Literal[L]) -> bool: return ROWS_MIGHT_NOT_MATCH - def visit_less_than_or_equal(self, term: BoundTerm[L], literal: Literal[L]) -> bool: + def visit_less_than_or_equal(self, term: BoundTerm, literal: LiteralValue) 
-> bool: # Rows must match when: <----------Min----Max---X-------> field_id = term.ref().field.field_id @@ -1588,7 +1583,7 @@ def visit_less_than_or_equal(self, term: BoundTerm[L], literal: Literal[L]) -> b return ROWS_MIGHT_NOT_MATCH - def visit_greater_than(self, term: BoundTerm[L], literal: Literal[L]) -> bool: + def visit_greater_than(self, term: BoundTerm, literal: LiteralValue) -> bool: # Rows must match when: <-------X---Min----Max----------> field_id = term.ref().field.field_id @@ -1610,7 +1605,7 @@ def visit_greater_than(self, term: BoundTerm[L], literal: Literal[L]) -> bool: return ROWS_MIGHT_NOT_MATCH - def visit_greater_than_or_equal(self, term: BoundTerm[L], literal: Literal[L]) -> bool: + def visit_greater_than_or_equal(self, term: BoundTerm, literal: LiteralValue) -> bool: # Rows must match when: <-------X---Min----Max----------> field_id = term.ref().field.field_id @@ -1631,7 +1626,7 @@ def visit_greater_than_or_equal(self, term: BoundTerm[L], literal: Literal[L]) - return ROWS_MIGHT_NOT_MATCH - def visit_equal(self, term: BoundTerm[L], literal: Literal[L]) -> bool: + def visit_equal(self, term: BoundTerm, literal: LiteralValue) -> bool: # Rows must match when Min == X == Max field_id = term.ref().field.field_id @@ -1650,7 +1645,7 @@ def visit_equal(self, term: BoundTerm[L], literal: Literal[L]) -> bool: return ROWS_MIGHT_NOT_MATCH - def visit_not_equal(self, term: BoundTerm[L], literal: Literal[L]) -> bool: + def visit_not_equal(self, term: BoundTerm, literal: LiteralValue) -> bool: # Rows must match when X < Min or Max < X because it is not in the range field_id = term.ref().field.field_id @@ -1678,7 +1673,7 @@ def visit_not_equal(self, term: BoundTerm[L], literal: Literal[L]) -> bool: return ROWS_MIGHT_NOT_MATCH - def visit_in(self, term: BoundTerm[L], literals: Set[L]) -> bool: + def visit_in(self, term: BoundTerm, literals: set[L]) -> bool: field_id = term.ref().field.field_id if self._can_contain_nulls(field_id) or 
self._can_contain_nans(field_id): @@ -1707,7 +1702,7 @@ def visit_in(self, term: BoundTerm[L], literals: Set[L]) -> bool: return ROWS_MIGHT_NOT_MATCH - def visit_not_in(self, term: BoundTerm[L], literals: Set[L]) -> bool: + def visit_not_in(self, term: BoundTerm, literals: set[L]) -> bool: field_id = term.ref().field.field_id if self._can_contain_nulls(field_id) or self._can_contain_nans(field_id): @@ -1737,10 +1732,10 @@ def visit_not_in(self, term: BoundTerm[L], literals: Set[L]) -> bool: return ROWS_MIGHT_NOT_MATCH - def visit_starts_with(self, term: BoundTerm[L], literal: Literal[L]) -> bool: + def visit_starts_with(self, term: BoundTerm, literal: LiteralValue) -> bool: return ROWS_MIGHT_NOT_MATCH - def visit_not_starts_with(self, term: BoundTerm[L], literal: Literal[L]) -> bool: + def visit_not_starts_with(self, term: BoundTerm, literal: LiteralValue) -> bool: return ROWS_MIGHT_NOT_MATCH def _get_field(self, field_id: int) -> NestedField: @@ -1803,94 +1798,94 @@ def visit_and(self, left_result: BooleanExpression, right_result: BooleanExpress def visit_or(self, left_result: BooleanExpression, right_result: BooleanExpression) -> BooleanExpression: return Or(left_result, right_result) - def visit_is_null(self, term: BoundTerm[L]) -> BooleanExpression: + def visit_is_null(self, term: BoundTerm) -> BooleanExpression: if term.eval(self.struct) is None: return AlwaysTrue() else: return AlwaysFalse() - def visit_not_null(self, term: BoundTerm[L]) -> BooleanExpression: + def visit_not_null(self, term: BoundTerm) -> BooleanExpression: if term.eval(self.struct) is not None: return AlwaysTrue() else: return AlwaysFalse() - def visit_is_nan(self, term: BoundTerm[L]) -> BooleanExpression: + def visit_is_nan(self, term: BoundTerm) -> BooleanExpression: val = term.eval(self.struct) if isinstance(val, SupportsFloat) and math.isnan(val): return self.visit_true() else: return self.visit_false() - def visit_not_nan(self, term: BoundTerm[L]) -> BooleanExpression: + def 
visit_not_nan(self, term: BoundTerm) -> BooleanExpression: val = term.eval(self.struct) if isinstance(val, SupportsFloat) and not math.isnan(val): return self.visit_true() else: return self.visit_false() - def visit_less_than(self, term: BoundTerm[L], literal: Literal[L]) -> BooleanExpression: + def visit_less_than(self, term: BoundTerm, literal: LiteralValue) -> BooleanExpression: if term.eval(self.struct) < literal.value: return self.visit_true() else: return self.visit_false() - def visit_less_than_or_equal(self, term: BoundTerm[L], literal: Literal[L]) -> BooleanExpression: + def visit_less_than_or_equal(self, term: BoundTerm, literal: LiteralValue) -> BooleanExpression: if term.eval(self.struct) <= literal.value: return self.visit_true() else: return self.visit_false() - def visit_greater_than(self, term: BoundTerm[L], literal: Literal[L]) -> BooleanExpression: + def visit_greater_than(self, term: BoundTerm, literal: LiteralValue) -> BooleanExpression: if term.eval(self.struct) > literal.value: return self.visit_true() else: return self.visit_false() - def visit_greater_than_or_equal(self, term: BoundTerm[L], literal: Literal[L]) -> BooleanExpression: + def visit_greater_than_or_equal(self, term: BoundTerm, literal: LiteralValue) -> BooleanExpression: if term.eval(self.struct) >= literal.value: return self.visit_true() else: return self.visit_false() - def visit_equal(self, term: BoundTerm[L], literal: Literal[L]) -> BooleanExpression: + def visit_equal(self, term: BoundTerm, literal: LiteralValue) -> BooleanExpression: if term.eval(self.struct) == literal.value: return self.visit_true() else: return self.visit_false() - def visit_not_equal(self, term: BoundTerm[L], literal: Literal[L]) -> BooleanExpression: + def visit_not_equal(self, term: BoundTerm, literal: LiteralValue) -> BooleanExpression: if term.eval(self.struct) != literal.value: return self.visit_true() else: return self.visit_false() - def visit_in(self, term: BoundTerm[L], literals: Set[L]) -> 
BooleanExpression: + def visit_in(self, term: BoundTerm, literals: set[L]) -> BooleanExpression: if term.eval(self.struct) in literals: return self.visit_true() else: return self.visit_false() - def visit_not_in(self, term: BoundTerm[L], literals: Set[L]) -> BooleanExpression: + def visit_not_in(self, term: BoundTerm, literals: set[L]) -> BooleanExpression: if term.eval(self.struct) not in literals: return self.visit_true() else: return self.visit_false() - def visit_starts_with(self, term: BoundTerm[L], literal: Literal[L]) -> BooleanExpression: + def visit_starts_with(self, term: BoundTerm, literal: LiteralValue) -> BooleanExpression: eval_res = term.eval(self.struct) if eval_res is not None and str(eval_res).startswith(str(literal.value)): return AlwaysTrue() else: return AlwaysFalse() - def visit_not_starts_with(self, term: BoundTerm[L], literal: Literal[L]) -> BooleanExpression: + def visit_not_starts_with(self, term: BoundTerm, literal: LiteralValue) -> BooleanExpression: if not self.visit_starts_with(term, literal): return AlwaysTrue() else: return AlwaysFalse() - def visit_bound_predicate(self, predicate: BoundPredicate[Any]) -> BooleanExpression: + def visit_bound_predicate(self, predicate: BoundPredicate) -> BooleanExpression: """ If there is no strict projection or if it evaluates to false, then return the predicate. 
@@ -1944,7 +1939,7 @@ def struct_to_schema(struct: StructType) -> Schema: return predicate - def visit_unbound_predicate(self, predicate: UnboundPredicate[L]) -> BooleanExpression: + def visit_unbound_predicate(self, predicate: UnboundPredicate) -> BooleanExpression: bound = predicate.bind(self.schema, case_sensitive=self.case_sensitive) if isinstance(bound, BoundPredicate): diff --git a/pyiceberg/io/__init__.py b/pyiceberg/io/__init__.py index 1915afcd0b..c7109993c9 100644 --- a/pyiceberg/io/__init__.py +++ b/pyiceberg/io/__init__.py @@ -32,10 +32,7 @@ from io import SEEK_SET from types import TracebackType from typing import ( - Dict, - List, Protocol, - Type, runtime_checkable, ) from urllib.parse import urlparse @@ -126,7 +123,7 @@ def __enter__(self) -> InputStream: """Provide setup when opening an InputStream using a 'with' statement.""" @abstractmethod - def __exit__(self, exctype: Type[BaseException] | None, excinst: BaseException | None, exctb: TracebackType | None) -> None: + def __exit__(self, exctype: type[BaseException] | None, excinst: BaseException | None, exctb: TracebackType | None) -> None: """Perform cleanup when exiting the scope of a 'with' statement.""" @@ -149,7 +146,7 @@ def __enter__(self) -> OutputStream: """Provide setup when opening an OutputStream using a 'with' statement.""" @abstractmethod - def __exit__(self, exctype: Type[BaseException] | None, excinst: BaseException | None, exctb: TracebackType | None) -> None: + def __exit__(self, exctype: type[BaseException] | None, excinst: BaseException | None, exctb: TracebackType | None) -> None: """Perform cleanup when exiting the scope of a 'with' statement.""" @@ -298,7 +295,7 @@ def delete(self, location: str | InputFile | OutputFile) -> None: # Mappings from the Java FileIO impl to a Python one. The list is ordered by preference. # If an implementation isn't installed, it will fall back to the next one. 
-SCHEMA_TO_FILE_IO: Dict[str, List[str]] = { +SCHEMA_TO_FILE_IO: dict[str, list[str]] = { "s3": [ARROW_FILE_IO, FSSPEC_FILE_IO], "s3a": [ARROW_FILE_IO, FSSPEC_FILE_IO], "s3n": [ARROW_FILE_IO, FSSPEC_FILE_IO], @@ -340,7 +337,7 @@ def _infer_file_io_from_scheme(path: str, properties: Properties) -> FileIO | No if file_io := _import_file_io(file_io_path, properties): return file_io else: - warnings.warn(f"No preferred file implementation for scheme: {parsed_url.scheme}") + warnings.warn(f"No preferred file implementation for scheme: {parsed_url.scheme}", stacklevel=2) return None diff --git a/pyiceberg/io/fsspec.py b/pyiceberg/io/fsspec.py index 8f2fcc4312..5898a22675 100644 --- a/pyiceberg/io/fsspec.py +++ b/pyiceberg/io/fsspec.py @@ -22,14 +22,12 @@ import logging import os import threading +from collections.abc import Callable from copy import copy from functools import lru_cache from typing import ( TYPE_CHECKING, Any, - Callable, - Dict, - Type, ) from urllib.parse import urlparse @@ -148,7 +146,7 @@ def __call__(self, request: "AWSRequest", **_: Any) -> None: request.url = response_json["uri"] -SIGNERS: Dict[str, Type[S3RequestSigner]] = {"S3V4RestSigner": S3V4RestSigner} +SIGNERS: dict[str, type[S3RequestSigner]] = {"S3V4RestSigner": S3V4RestSigner} def _file(_: Properties) -> LocalFileSystem: @@ -166,7 +164,7 @@ def _s3(properties: Properties) -> AbstractFileSystem: "region_name": get_first_property_value(properties, S3_REGION, AWS_REGION), } config_kwargs = {} - register_events: Dict[str, Callable[[AWSRequest], None]] = {} + register_events: dict[str, Callable[[AWSRequest], None]] = {} if signer := properties.get(S3_SIGNER): logger.info("Loading signer %s", signer) @@ -455,13 +453,13 @@ def _get_fs(self, scheme: str) -> AbstractFileSystem: raise ValueError(f"No registered filesystem for scheme: {scheme}") return self._scheme_to_fs[scheme](self.properties) - def __getstate__(self) -> Dict[str, Any]: + def __getstate__(self) -> dict[str, Any]: """Create a 
dictionary of the FsSpecFileIO fields used when pickling.""" fileio_copy = copy(self.__dict__) del fileio_copy["_thread_locals"] return fileio_copy - def __setstate__(self, state: Dict[str, Any]) -> None: + def __setstate__(self, state: dict[str, Any]) -> None: """Deserialize the state into a FsSpecFileIO instance.""" self.__dict__ = state self._thread_locals = threading.local() diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py index 5be4c5d241..d98e3fa713 100644 --- a/pyiceberg/io/pyarrow.py +++ b/pyiceberg/io/pyarrow.py @@ -25,6 +25,7 @@ from __future__ import annotations +import builtins import fnmatch import functools import importlib @@ -36,6 +37,7 @@ import uuid import warnings from abc import ABC, abstractmethod +from collections.abc import Callable, Iterable, Iterator from copy import copy from dataclasses import dataclass from enum import Enum @@ -43,14 +45,7 @@ from typing import ( TYPE_CHECKING, Any, - Callable, - Dict, Generic, - Iterable, - Iterator, - List, - Set, - Tuple, TypeVar, cast, ) @@ -201,6 +196,7 @@ PYARROW_PARQUET_FIELD_ID_KEY = b"PARQUET:field_id" # ORC field ID key for Iceberg field IDs in ORC metadata ORC_FIELD_ID_KEY = b"iceberg.id" +ORC_FIELD_REQUIRED_KEY = b"iceberg.required" PYARROW_FIELD_DOC_KEY = b"doc" LIST_ELEMENT_NAME = "element" MAP_KEY_NAME = "key" @@ -232,7 +228,7 @@ def _import_retry_strategy(impl: str) -> S3RetryStrategy | None: class_ = getattr(module, class_name) return class_() except (ModuleNotFoundError, AttributeError): - warnings.warn(f"Could not initialize S3 retry strategy: {impl}") + warnings.warn(f"Could not initialize S3 retry strategy: {impl}", stacklevel=2) return None @@ -392,7 +388,7 @@ def __init__(self, properties: Properties = EMPTY_DICT): super().__init__(properties=properties) @staticmethod - def parse_location(location: str, properties: Properties = EMPTY_DICT) -> Tuple[str, str, str]: + def parse_location(location: str, properties: Properties = EMPTY_DICT) -> tuple[str, str, str]: 
"""Return (scheme, netloc, path) for the given location. Uses DEFAULT_SCHEME and DEFAULT_NETLOC if scheme/netloc are missing. @@ -434,7 +430,7 @@ def _initialize_fs(self, scheme: str, netloc: str | None = None) -> FileSystem: def _initialize_oss_fs(self) -> FileSystem: from pyarrow.fs import S3FileSystem - client_kwargs: Dict[str, Any] = { + client_kwargs: dict[str, Any] = { "endpoint_override": self.properties.get(S3_ENDPOINT), "access_key": get_first_property_value(self.properties, S3_ACCESS_KEY_ID, AWS_ACCESS_KEY_ID), "secret_key": get_first_property_value(self.properties, S3_SECRET_ACCESS_KEY, AWS_SECRET_ACCESS_KEY), @@ -481,7 +477,7 @@ def _initialize_s3_fs(self, netloc: str | None) -> FileSystem: else: bucket_region = provided_region - client_kwargs: Dict[str, Any] = { + client_kwargs: dict[str, Any] = { "endpoint_override": self.properties.get(S3_ENDPOINT), "access_key": get_first_property_value(self.properties, S3_ACCESS_KEY_ID, AWS_ACCESS_KEY_ID), "secret_key": get_first_property_value(self.properties, S3_SECRET_ACCESS_KEY, AWS_SECRET_ACCESS_KEY), @@ -530,7 +526,7 @@ def _initialize_azure_fs(self) -> FileSystem: from pyarrow.fs import AzureFileSystem - client_kwargs: Dict[str, str] = {} + client_kwargs: dict[str, str] = {} if account_name := self.properties.get(ADLS_ACCOUNT_NAME): client_kwargs["account_name"] = account_name @@ -576,7 +572,7 @@ def _initialize_azure_fs(self) -> FileSystem: def _initialize_hdfs_fs(self, scheme: str, netloc: str | None) -> FileSystem: from pyarrow.fs import HadoopFileSystem - hdfs_kwargs: Dict[str, Any] = {} + hdfs_kwargs: dict[str, Any] = {} if netloc: return HadoopFileSystem.from_uri(f"{scheme}://{netloc}") if host := self.properties.get(HDFS_HOST): @@ -594,7 +590,7 @@ def _initialize_hdfs_fs(self, scheme: str, netloc: str | None) -> FileSystem: def _initialize_gcs_fs(self) -> FileSystem: from pyarrow.fs import GcsFileSystem - gcs_kwargs: Dict[str, Any] = {} + gcs_kwargs: dict[str, Any] = {} if access_token := 
self.properties.get(GCS_TOKEN): gcs_kwargs["access_token"] = access_token if expiration := self.properties.get(GCS_TOKEN_EXPIRES_AT_MS): @@ -674,13 +670,13 @@ def delete(self, location: str | InputFile | OutputFile) -> None: raise PermissionError(f"Cannot delete file, access denied: {location}") from e raise # pragma: no cover - If some other kind of OSError, raise the raw error - def __getstate__(self) -> Dict[str, Any]: + def __getstate__(self) -> dict[str, Any]: """Create a dictionary of the PyArrowFileIO fields used when pickling.""" fileio_copy = copy(self.__dict__) fileio_copy["fs_by_scheme"] = None return fileio_copy - def __setstate__(self, state: Dict[str, Any]) -> None: + def __setstate__(self, state: dict[str, Any]) -> None: """Deserialize the state into a PyArrowFileIO instance.""" self.__dict__ = state self.fs_by_scheme = lru_cache(self._initialize_fs) @@ -688,7 +684,7 @@ def __setstate__(self, state: Dict[str, Any]) -> None: def schema_to_pyarrow( schema: Schema | IcebergType, - metadata: Dict[bytes, bytes] = EMPTY_DICT, + metadata: dict[bytes, bytes] = EMPTY_DICT, include_field_ids: bool = True, file_format: FileFormat = FileFormat.PARQUET, ) -> pa.schema: @@ -696,10 +692,10 @@ def schema_to_pyarrow( class _ConvertToArrowSchema(SchemaVisitorPerPrimitiveType[pa.DataType]): - _metadata: Dict[bytes, bytes] + _metadata: dict[bytes, bytes] def __init__( - self, metadata: Dict[bytes, bytes] = EMPTY_DICT, include_field_ids: bool = True, file_format: FileFormat | None = None + self, metadata: dict[bytes, bytes] = EMPTY_DICT, include_field_ids: bool = True, file_format: FileFormat | None = None ) -> None: self._metadata = metadata self._include_field_ids = include_field_ids @@ -708,7 +704,7 @@ def __init__( def schema(self, _: Schema, struct_result: pa.StructType) -> pa.schema: return pa.schema(list(struct_result), metadata=self._metadata) - def struct(self, _: StructType, field_results: List[pa.DataType]) -> pa.DataType: + def struct(self, _: StructType, 
field_results: builtins.list[pa.DataType]) -> pa.DataType: return pa.struct(field_results) def field(self, field: NestedField, field_result: pa.DataType) -> pa.Field: @@ -722,6 +718,8 @@ def field(self, field: NestedField, field_result: pa.DataType) -> pa.Field: else: # Default to Parquet for backward compatibility metadata[PYARROW_PARQUET_FIELD_ID_KEY] = str(field.field_id) + if self._file_format == FileFormat.ORC: + metadata[ORC_FIELD_REQUIRED_KEY] = str(field.required).lower() return pa.field( name=field.name, @@ -820,7 +818,7 @@ class _ConvertToArrowExpression(BoundBooleanExpressionVisitor[pc.Expression]): def __init__(self, schema: Schema | None = None): self._schema = schema - def _get_field_name(self, term: BoundTerm[Any]) -> str | Tuple[str, ...]: + def _get_field_name(self, term: BoundTerm) -> str | tuple[str, ...]: """Get the field name or nested field path for a bound term. For nested struct fields, returns a tuple of field names (e.g., ("mazeMetadata", "run_id")). @@ -840,50 +838,50 @@ def _get_field_name(self, term: BoundTerm[Any]) -> str | Tuple[str, ...]: # Fallback to just the field name if schema is not available return term.ref().field.name - def visit_in(self, term: BoundTerm[Any], literals: Set[Any]) -> pc.Expression: + def visit_in(self, term: BoundTerm, literals: set[Any]) -> pc.Expression: pyarrow_literals = pa.array(literals, type=schema_to_pyarrow(term.ref().field.field_type)) return pc.field(self._get_field_name(term)).isin(pyarrow_literals) - def visit_not_in(self, term: BoundTerm[Any], literals: Set[Any]) -> pc.Expression: + def visit_not_in(self, term: BoundTerm, literals: set[Any]) -> pc.Expression: pyarrow_literals = pa.array(literals, type=schema_to_pyarrow(term.ref().field.field_type)) return ~pc.field(self._get_field_name(term)).isin(pyarrow_literals) - def visit_is_nan(self, term: BoundTerm[Any]) -> pc.Expression: + def visit_is_nan(self, term: BoundTerm) -> pc.Expression: ref = pc.field(self._get_field_name(term)) return 
pc.is_nan(ref) - def visit_not_nan(self, term: BoundTerm[Any]) -> pc.Expression: + def visit_not_nan(self, term: BoundTerm) -> pc.Expression: ref = pc.field(self._get_field_name(term)) return ~pc.is_nan(ref) - def visit_is_null(self, term: BoundTerm[Any]) -> pc.Expression: + def visit_is_null(self, term: BoundTerm) -> pc.Expression: return pc.field(self._get_field_name(term)).is_null(nan_is_null=False) - def visit_not_null(self, term: BoundTerm[Any]) -> pc.Expression: + def visit_not_null(self, term: BoundTerm) -> pc.Expression: return pc.field(self._get_field_name(term)).is_valid() - def visit_equal(self, term: BoundTerm[Any], literal: Literal[Any]) -> pc.Expression: + def visit_equal(self, term: BoundTerm, literal: Literal[Any]) -> pc.Expression: return pc.field(self._get_field_name(term)) == _convert_scalar(literal.value, term.ref().field.field_type) - def visit_not_equal(self, term: BoundTerm[Any], literal: Literal[Any]) -> pc.Expression: + def visit_not_equal(self, term: BoundTerm, literal: Literal[Any]) -> pc.Expression: return pc.field(self._get_field_name(term)) != _convert_scalar(literal.value, term.ref().field.field_type) - def visit_greater_than_or_equal(self, term: BoundTerm[Any], literal: Literal[Any]) -> pc.Expression: + def visit_greater_than_or_equal(self, term: BoundTerm, literal: Literal[Any]) -> pc.Expression: return pc.field(self._get_field_name(term)) >= _convert_scalar(literal.value, term.ref().field.field_type) - def visit_greater_than(self, term: BoundTerm[Any], literal: Literal[Any]) -> pc.Expression: + def visit_greater_than(self, term: BoundTerm, literal: Literal[Any]) -> pc.Expression: return pc.field(self._get_field_name(term)) > _convert_scalar(literal.value, term.ref().field.field_type) - def visit_less_than(self, term: BoundTerm[Any], literal: Literal[Any]) -> pc.Expression: + def visit_less_than(self, term: BoundTerm, literal: Literal[Any]) -> pc.Expression: return pc.field(self._get_field_name(term)) < 
_convert_scalar(literal.value, term.ref().field.field_type) - def visit_less_than_or_equal(self, term: BoundTerm[Any], literal: Literal[Any]) -> pc.Expression: + def visit_less_than_or_equal(self, term: BoundTerm, literal: Literal[Any]) -> pc.Expression: return pc.field(self._get_field_name(term)) <= _convert_scalar(literal.value, term.ref().field.field_type) - def visit_starts_with(self, term: BoundTerm[Any], literal: Literal[Any]) -> pc.Expression: + def visit_starts_with(self, term: BoundTerm, literal: Literal[Any]) -> pc.Expression: return pc.starts_with(pc.field(self._get_field_name(term)), literal.value) - def visit_not_starts_with(self, term: BoundTerm[Any], literal: Literal[Any]) -> pc.Expression: + def visit_not_starts_with(self, term: BoundTerm, literal: Literal[Any]) -> pc.Expression: return ~pc.starts_with(pc.field(self._get_field_name(term)), literal.value) def visit_true(self) -> pc.Expression: @@ -904,13 +902,13 @@ def visit_or(self, left_result: pc.Expression, right_result: pc.Expression) -> p class _NullNaNUnmentionedTermsCollector(BoundBooleanExpressionVisitor[None]): # BoundTerms which have either is_null or is_not_null appearing at least once in the boolean expr. - is_null_or_not_bound_terms: set[BoundTerm[Any]] + is_null_or_not_bound_terms: set[BoundTerm] # The remaining BoundTerms appearing in the boolean expr. - null_unmentioned_bound_terms: set[BoundTerm[Any]] + null_unmentioned_bound_terms: set[BoundTerm] # BoundTerms which have either is_nan or is_not_nan appearing at least once in the boolean expr. - is_nan_or_not_bound_terms: set[BoundTerm[Any]] + is_nan_or_not_bound_terms: set[BoundTerm] # The remaining BoundTerms appearing in the boolean expr. 
- nan_unmentioned_bound_terms: set[BoundTerm[Any]] + nan_unmentioned_bound_terms: set[BoundTerm] def __init__(self) -> None: super().__init__() @@ -919,81 +917,81 @@ def __init__(self) -> None: self.is_nan_or_not_bound_terms = set() self.nan_unmentioned_bound_terms = set() - def _handle_explicit_is_null_or_not(self, term: BoundTerm[Any]) -> None: + def _handle_explicit_is_null_or_not(self, term: BoundTerm) -> None: """Handle the predicate case where either is_null or is_not_null is included.""" if term in self.null_unmentioned_bound_terms: self.null_unmentioned_bound_terms.remove(term) self.is_null_or_not_bound_terms.add(term) - def _handle_null_unmentioned(self, term: BoundTerm[Any]) -> None: + def _handle_null_unmentioned(self, term: BoundTerm) -> None: """Handle the predicate case where neither is_null or is_not_null is included.""" if term not in self.is_null_or_not_bound_terms: self.null_unmentioned_bound_terms.add(term) - def _handle_explicit_is_nan_or_not(self, term: BoundTerm[Any]) -> None: + def _handle_explicit_is_nan_or_not(self, term: BoundTerm) -> None: """Handle the predicate case where either is_nan or is_not_nan is included.""" if term in self.nan_unmentioned_bound_terms: self.nan_unmentioned_bound_terms.remove(term) self.is_nan_or_not_bound_terms.add(term) - def _handle_nan_unmentioned(self, term: BoundTerm[Any]) -> None: + def _handle_nan_unmentioned(self, term: BoundTerm) -> None: """Handle the predicate case where neither is_nan or is_not_nan is included.""" if term not in self.is_nan_or_not_bound_terms: self.nan_unmentioned_bound_terms.add(term) - def visit_in(self, term: BoundTerm[Any], literals: Set[Any]) -> None: + def visit_in(self, term: BoundTerm, literals: set[Any]) -> None: self._handle_null_unmentioned(term) self._handle_nan_unmentioned(term) - def visit_not_in(self, term: BoundTerm[Any], literals: Set[Any]) -> None: + def visit_not_in(self, term: BoundTerm, literals: set[Any]) -> None: self._handle_null_unmentioned(term) 
self._handle_nan_unmentioned(term) - def visit_is_nan(self, term: BoundTerm[Any]) -> None: + def visit_is_nan(self, term: BoundTerm) -> None: self._handle_null_unmentioned(term) self._handle_explicit_is_nan_or_not(term) - def visit_not_nan(self, term: BoundTerm[Any]) -> None: + def visit_not_nan(self, term: BoundTerm) -> None: self._handle_null_unmentioned(term) self._handle_explicit_is_nan_or_not(term) - def visit_is_null(self, term: BoundTerm[Any]) -> None: + def visit_is_null(self, term: BoundTerm) -> None: self._handle_explicit_is_null_or_not(term) self._handle_nan_unmentioned(term) - def visit_not_null(self, term: BoundTerm[Any]) -> None: + def visit_not_null(self, term: BoundTerm) -> None: self._handle_explicit_is_null_or_not(term) self._handle_nan_unmentioned(term) - def visit_equal(self, term: BoundTerm[Any], literal: Literal[Any]) -> None: + def visit_equal(self, term: BoundTerm, literal: Literal[Any]) -> None: self._handle_null_unmentioned(term) self._handle_nan_unmentioned(term) - def visit_not_equal(self, term: BoundTerm[Any], literal: Literal[Any]) -> None: + def visit_not_equal(self, term: BoundTerm, literal: Literal[Any]) -> None: self._handle_null_unmentioned(term) self._handle_nan_unmentioned(term) - def visit_greater_than_or_equal(self, term: BoundTerm[Any], literal: Literal[Any]) -> None: + def visit_greater_than_or_equal(self, term: BoundTerm, literal: Literal[Any]) -> None: self._handle_null_unmentioned(term) self._handle_nan_unmentioned(term) - def visit_greater_than(self, term: BoundTerm[Any], literal: Literal[Any]) -> None: + def visit_greater_than(self, term: BoundTerm, literal: Literal[Any]) -> None: self._handle_null_unmentioned(term) self._handle_nan_unmentioned(term) - def visit_less_than(self, term: BoundTerm[Any], literal: Literal[Any]) -> None: + def visit_less_than(self, term: BoundTerm, literal: Literal[Any]) -> None: self._handle_null_unmentioned(term) self._handle_nan_unmentioned(term) - def visit_less_than_or_equal(self, term: 
BoundTerm[Any], literal: Literal[Any]) -> None: + def visit_less_than_or_equal(self, term: BoundTerm, literal: Literal[Any]) -> None: self._handle_null_unmentioned(term) self._handle_nan_unmentioned(term) - def visit_starts_with(self, term: BoundTerm[Any], literal: Literal[Any]) -> None: + def visit_starts_with(self, term: BoundTerm, literal: Literal[Any]) -> None: self._handle_null_unmentioned(term) self._handle_nan_unmentioned(term) - def visit_not_starts_with(self, term: BoundTerm[Any], literal: Literal[Any]) -> None: + def visit_not_starts_with(self, term: BoundTerm, literal: Literal[Any]) -> None: self._handle_null_unmentioned(term) self._handle_nan_unmentioned(term) @@ -1043,10 +1041,10 @@ def _expression_to_complementary_pyarrow(expr: BooleanExpression, schema: Schema collector.collect(expr) # Convert the set of terms to a sorted list so that layout of the expression to build is deterministic. - null_unmentioned_bound_terms: List[BoundTerm[Any]] = sorted( + null_unmentioned_bound_terms: list[BoundTerm] = sorted( collector.null_unmentioned_bound_terms, key=lambda term: term.ref().field.name ) - nan_unmentioned_bound_terms: List[BoundTerm[Any]] = sorted( + nan_unmentioned_bound_terms: list[BoundTerm] = sorted( collector.nan_unmentioned_bound_terms, key=lambda term: term.ref().field.name ) @@ -1059,7 +1057,7 @@ def _expression_to_complementary_pyarrow(expr: BooleanExpression, schema: Schema @lru_cache -def _get_file_format(file_format: FileFormat, **kwargs: Dict[str, Any]) -> ds.FileFormat: +def _get_file_format(file_format: FileFormat, **kwargs: dict[str, Any]) -> ds.FileFormat: if file_format == FileFormat.PARQUET: return ds.ParquetFileFormat(**kwargs) elif file_format == FileFormat.ORC: @@ -1070,7 +1068,7 @@ def _get_file_format(file_format: FileFormat, **kwargs: Dict[str, Any]) -> ds.Fi raise ValueError(f"Unsupported file format: {file_format}") -def _read_deletes(io: FileIO, data_file: DataFile) -> Dict[str, pa.ChunkedArray]: +def _read_deletes(io: FileIO, 
data_file: DataFile) -> dict[str, pa.ChunkedArray]: if data_file.file_format == FileFormat.PARQUET: with io.new_input(data_file.file_path).open() as fi: delete_fragment = _get_file_format( @@ -1100,7 +1098,7 @@ def _read_deletes(io: FileIO, data_file: DataFile) -> Dict[str, pa.ChunkedArray] raise ValueError(f"Delete file format not supported: {data_file.file_format}") -def _combine_positional_deletes(positional_deletes: List[pa.ChunkedArray], start_index: int, end_index: int) -> pa.Array: +def _combine_positional_deletes(positional_deletes: list[pa.ChunkedArray], start_index: int, end_index: int) -> pa.Array: if len(positional_deletes) == 1: all_chunks = positional_deletes[0] else: @@ -1271,7 +1269,7 @@ def schema(self, schema: pa.Schema, struct_result: T) -> T: """Visit a schema.""" @abstractmethod - def struct(self, struct: pa.StructType, field_results: List[T]) -> T: + def struct(self, struct: pa.StructType, field_results: builtins.list[T]) -> T: """Visit a struct.""" @abstractmethod @@ -1309,7 +1307,7 @@ class _HasIds(PyArrowSchemaVisitor[bool]): def schema(self, schema: pa.Schema, struct_result: bool) -> bool: return struct_result - def struct(self, struct: pa.StructType, field_results: List[bool]) -> bool: + def struct(self, struct: pa.StructType, field_results: builtins.list[bool]) -> bool: return all(field_results) def field(self, field: pa.Field, field_result: bool) -> bool: @@ -1334,7 +1332,7 @@ def primitive(self, primitive: pa.DataType) -> bool: class _ConvertToIceberg(PyArrowSchemaVisitor[IcebergType | Schema]): """Converts PyArrowSchema to Iceberg Schema. 
Applies the IDs from name_mapping if provided.""" - _field_names: List[str] + _field_names: builtins.list[str] def __init__( self, downcast_ns_timestamp_to_us: bool = False, format_version: TableVersion = TableProperties.DEFAULT_FORMAT_VERSION @@ -1352,7 +1350,7 @@ def _field_id(self, field: pa.Field) -> int: def schema(self, schema: pa.Schema, struct_result: StructType) -> Schema: return Schema(*struct_result.fields) - def struct(self, struct: pa.StructType, field_results: List[NestedField]) -> StructType: + def struct(self, struct: pa.StructType, field_results: builtins.list[NestedField]) -> StructType: return StructType(*field_results) def field(self, field: pa.Field, field_result: IcebergType) -> NestedField: @@ -1472,7 +1470,7 @@ class _ConvertToLargeTypes(PyArrowSchemaVisitor[IcebergType | pa.Schema]): def schema(self, schema: pa.Schema, struct_result: pa.StructType) -> pa.Schema: return pa.schema(struct_result) - def struct(self, struct: pa.StructType, field_results: List[pa.Field]) -> pa.StructType: + def struct(self, struct: pa.StructType, field_results: builtins.list[pa.Field]) -> pa.StructType: return pa.struct(field_results) def field(self, field: pa.Field, field_result: pa.DataType) -> pa.Field: @@ -1496,7 +1494,7 @@ class _ConvertToSmallTypes(PyArrowSchemaVisitor[IcebergType | pa.Schema]): def schema(self, schema: pa.Schema, struct_result: pa.StructType) -> pa.Schema: return pa.schema(struct_result) - def struct(self, struct: pa.StructType, field_results: List[pa.Field]) -> pa.StructType: + def struct(self, struct: pa.StructType, field_results: builtins.list[pa.Field]) -> pa.StructType: return pa.struct(field_results) def field(self, field: pa.Field, field_result: pa.DataType) -> pa.Field: @@ -1536,8 +1534,8 @@ def _get_column_projection_values( projected_schema: Schema, table_schema: Schema, partition_spec: PartitionSpec | None, - file_project_field_ids: Set[int], -) -> Dict[int, Any]: + file_project_field_ids: set[int], +) -> dict[int, Any]: 
"""Apply Column Projection rules to File Schema.""" project_schema_diff = projected_schema.field_ids.difference(file_project_field_ids) if len(project_schema_diff) == 0 or partition_spec is None: @@ -1562,8 +1560,8 @@ def _task_to_record_batches( bound_row_filter: BooleanExpression, projected_schema: Schema, table_schema: Schema, - projected_field_ids: Set[int], - positional_deletes: List[ChunkedArray] | None, + projected_field_ids: set[int], + positional_deletes: list[ChunkedArray] | None, case_sensitive: bool, name_mapping: NameMapping | None = None, partition_spec: PartitionSpec | None = None, @@ -1644,12 +1642,12 @@ def _task_to_record_batches( ) -def _read_all_delete_files(io: FileIO, tasks: Iterable[FileScanTask]) -> Dict[str, List[ChunkedArray]]: - deletes_per_file: Dict[str, List[ChunkedArray]] = {} +def _read_all_delete_files(io: FileIO, tasks: Iterable[FileScanTask]) -> dict[str, list[ChunkedArray]]: + deletes_per_file: dict[str, list[ChunkedArray]] = {} unique_deletes = set(itertools.chain.from_iterable([task.delete_files for task in tasks])) if len(unique_deletes) > 0: executor = ExecutorFactory.get_or_create() - deletes_per_files: Iterator[Dict[str, ChunkedArray]] = executor.map( + deletes_per_files: Iterator[dict[str, ChunkedArray]] = executor.map( lambda args: _read_deletes(*args), [(io, delete_file) for delete_file in unique_deletes], ) @@ -1700,7 +1698,7 @@ def __init__( self._downcast_ns_timestamp_to_us = Config().get_bool(DOWNCAST_NS_TIMESTAMP_TO_US_ON_WRITE) @property - def _projected_field_ids(self) -> Set[int]: + def _projected_field_ids(self) -> set[int]: """Set of field IDs that should be projected from the data files.""" return { id @@ -1773,7 +1771,7 @@ def to_record_batches(self, tasks: Iterable[FileScanTask]) -> Iterator[pa.Record total_row_count = 0 executor = ExecutorFactory.get_or_create() - def batches_for_task(task: FileScanTask) -> List[pa.RecordBatch]: + def batches_for_task(task: FileScanTask) -> list[pa.RecordBatch]: # 
Materialize the iterator here to ensure execution happens within the executor. # Otherwise, the iterator would be lazily consumed later (in the main thread), # defeating the purpose of using executor.map. @@ -1797,7 +1795,7 @@ def batches_for_task(task: FileScanTask) -> List[pa.RecordBatch]: break def _record_batches_from_scan_tasks_and_deletes( - self, tasks: Iterable[FileScanTask], deletes_per_file: Dict[str, List[ChunkedArray]] + self, tasks: Iterable[FileScanTask], deletes_per_file: dict[str, list[ChunkedArray]] ) -> Iterator[pa.RecordBatch]: total_row_count = 0 for task in tasks: @@ -1833,7 +1831,7 @@ def _to_requested_schema( batch: pa.RecordBatch, downcast_ns_timestamp_to_us: bool = False, include_field_ids: bool = False, - projected_missing_fields: Dict[int, Any] = EMPTY_DICT, + projected_missing_fields: dict[int, Any] = EMPTY_DICT, ) -> pa.RecordBatch: # We could reuse some of these visitors struct_array = visit_with_partner( @@ -1852,7 +1850,7 @@ class ArrowProjectionVisitor(SchemaWithPartnerVisitor[pa.Array, pa.Array | None] _include_field_ids: bool _downcast_ns_timestamp_to_us: bool _use_large_types: bool | None - _projected_missing_fields: Dict[int, Any] + _projected_missing_fields: dict[int, Any] def __init__( self, @@ -1860,7 +1858,7 @@ def __init__( downcast_ns_timestamp_to_us: bool = False, include_field_ids: bool = False, use_large_types: bool | None = None, - projected_missing_fields: Dict[int, Any] = EMPTY_DICT, + projected_missing_fields: dict[int, Any] = EMPTY_DICT, ) -> None: self._file_schema = file_schema self._include_field_ids = include_field_ids @@ -1935,11 +1933,13 @@ def _construct_field(self, field: NestedField, arrow_type: pa.DataType) -> pa.Fi def schema(self, schema: Schema, schema_partner: pa.Array | None, struct_result: pa.Array | None) -> pa.Array | None: return struct_result - def struct(self, struct: StructType, struct_array: pa.Array | None, field_results: List[pa.Array | None]) -> pa.Array | None: + def struct( + self, 
struct: StructType, struct_array: pa.Array | None, field_results: builtins.list[pa.Array | None] + ) -> pa.Array | None: if struct_array is None: return None - field_arrays: List[pa.Array] = [] - fields: List[pa.Field] = [] + field_arrays: list[pa.Array] = [] + fields: list[pa.Field] = [] for field, field_array in zip(struct.fields, field_results, strict=True): if field_array is not None: array = self._cast_if_needed(field, field_array) @@ -2048,7 +2048,7 @@ class PrimitiveToPhysicalType(SchemaVisitorPerPrimitiveType[str]): def schema(self, schema: Schema, struct_result: str) -> str: raise ValueError(f"Expected primitive-type, got: {schema}") - def struct(self, struct: StructType, field_results: List[str]) -> str: + def struct(self, struct: StructType, field_results: builtins.list[str]) -> str: raise ValueError(f"Expected primitive-type, got: {struct}") def field(self, field: NestedField, field_result: str) -> str: @@ -2236,13 +2236,13 @@ class StatisticsCollector: column_name: str -class PyArrowStatisticsCollector(PreOrderSchemaVisitor[List[StatisticsCollector]]): +class PyArrowStatisticsCollector(PreOrderSchemaVisitor[list[StatisticsCollector]]): _field_id: int = 0 _schema: Schema - _properties: Dict[str, str] + _properties: dict[str, str] _default_mode: str - def __init__(self, schema: Schema, properties: Dict[str, str]): + def __init__(self, schema: Schema, properties: dict[str, str]): from pyiceberg.table import TableProperties self._schema = schema @@ -2251,35 +2251,41 @@ def __init__(self, schema: Schema, properties: Dict[str, str]): TableProperties.DEFAULT_WRITE_METRICS_MODE, TableProperties.DEFAULT_WRITE_METRICS_MODE_DEFAULT ) - def schema(self, schema: Schema, struct_result: Callable[[], List[StatisticsCollector]]) -> List[StatisticsCollector]: + def schema( + self, schema: Schema, struct_result: Callable[[], builtins.list[StatisticsCollector]] + ) -> builtins.list[StatisticsCollector]: return struct_result() def struct( - self, struct: StructType, 
field_results: List[Callable[[], List[StatisticsCollector]]] - ) -> List[StatisticsCollector]: + self, struct: StructType, field_results: builtins.list[Callable[[], builtins.list[StatisticsCollector]]] + ) -> builtins.list[StatisticsCollector]: return list(itertools.chain(*[result() for result in field_results])) - def field(self, field: NestedField, field_result: Callable[[], List[StatisticsCollector]]) -> List[StatisticsCollector]: + def field( + self, field: NestedField, field_result: Callable[[], builtins.list[StatisticsCollector]] + ) -> builtins.list[StatisticsCollector]: self._field_id = field.field_id return field_result() - def list(self, list_type: ListType, element_result: Callable[[], List[StatisticsCollector]]) -> List[StatisticsCollector]: + def list( + self, list_type: ListType, element_result: Callable[[], builtins.list[StatisticsCollector]] + ) -> builtins.list[StatisticsCollector]: self._field_id = list_type.element_id return element_result() def map( self, map_type: MapType, - key_result: Callable[[], List[StatisticsCollector]], - value_result: Callable[[], List[StatisticsCollector]], - ) -> List[StatisticsCollector]: + key_result: Callable[[], builtins.list[StatisticsCollector]], + value_result: Callable[[], builtins.list[StatisticsCollector]], + ) -> builtins.list[StatisticsCollector]: self._field_id = map_type.key_id k = key_result() self._field_id = map_type.value_id v = value_result() return k + v - def primitive(self, primitive: PrimitiveType) -> List[StatisticsCollector]: + def primitive(self, primitive: PrimitiveType) -> builtins.list[StatisticsCollector]: from pyiceberg.table import TableProperties column_name = self._schema.find_column_name(self._field_id) @@ -2308,8 +2314,8 @@ def primitive(self, primitive: PrimitiveType) -> List[StatisticsCollector]: def compute_statistics_plan( schema: Schema, - table_properties: Dict[str, str], -) -> Dict[int, StatisticsCollector]: + table_properties: dict[str, str], +) -> dict[int, 
StatisticsCollector]: """ Compute the statistics plan for all columns. @@ -2325,7 +2331,7 @@ def compute_statistics_plan( used to set the mode for column metrics collection """ stats_cols = pre_order_visit(schema, PyArrowStatisticsCollector(schema, table_properties)) - result: Dict[int, StatisticsCollector] = {} + result: dict[int, StatisticsCollector] = {} for stats_col in stats_cols: result[stats_col.field_id] = stats_col return result @@ -2337,27 +2343,33 @@ class ID2ParquetPath: parquet_path: str -class ID2ParquetPathVisitor(PreOrderSchemaVisitor[List[ID2ParquetPath]]): +class ID2ParquetPathVisitor(PreOrderSchemaVisitor[list[ID2ParquetPath]]): _field_id: int = 0 - _path: List[str] + _path: builtins.list[str] def __init__(self) -> None: self._path = [] - def schema(self, schema: Schema, struct_result: Callable[[], List[ID2ParquetPath]]) -> List[ID2ParquetPath]: + def schema(self, schema: Schema, struct_result: Callable[[], builtins.list[ID2ParquetPath]]) -> builtins.list[ID2ParquetPath]: return struct_result() - def struct(self, struct: StructType, field_results: List[Callable[[], List[ID2ParquetPath]]]) -> List[ID2ParquetPath]: + def struct( + self, struct: StructType, field_results: builtins.list[Callable[[], builtins.list[ID2ParquetPath]]] + ) -> builtins.list[ID2ParquetPath]: return list(itertools.chain(*[result() for result in field_results])) - def field(self, field: NestedField, field_result: Callable[[], List[ID2ParquetPath]]) -> List[ID2ParquetPath]: + def field( + self, field: NestedField, field_result: Callable[[], builtins.list[ID2ParquetPath]] + ) -> builtins.list[ID2ParquetPath]: self._field_id = field.field_id self._path.append(field.name) result = field_result() self._path.pop() return result - def list(self, list_type: ListType, element_result: Callable[[], List[ID2ParquetPath]]) -> List[ID2ParquetPath]: + def list( + self, list_type: ListType, element_result: Callable[[], builtins.list[ID2ParquetPath]] + ) -> builtins.list[ID2ParquetPath]: 
self._field_id = list_type.element_id self._path.append("list.element") result = element_result() @@ -2367,9 +2379,9 @@ def list(self, list_type: ListType, element_result: Callable[[], List[ID2Parquet def map( self, map_type: MapType, - key_result: Callable[[], List[ID2ParquetPath]], - value_result: Callable[[], List[ID2ParquetPath]], - ) -> List[ID2ParquetPath]: + key_result: Callable[[], builtins.list[ID2ParquetPath]], + value_result: Callable[[], builtins.list[ID2ParquetPath]], + ) -> builtins.list[ID2ParquetPath]: self._field_id = map_type.key_id self._path.append("key_value.key") k = key_result() @@ -2380,13 +2392,13 @@ def map( self._path.pop() return k + v - def primitive(self, primitive: PrimitiveType) -> List[ID2ParquetPath]: + def primitive(self, primitive: PrimitiveType) -> builtins.list[ID2ParquetPath]: return [ID2ParquetPath(field_id=self._field_id, parquet_path=".".join(self._path))] def parquet_path_to_id_mapping( schema: Schema, -) -> Dict[str, int]: +) -> dict[str, int]: """ Compute the mapping of parquet column path to Iceberg ID. @@ -2397,7 +2409,7 @@ def parquet_path_to_id_mapping( Args: schema (pyiceberg.schema.Schema): The current table schema. 
""" - result: Dict[str, int] = {} + result: dict[str, int] = {} for pair in pre_order_visit(schema, ID2ParquetPathVisitor()): result[pair.parquet_path] = pair.field_id return result @@ -2406,12 +2418,12 @@ def parquet_path_to_id_mapping( @dataclass(frozen=True) class DataFileStatistics: record_count: int - column_sizes: Dict[int, int] - value_counts: Dict[int, int] - null_value_counts: Dict[int, int] - nan_value_counts: Dict[int, int] - column_aggregates: Dict[int, StatsAggregator] - split_offsets: List[int] + column_sizes: dict[int, int] + value_counts: dict[int, int] + null_value_counts: dict[int, int] + nan_value_counts: dict[int, int] + column_aggregates: dict[int, StatsAggregator] + split_offsets: list[int] def _partition_value(self, partition_field: PartitionField, schema: Schema) -> Any: if partition_field.source_id not in self.column_aggregates: @@ -2451,7 +2463,7 @@ def _partition_value(self, partition_field: PartitionField, schema: Schema) -> A def partition(self, partition_spec: PartitionSpec, schema: Schema) -> Record: return Record(*[self._partition_value(field, schema) for field in partition_spec.fields]) - def to_serialized_dict(self) -> Dict[str, Any]: + def to_serialized_dict(self) -> dict[str, Any]: lower_bounds = {} upper_bounds = {} @@ -2476,8 +2488,8 @@ def to_serialized_dict(self) -> Dict[str, Any]: def data_file_statistics_from_parquet_metadata( parquet_metadata: pq.FileMetaData, - stats_columns: Dict[int, StatisticsCollector], - parquet_column_mapping: Dict[str, int], + stats_columns: dict[int, StatisticsCollector], + parquet_column_mapping: dict[str, int], ) -> DataFileStatistics: """ Compute and return DataFileStatistics that includes the following. 
@@ -2496,16 +2508,16 @@ def data_file_statistics_from_parquet_metadata( set the mode for column metrics collection parquet_column_mapping (Dict[str, int]): The mapping of the parquet file name to the field ID """ - column_sizes: Dict[int, int] = {} - value_counts: Dict[int, int] = {} - split_offsets: List[int] = [] + column_sizes: dict[int, int] = {} + value_counts: dict[int, int] = {} + split_offsets: list[int] = [] - null_value_counts: Dict[int, int] = {} - nan_value_counts: Dict[int, int] = {} + null_value_counts: dict[int, int] = {} + nan_value_counts: dict[int, int] = {} col_aggs = {} - invalidate_col: Set[int] = set() + invalidate_col: set[int] = set() for r in range(parquet_metadata.num_row_groups): # References: # https://github.com/apache/iceberg/blob/fc381a81a1fdb8f51a0637ca27cd30673bd7aad3/parquet/src/main/java/org/apache/iceberg/parquet/ParquetUtil.java#L232 @@ -2665,7 +2677,7 @@ def write_parquet(task: WriteTask) -> DataFile: return iter(data_files) -def bin_pack_arrow_table(tbl: pa.Table, target_file_size: int) -> Iterator[List[pa.RecordBatch]]: +def bin_pack_arrow_table(tbl: pa.Table, target_file_size: int) -> Iterator[list[pa.RecordBatch]]: from pyiceberg.utils.bin_packing import PackingIterator avg_row_size_bytes = tbl.nbytes / tbl.num_rows @@ -2759,7 +2771,7 @@ def parquet_file_to_data_file(io: FileIO, table_metadata: TableMetadata, file_pa PYARROW_UNCOMPRESSED_CODEC = "none" -def _get_parquet_writer_kwargs(table_properties: Properties) -> Dict[str, Any]: +def _get_parquet_writer_kwargs(table_properties: Properties) -> dict[str, Any]: from pyiceberg.table import TableProperties for key_pattern in [ @@ -2768,7 +2780,7 @@ def _get_parquet_writer_kwargs(table_properties: Properties) -> Dict[str, Any]: f"{TableProperties.PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX}.*", ]: if unsupported_keys := fnmatch.filter(table_properties, key_pattern): - warnings.warn(f"Parquet writer option(s) {unsupported_keys} not implemented") + warnings.warn(f"Parquet writer 
option(s) {unsupported_keys} not implemented", stacklevel=2) compression_codec = table_properties.get(TableProperties.PARQUET_COMPRESSION, TableProperties.PARQUET_COMPRESSION_DEFAULT) compression_level = property_as_int( diff --git a/pyiceberg/manifest.py b/pyiceberg/manifest.py index 3c211f5e94..ca2883884f 100644 --- a/pyiceberg/manifest.py +++ b/pyiceberg/manifest.py @@ -19,17 +19,13 @@ import math import threading from abc import ABC, abstractmethod +from collections.abc import Iterator from copy import copy from enum import Enum from types import TracebackType from typing import ( Any, - Dict, - Iterator, - List, Literal, - Tuple, - Type, ) from cachetools import LRUCache, cached @@ -111,7 +107,7 @@ def __repr__(self) -> str: return f"FileFormat.{self.name}" -DATA_FILE_TYPE: Dict[int, StructType] = { +DATA_FILE_TYPE: dict[int, StructType] = { 1: StructType( NestedField(field_id=100, name="file_path", field_type=StringType(), required=True, doc="Location URI with FS scheme"), NestedField( @@ -475,27 +471,27 @@ def file_size_in_bytes(self) -> int: return self._data[5] @property - def column_sizes(self) -> Dict[int, int]: + def column_sizes(self) -> dict[int, int]: return self._data[6] @property - def value_counts(self) -> Dict[int, int]: + def value_counts(self) -> dict[int, int]: return self._data[7] @property - def null_value_counts(self) -> Dict[int, int]: + def null_value_counts(self) -> dict[int, int]: return self._data[8] @property - def nan_value_counts(self) -> Dict[int, int]: + def nan_value_counts(self) -> dict[int, int]: return self._data[9] @property - def lower_bounds(self) -> Dict[int, bytes]: + def lower_bounds(self) -> dict[int, bytes]: return self._data[10] @property - def upper_bounds(self) -> Dict[int, bytes]: + def upper_bounds(self) -> dict[int, bytes]: return self._data[11] @property @@ -503,11 +499,11 @@ def key_metadata(self) -> bytes | None: return self._data[12] @property - def split_offsets(self) -> List[int] | None: + def 
split_offsets(self) -> list[int] | None: return self._data[13] @property - def equality_ids(self) -> List[int] | None: + def equality_ids(self) -> list[int] | None: return self._data[14] @property @@ -690,7 +686,7 @@ def update(self, value: Any) -> None: self._min = min(self._min, value) -def construct_partition_summaries(spec: PartitionSpec, schema: Schema, partitions: List[Record]) -> List[PartitionFieldSummary]: +def construct_partition_summaries(spec: PartitionSpec, schema: Schema, partitions: list[Record]) -> list[PartitionFieldSummary]: types = [field.field_type for field in spec.partition_type(schema).fields] field_stats = [PartitionFieldStats(field_type) for field_type in types] for partition_keys in partitions: @@ -702,7 +698,7 @@ def construct_partition_summaries(spec: PartitionSpec, schema: Schema, partition return [field.to_summary() for field in field_stats] -MANIFEST_LIST_FILE_SCHEMAS: Dict[int, Schema] = { +MANIFEST_LIST_FILE_SCHEMAS: dict[int, Schema] = { 1: Schema( NestedField(500, "manifest_path", StringType(), required=True, doc="Location URI with FS scheme"), NestedField(501, "manifest_length", LongType(), required=True), @@ -828,7 +824,7 @@ def deleted_rows_count(self) -> int | None: return self._data[12] @property - def partitions(self) -> List[PartitionFieldSummary] | None: + def partitions(self) -> list[PartitionFieldSummary] | None: return self._data[13] @property @@ -841,7 +837,7 @@ def has_added_files(self) -> bool: def has_existing_files(self) -> bool: return self.existing_files_count is None or self.existing_files_count > 0 - def fetch_manifest_entry(self, io: FileIO, discard_deleted: bool = True) -> List[ManifestEntry]: + def fetch_manifest_entry(self, io: FileIO, discard_deleted: bool = True) -> list[ManifestEntry]: """ Read the manifest entries from the manifest file. 
@@ -875,11 +871,11 @@ def __hash__(self) -> int: # Global cache for manifest lists -_manifest_cache: LRUCache[Any, Tuple[ManifestFile, ...]] = LRUCache(maxsize=128) +_manifest_cache: LRUCache[Any, tuple[ManifestFile, ...]] = LRUCache(maxsize=128) @cached(cache=_manifest_cache, key=lambda io, manifest_list: hashkey(manifest_list), lock=threading.RLock()) -def _manifests(io: FileIO, manifest_list: str) -> Tuple[ManifestFile, ...]: +def _manifests(io: FileIO, manifest_list: str) -> tuple[ManifestFile, ...]: """Read and cache manifests from the given manifest list, returning a tuple to prevent modification.""" file = io.new_input(manifest_list) return tuple(read_manifest_list(file)) @@ -957,7 +953,7 @@ class ManifestWriter(ABC): _deleted_files: int _deleted_rows: int _min_sequence_number: int | None - _partitions: List[Record] + _partitions: list[Record] _compression: AvroCompressionCodec def __init__( @@ -992,7 +988,7 @@ def __enter__(self) -> ManifestWriter: def __exit__( self, - exc_type: Type[BaseException] | None, + exc_type: type[BaseException] | None, exc_value: BaseException | None, traceback: TracebackType | None, ) -> None: @@ -1012,7 +1008,7 @@ def content(self) -> ManifestContent: ... def version(self) -> TableVersion: ... 
@property - def _meta(self) -> Dict[str, str]: + def _meta(self) -> dict[str, str]: return { "schema": self._schema.model_dump_json(), "partition-spec": to_json(self._spec.fields).decode("utf-8"), @@ -1167,7 +1163,7 @@ def version(self) -> TableVersion: return 2 @property - def _meta(self) -> Dict[str, str]: + def _meta(self) -> dict[str, str]: return { **super()._meta, "content": "data", @@ -1201,12 +1197,12 @@ def write_manifest( class ManifestListWriter(ABC): _format_version: TableVersion _output_file: OutputFile - _meta: Dict[str, str] - _manifest_files: List[ManifestFile] + _meta: dict[str, str] + _manifest_files: list[ManifestFile] _commit_snapshot_id: int _writer: AvroOutputFile[ManifestFile] - def __init__(self, format_version: TableVersion, output_file: OutputFile, meta: Dict[str, Any]): + def __init__(self, format_version: TableVersion, output_file: OutputFile, meta: dict[str, Any]): self._format_version = format_version self._output_file = output_file self._meta = meta @@ -1226,7 +1222,7 @@ def __enter__(self) -> ManifestListWriter: def __exit__( self, - exc_type: Type[BaseException] | None, + exc_type: type[BaseException] | None, exc_value: BaseException | None, traceback: TracebackType | None, ) -> None: @@ -1237,7 +1233,7 @@ def __exit__( @abstractmethod def prepare_manifest(self, manifest_file: ManifestFile) -> ManifestFile: ... 
- def add_manifests(self, manifest_files: List[ManifestFile]) -> ManifestListWriter: + def add_manifests(self, manifest_files: list[ManifestFile]) -> ManifestListWriter: self._writer.write_block([self.prepare_manifest(manifest_file) for manifest_file in manifest_files]) return self diff --git a/pyiceberg/partitioning.py b/pyiceberg/partitioning.py index bf8e4081fe..8bf2b817d9 100644 --- a/pyiceberg/partitioning.py +++ b/pyiceberg/partitioning.py @@ -21,7 +21,7 @@ from dataclasses import dataclass from datetime import date, datetime, time from functools import cached_property, singledispatch -from typing import Annotated, Any, Dict, Generic, List, Set, Tuple, TypeVar +from typing import Annotated, Any, Generic, TypeVar from urllib.parse import quote_plus from pydantic import ( @@ -56,6 +56,7 @@ TimestampType, TimestamptzType, TimeType, + UnknownType, UUIDType, ) from pyiceberg.utils.datetime import date_to_days, datetime_to_micros, time_to_micros @@ -131,7 +132,7 @@ class PartitionSpec(IcebergBaseModel): """ spec_id: int = Field(alias="spec-id", default=INITIAL_PARTITION_SPEC_ID) - fields: Tuple[PartitionField, ...] = Field(default_factory=tuple) + fields: tuple[PartitionField, ...] 
= Field(default_factory=tuple) def __init__( self, @@ -181,15 +182,15 @@ def last_assigned_field_id(self) -> int: return PARTITION_FIELD_ID_START - 1 @cached_property - def source_id_to_fields_map(self) -> Dict[int, List[PartitionField]]: - source_id_to_fields_map: Dict[int, List[PartitionField]] = {} + def source_id_to_fields_map(self) -> dict[int, list[PartitionField]]: + source_id_to_fields_map: dict[int, list[PartitionField]] = {} for partition_field in self.fields: existing = source_id_to_fields_map.get(partition_field.source_id, []) existing.append(partition_field) source_id_to_fields_map[partition_field.source_id] = existing return source_id_to_fields_map - def fields_by_source_id(self, field_id: int) -> List[PartitionField]: + def fields_by_source_id(self, field_id: int) -> list[PartitionField]: return self.source_id_to_fields_map.get(field_id, []) def compatible_with(self, other: PartitionSpec) -> bool: @@ -222,11 +223,14 @@ def partition_type(self, schema: Schema) -> StructType: :return: A StructType that represents the PartitionSpec, with a NestedField for each PartitionField.
""" nested_fields = [] + schema_ids = schema._lazy_id_to_field for field in self.fields: - source_type = schema.find_type(field.source_id) - result_type = field.transform.result_type(source_type) - required = schema.find_field(field.source_id).required - nested_fields.append(NestedField(field.field_id, field.name, result_type, required=required)) + if source_field := schema_ids.get(field.source_id): + result_type = field.transform.result_type(source_field.field_type) + nested_fields.append(NestedField(field.field_id, field.name, result_type, required=source_field.required)) + else: + # Since the source field has been dropped, we cannot determine the type + nested_fields.append(NestedField(field.field_id, field.name, UnknownType())) return StructType(*nested_fields) def partition_to_path(self, data: Record, schema: Schema) -> str: @@ -254,7 +258,7 @@ def validate_partition_name( partition_transform: Transform[Any, Any], source_id: int, schema: Schema, - partition_names: Set[str], + partition_names: set[str], ) -> None: """Validate that a partition field name doesn't conflict with schema field names.""" try: @@ -372,7 +376,7 @@ def unknown(self, field_id: int, source_name: str, source_id: int, transform: st @singledispatch -def _visit(spec: PartitionSpec, schema: Schema, visitor: PartitionSpecVisitor[R]) -> List[R]: +def _visit(spec: PartitionSpec, schema: Schema, visitor: PartitionSpecVisitor[R]) -> list[R]: return [_visit_partition_field(schema, field, visitor) for field in spec.fields] @@ -412,7 +416,7 @@ class PartitionFieldValue: @dataclass(frozen=True) class PartitionKey: - field_values: List[PartitionFieldValue] + field_values: list[PartitionFieldValue] partition_spec: PartitionSpec schema: Schema diff --git a/pyiceberg/schema.py b/pyiceberg/schema.py index f6e4684b91..57ef915c04 100644 --- a/pyiceberg/schema.py +++ b/pyiceberg/schema.py @@ -17,20 +17,17 @@ # pylint: disable=W0511 from __future__ import annotations +import builtins import itertools from abc import
ABC, abstractmethod +from collections.abc import Callable from dataclasses import dataclass from functools import cached_property, partial, singledispatch from typing import ( TYPE_CHECKING, Any, - Callable, - Dict, Generic, - List, Literal, - Set, - Tuple, TypeVar, ) @@ -89,11 +86,11 @@ class Schema(IcebergBaseModel): """ type: Literal["struct"] = "struct" - fields: Tuple[NestedField, ...] = Field(default_factory=tuple) + fields: tuple[NestedField, ...] = Field(default_factory=tuple) schema_id: int = Field(alias="schema-id", default=INITIAL_SCHEMA_ID) - identifier_field_ids: List[int] = Field(alias="identifier-field-ids", default_factory=list) + identifier_field_ids: list[int] = Field(alias="identifier-field-ids", default_factory=list) - _name_to_id: Dict[str, int] = PrivateAttr() + _name_to_id: dict[str, int] = PrivateAttr() def __init__(self, *fields: NestedField, **data: Any): if fields: @@ -138,12 +135,12 @@ def check_schema(self) -> Schema: return self @property - def columns(self) -> Tuple[NestedField, ...]: + def columns(self) -> tuple[NestedField, ...]: """A tuple of the top-level fields.""" return self.fields @cached_property - def _lazy_id_to_field(self) -> Dict[int, NestedField]: + def _lazy_id_to_field(self) -> dict[int, NestedField]: """Return an index of field ID to NestedField instance. This is calculated once when called for the first time. Subsequent calls to this method will use a cached index. @@ -151,7 +148,7 @@ def _lazy_id_to_field(self) -> Dict[int, NestedField]: return index_by_id(self) @cached_property - def _lazy_id_to_parent(self) -> Dict[int, int]: + def _lazy_id_to_parent(self) -> dict[int, int]: """Returns an index of field ID to parent field IDs. This is calculated once when called for the first time. Subsequent calls to this method will use a cached index. 
@@ -159,7 +156,7 @@ def _lazy_id_to_parent(self) -> Dict[int, int]: return _index_parents(self) @cached_property - def _lazy_name_to_id_lower(self) -> Dict[str, int]: + def _lazy_name_to_id_lower(self) -> dict[str, int]: """Return an index of lower-case field names to field IDs. This is calculated once when called for the first time. Subsequent calls to this method will use a cached index. @@ -167,7 +164,7 @@ def _lazy_name_to_id_lower(self) -> Dict[str, int]: return {name.lower(): field_id for name, field_id in self._name_to_id.items()} @cached_property - def _lazy_id_to_name(self) -> Dict[int, str]: + def _lazy_id_to_name(self) -> dict[int, str]: """Return an index of field ID to full name. This is calculated once when called for the first time. Subsequent calls to this method will use a cached index. @@ -175,7 +172,7 @@ def _lazy_id_to_name(self) -> Dict[int, str]: return index_name_by_id(self) @cached_property - def _lazy_id_to_accessor(self) -> Dict[int, Accessor]: + def _lazy_id_to_accessor(self) -> dict[int, Accessor]: """Return an index of field ID to accessor. This is calculated once when called for the first time. Subsequent calls to this method will use a cached index. @@ -186,7 +183,7 @@ def as_struct(self) -> StructType: """Return the schema as a struct.""" return StructType(*self.fields) - def as_arrow(self) -> "pa.Schema": + def as_arrow(self) -> pa.Schema: """Return the schema as an Arrow schema.""" from pyiceberg.io.pyarrow import schema_to_pyarrow @@ -257,7 +254,7 @@ def find_column_name(self, column_id: int) -> str | None: return self._lazy_id_to_name.get(column_id) @property - def column_names(self) -> List[str]: + def column_names(self) -> list[str]: """ Return a list of all the column names, including nested fields. 
@@ -285,7 +282,7 @@ def accessor_for_field(self, field_id: int) -> Accessor: return self._lazy_id_to_accessor[field_id] - def identifier_field_names(self) -> Set[str]: + def identifier_field_names(self) -> set[str]: """Return the names of the identifier fields. Returns: @@ -324,7 +321,7 @@ def select(self, *names: str, case_sensitive: bool = True) -> Schema: return prune_columns(self, ids) @property - def field_ids(self) -> Set[int]: + def field_ids(self) -> set[int]: """Return the IDs of the current schema.""" return set(self._name_to_id.values()) @@ -350,7 +347,7 @@ def _validate_identifier_field(self, field_id: int) -> None: # Check whether the nested field is in a chain of required struct fields # Exploring from root for better error message for list and map types parent_id = self._lazy_id_to_parent.get(field.field_id) - fields: List[int] = [] + fields: list[int] = [] while parent_id is not None: fields.append(parent_id) parent_id = self._lazy_id_to_parent.get(parent_id) @@ -417,7 +414,7 @@ def schema(self, schema: Schema, struct_result: T) -> T: """Visit a Schema.""" @abstractmethod - def struct(self, struct: StructType, field_results: List[T]) -> T: + def struct(self, struct: StructType, field_results: builtins.list[T]) -> T: """Visit a StructType.""" @abstractmethod @@ -443,7 +440,7 @@ def schema(self, schema: Schema, struct_result: Callable[[], T]) -> T: """Visit a Schema.""" @abstractmethod - def struct(self, struct: StructType, field_results: List[Callable[[], T]]) -> T: + def struct(self, struct: StructType, field_results: builtins.list[Callable[[], T]]) -> T: """Visit a StructType.""" @abstractmethod @@ -499,7 +496,7 @@ def schema(self, schema: Schema, schema_partner: P | None, struct_result: T) -> """Visit a schema with a partner.""" @abstractmethod - def struct(self, struct: StructType, struct_partner: P | None, field_results: List[T]) -> T: + def struct(self, struct: StructType, struct_partner: P | None, field_results: builtins.list[T]) -> T: 
"""Visit a struct type with a partner.""" @abstractmethod @@ -979,41 +976,41 @@ def _(obj: PrimitiveType, visitor: PreOrderSchemaVisitor[T]) -> T: return visitor.primitive(obj) -class _IndexById(SchemaVisitor[Dict[int, NestedField]]): +class _IndexById(SchemaVisitor[dict[int, NestedField]]): """A schema visitor for generating a field ID to NestedField index.""" def __init__(self) -> None: - self._index: Dict[int, NestedField] = {} + self._index: dict[int, NestedField] = {} - def schema(self, schema: Schema, struct_result: Dict[int, NestedField]) -> Dict[int, NestedField]: + def schema(self, schema: Schema, struct_result: dict[int, NestedField]) -> dict[int, NestedField]: return self._index - def struct(self, struct: StructType, field_results: List[Dict[int, NestedField]]) -> Dict[int, NestedField]: + def struct(self, struct: StructType, field_results: builtins.list[dict[int, NestedField]]) -> dict[int, NestedField]: return self._index - def field(self, field: NestedField, field_result: Dict[int, NestedField]) -> Dict[int, NestedField]: + def field(self, field: NestedField, field_result: dict[int, NestedField]) -> dict[int, NestedField]: """Add the field ID to the index.""" self._index[field.field_id] = field return self._index - def list(self, list_type: ListType, element_result: Dict[int, NestedField]) -> Dict[int, NestedField]: + def list(self, list_type: ListType, element_result: dict[int, NestedField]) -> dict[int, NestedField]: """Add the list element ID to the index.""" self._index[list_type.element_field.field_id] = list_type.element_field return self._index def map( - self, map_type: MapType, key_result: Dict[int, NestedField], value_result: Dict[int, NestedField] - ) -> Dict[int, NestedField]: + self, map_type: MapType, key_result: dict[int, NestedField], value_result: dict[int, NestedField] + ) -> dict[int, NestedField]: """Add the key ID and value ID as individual items in the index.""" self._index[map_type.key_field.field_id] = map_type.key_field 
self._index[map_type.value_field.field_id] = map_type.value_field return self._index - def primitive(self, primitive: PrimitiveType) -> Dict[int, NestedField]: + def primitive(self, primitive: PrimitiveType) -> dict[int, NestedField]: return self._index -def index_by_id(schema_or_type: Schema | IcebergType) -> Dict[int, NestedField]: +def index_by_id(schema_or_type: Schema | IcebergType) -> dict[int, NestedField]: """Generate an index of field IDs to NestedField instances. Args: @@ -1025,10 +1022,10 @@ def index_by_id(schema_or_type: Schema | IcebergType) -> Dict[int, NestedField]: return visit(schema_or_type, _IndexById()) -class _IndexParents(SchemaVisitor[Dict[int, int]]): +class _IndexParents(SchemaVisitor[dict[int, int]]): def __init__(self) -> None: - self.id_to_parent: Dict[int, int] = {} - self.id_stack: List[int] = [] + self.id_to_parent: dict[int, int] = {} + self.id_stack: list[int] = [] def before_field(self, field: NestedField) -> None: self.id_stack.append(field.field_id) @@ -1036,10 +1033,10 @@ def before_field(self, field: NestedField) -> None: def after_field(self, field: NestedField) -> None: self.id_stack.pop() - def schema(self, schema: Schema, struct_result: Dict[int, int]) -> Dict[int, int]: + def schema(self, schema: Schema, struct_result: dict[int, int]) -> dict[int, int]: return self.id_to_parent - def struct(self, struct: StructType, field_results: List[Dict[int, int]]) -> Dict[int, int]: + def struct(self, struct: StructType, field_results: builtins.list[dict[int, int]]) -> dict[int, int]: for field in struct.fields: parent_id = self.id_stack[-1] if self.id_stack else None if parent_id is not None: @@ -1048,23 +1045,23 @@ def struct(self, struct: StructType, field_results: List[Dict[int, int]]) -> Dic return self.id_to_parent - def field(self, field: NestedField, field_result: Dict[int, int]) -> Dict[int, int]: + def field(self, field: NestedField, field_result: dict[int, int]) -> dict[int, int]: return self.id_to_parent - def list(self, 
list_type: ListType, element_result: Dict[int, int]) -> Dict[int, int]: + def list(self, list_type: ListType, element_result: dict[int, int]) -> dict[int, int]: self.id_to_parent[list_type.element_id] = self.id_stack[-1] return self.id_to_parent - def map(self, map_type: MapType, key_result: Dict[int, int], value_result: Dict[int, int]) -> Dict[int, int]: + def map(self, map_type: MapType, key_result: dict[int, int], value_result: dict[int, int]) -> dict[int, int]: self.id_to_parent[map_type.key_id] = self.id_stack[-1] self.id_to_parent[map_type.value_id] = self.id_stack[-1] return self.id_to_parent - def primitive(self, primitive: PrimitiveType) -> Dict[int, int]: + def primitive(self, primitive: PrimitiveType) -> dict[int, int]: return self.id_to_parent -def _index_parents(schema_or_type: Schema | IcebergType) -> Dict[int, int]: +def _index_parents(schema_or_type: Schema | IcebergType) -> dict[int, int]: """Generate an index of field IDs to their parent field IDs. Args: @@ -1076,15 +1073,15 @@ def _index_parents(schema_or_type: Schema | IcebergType) -> Dict[int, int]: return visit(schema_or_type, _IndexParents()) -class _IndexByName(SchemaVisitor[Dict[str, int]]): +class _IndexByName(SchemaVisitor[dict[str, int]]): """A schema visitor for generating a field name to field ID index.""" def __init__(self) -> None: - self._index: Dict[str, int] = {} - self._short_name_to_id: Dict[str, int] = {} - self._combined_index: Dict[str, int] = {} - self._field_names: List[str] = [] - self._short_field_names: List[str] = [] + self._index: dict[str, int] = {} + self._short_name_to_id: dict[str, int] = {} + self._combined_index: dict[str, int] = {} + self._field_names: list[str] = [] + self._short_field_names: list[str] = [] def before_map_value(self, value: NestedField) -> None: if not isinstance(value.field_type, StructType): @@ -1117,23 +1114,23 @@ def after_field(self, field: NestedField) -> None: self._field_names.pop() self._short_field_names.pop() - def schema(self, 
schema: Schema, struct_result: Dict[str, int]) -> Dict[str, int]: + def schema(self, schema: Schema, struct_result: dict[str, int]) -> dict[str, int]: return self._index - def struct(self, struct: StructType, field_results: List[Dict[str, int]]) -> Dict[str, int]: + def struct(self, struct: StructType, field_results: builtins.list[dict[str, int]]) -> dict[str, int]: return self._index - def field(self, field: NestedField, field_result: Dict[str, int]) -> Dict[str, int]: + def field(self, field: NestedField, field_result: dict[str, int]) -> dict[str, int]: """Add the field name to the index.""" self._add_field(field.name, field.field_id) return self._index - def list(self, list_type: ListType, element_result: Dict[str, int]) -> Dict[str, int]: + def list(self, list_type: ListType, element_result: dict[str, int]) -> dict[str, int]: """Add the list element name to the index.""" self._add_field(list_type.element_field.name, list_type.element_field.field_id) return self._index - def map(self, map_type: MapType, key_result: Dict[str, int], value_result: Dict[str, int]) -> Dict[str, int]: + def map(self, map_type: MapType, key_result: dict[str, int], value_result: dict[str, int]) -> dict[str, int]: """Add the key name and value name as individual items in the index.""" self._add_field(map_type.key_field.name, map_type.key_field.field_id) self._add_field(map_type.value_field.name, map_type.value_field.field_id) @@ -1162,10 +1159,10 @@ def _add_field(self, name: str, field_id: int) -> None: short_name = ".".join([".".join(self._short_field_names), name]) self._short_name_to_id[short_name] = field_id - def primitive(self, primitive: PrimitiveType) -> Dict[str, int]: + def primitive(self, primitive: PrimitiveType) -> dict[str, int]: return self._index - def by_name(self) -> Dict[str, int]: + def by_name(self) -> dict[str, int]: """Return an index of combined full and short names. Note: Only short names that do not conflict with full names are included. 
@@ -1174,13 +1171,13 @@ def by_name(self) -> Dict[str, int]: combined_index.update(self._index) return combined_index - def by_id(self) -> Dict[int, str]: + def by_id(self) -> dict[int, str]: """Return an index of ID to full names.""" id_to_full_name = {value: key for key, value in self._index.items()} return id_to_full_name -def index_by_name(schema_or_type: Schema | IcebergType) -> Dict[str, int]: +def index_by_name(schema_or_type: Schema | IcebergType) -> dict[str, int]: """Generate an index of field names to field IDs. Args: @@ -1197,7 +1194,7 @@ def index_by_name(schema_or_type: Schema | IcebergType) -> Dict[str, int]: return EMPTY_DICT -def index_name_by_id(schema_or_type: Schema | IcebergType) -> Dict[int, str]: +def index_name_by_id(schema_or_type: Schema | IcebergType) -> dict[int, str]: """Generate an index of field IDs full field names. Args: @@ -1214,7 +1211,7 @@ def index_name_by_id(schema_or_type: Schema | IcebergType) -> Dict[int, str]: Position = int -class _BuildPositionAccessors(SchemaVisitor[Dict[Position, Accessor]]): +class _BuildPositionAccessors(SchemaVisitor[dict[Position, Accessor]]): """A schema visitor for generating a field ID to accessor index. 
Example: @@ -1247,10 +1244,10 @@ class _BuildPositionAccessors(SchemaVisitor[Dict[Position, Accessor]]): True """ - def schema(self, schema: Schema, struct_result: Dict[Position, Accessor]) -> Dict[Position, Accessor]: + def schema(self, schema: Schema, struct_result: dict[Position, Accessor]) -> dict[Position, Accessor]: return struct_result - def struct(self, struct: StructType, field_results: List[Dict[Position, Accessor]]) -> Dict[Position, Accessor]: + def struct(self, struct: StructType, field_results: builtins.list[dict[Position, Accessor]]) -> dict[Position, Accessor]: result = {} for position, field in enumerate(struct.fields): @@ -1261,22 +1258,22 @@ def struct(self, struct: StructType, field_results: List[Dict[Position, Accessor return result - def field(self, field: NestedField, field_result: Dict[Position, Accessor]) -> Dict[Position, Accessor]: + def field(self, field: NestedField, field_result: dict[Position, Accessor]) -> dict[Position, Accessor]: return field_result - def list(self, list_type: ListType, element_result: Dict[Position, Accessor]) -> Dict[Position, Accessor]: + def list(self, list_type: ListType, element_result: dict[Position, Accessor]) -> dict[Position, Accessor]: return {} def map( - self, map_type: MapType, key_result: Dict[Position, Accessor], value_result: Dict[Position, Accessor] - ) -> Dict[Position, Accessor]: + self, map_type: MapType, key_result: dict[Position, Accessor], value_result: dict[Position, Accessor] + ) -> dict[Position, Accessor]: return {} - def primitive(self, primitive: PrimitiveType) -> Dict[Position, Accessor]: + def primitive(self, primitive: PrimitiveType) -> dict[Position, Accessor]: return {} -def build_position_accessors(schema_or_type: Schema | IcebergType) -> Dict[int, Accessor]: +def build_position_accessors(schema_or_type: Schema | IcebergType) -> dict[int, Accessor]: """Generate an index of field IDs to schema position accessors. 
Args: @@ -1296,7 +1293,7 @@ def assign_fresh_schema_ids(schema_or_type: Schema | IcebergType, next_id: Calla class _SetFreshIDs(PreOrderSchemaVisitor[IcebergType]): """Traverses the schema and assigns monotonically increasing ids.""" - old_id_to_new_id: Dict[int, int] + old_id_to_new_id: dict[int, int] def __init__(self, next_id_func: Callable[[], int] | None = None) -> None: self.old_id_to_new_id = {} @@ -1314,7 +1311,7 @@ def schema(self, schema: Schema, struct_result: Callable[[], StructType]) -> Sch identifier_field_ids=[self.old_id_to_new_id[field_id] for field_id in schema.identifier_field_ids], ) - def struct(self, struct: StructType, field_results: List[Callable[[], IcebergType]]) -> StructType: + def struct(self, struct: StructType, field_results: builtins.list[Callable[[], IcebergType]]) -> StructType: new_ids = [self._get_and_increment(field.field_id) for field in struct.fields] new_fields = [] for field_id, field, field_type in zip(new_ids, struct.fields, field_results, strict=True): @@ -1445,7 +1442,7 @@ def field(self, field: NestedField, field_result: IcebergType | None) -> Iceberg required=field.required, ) - def struct(self, struct: StructType, field_results: List[IcebergType | None]) -> IcebergType | None: + def struct(self, struct: StructType, field_results: builtins.list[IcebergType | None]) -> IcebergType | None: return StructType(*[field for field in field_results if field is not None]) def list(self, list_type: ListType, element_result: IcebergType | None) -> IcebergType | None: @@ -1464,7 +1461,7 @@ def primitive(self, primitive: PrimitiveType) -> IcebergType | None: return primitive -def prune_columns(schema: Schema, selected: Set[int], select_full_types: bool = True) -> Schema: +def prune_columns(schema: Schema, selected: set[int], select_full_types: bool = True) -> Schema: """Prunes a column by only selecting a set of field-ids. 
Args: @@ -1484,17 +1481,17 @@ def prune_columns(schema: Schema, selected: Set[int], select_full_types: bool = class _PruneColumnsVisitor(SchemaVisitor[IcebergType | None]): - selected: Set[int] + selected: set[int] select_full_types: bool - def __init__(self, selected: Set[int], select_full_types: bool): + def __init__(self, selected: set[int], select_full_types: bool): self.selected = selected self.select_full_types = select_full_types def schema(self, schema: Schema, struct_result: IcebergType | None) -> IcebergType | None: return struct_result - def struct(self, struct: StructType, field_results: List[IcebergType | None]) -> IcebergType | None: + def struct(self, struct: StructType, field_results: builtins.list[IcebergType | None]) -> IcebergType | None: fields = struct.fields selected_fields = [] same_type = True @@ -1781,7 +1778,7 @@ def schema(self, schema: Schema, struct_result: Callable[[], bool]) -> bool: raise ValueError(f"Mismatch in fields:\n{self.console.export_text()}") return result - def struct(self, struct: StructType, field_results: List[Callable[[], bool]]) -> bool: + def struct(self, struct: StructType, field_results: builtins.list[Callable[[], bool]]) -> bool: results = [result() for result in field_results] return all(results) diff --git a/pyiceberg/serializers.py b/pyiceberg/serializers.py index e2994884c6..726e6b5f62 100644 --- a/pyiceberg/serializers.py +++ b/pyiceberg/serializers.py @@ -19,7 +19,7 @@ import codecs import gzip from abc import ABC, abstractmethod -from typing import Callable +from collections.abc import Callable from pyiceberg.io import InputFile, InputStream, OutputFile from pyiceberg.table.metadata import TableMetadata, TableMetadataUtil diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py index cc7c1c6af0..2e26a4ccc2 100644 --- a/pyiceberg/table/__init__.py +++ b/pyiceberg/table/__init__.py @@ -21,6 +21,7 @@ import uuid import warnings from abc import ABC, abstractmethod +from collections.abc import 
Callable, Iterable, Iterator from dataclasses import dataclass from functools import cached_property from itertools import chain @@ -28,14 +29,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, - Dict, - Iterable, - Iterator, - List, - Set, - Tuple, - Type, TypeVar, ) @@ -254,8 +247,8 @@ class TableProperties: class Transaction: _table: Table _autocommit: bool - _updates: Tuple[TableUpdate, ...] - _requirements: Tuple[TableRequirement, ...] + _updates: tuple[TableUpdate, ...] + _requirements: tuple[TableRequirement, ...] def __init__(self, table: Table, autocommit: bool = False): """Open a transaction to stage and commit changes to a table. @@ -277,12 +270,12 @@ def __enter__(self) -> Transaction: """Start a transaction to update the table.""" return self - def __exit__(self, exctype: Type[BaseException] | None, excinst: BaseException | None, exctb: TracebackType | None) -> None: + def __exit__(self, exctype: type[BaseException] | None, excinst: BaseException | None, exctb: TracebackType | None) -> None: """Close and commit the transaction if no exceptions have been raised.""" if exctype is None and excinst is None and exctb is None: self.commit_transaction() - def _apply(self, updates: Tuple[TableUpdate, ...], requirements: Tuple[TableRequirement, ...] = ()) -> Transaction: + def _apply(self, updates: tuple[TableUpdate, ...], requirements: tuple[TableRequirement, ...] = ()) -> Transaction: """Check if the requirements are met, and applies the updates to the metadata.""" for requirement in requirements: requirement.validate(self.table_metadata) @@ -377,7 +370,7 @@ def _set_ref_snapshot( return updates, requirements - def _build_partition_predicate(self, partition_records: Set[Record]) -> BooleanExpression: + def _build_partition_predicate(self, partition_records: set[Record]) -> BooleanExpression: """Build a filter predicate matching any of the input partition records. 
Args: @@ -404,7 +397,7 @@ def _build_partition_predicate(self, partition_records: Set[Record]) -> BooleanE return expr def _append_snapshot_producer( - self, snapshot_properties: Dict[str, str], branch: str | None = MAIN_BRANCH + self, snapshot_properties: dict[str, str], branch: str | None = MAIN_BRANCH ) -> _FastAppendFiles: """Determine the append type based on table properties. @@ -453,7 +446,7 @@ def update_sort_order(self, case_sensitive: bool = True) -> UpdateSortOrder: ) def update_snapshot( - self, snapshot_properties: Dict[str, str] = EMPTY_DICT, branch: str | None = MAIN_BRANCH + self, snapshot_properties: dict[str, str] = EMPTY_DICT, branch: str | None = MAIN_BRANCH ) -> UpdateSnapshot: """Create a new UpdateSnapshot to produce a new snapshot for the table. @@ -471,7 +464,7 @@ def update_statistics(self) -> UpdateStatistics: """ return UpdateStatistics(transaction=self) - def append(self, df: pa.Table, snapshot_properties: Dict[str, str] = EMPTY_DICT, branch: str | None = MAIN_BRANCH) -> None: + def append(self, df: pa.Table, snapshot_properties: dict[str, str] = EMPTY_DICT, branch: str | None = MAIN_BRANCH) -> None: """ Shorthand API for appending a PyArrow table to a table transaction. @@ -510,7 +503,7 @@ def append(self, df: pa.Table, snapshot_properties: Dict[str, str] = EMPTY_DICT, append_files.append_data_file(data_file) def dynamic_partition_overwrite( - self, df: pa.Table, snapshot_properties: Dict[str, str] = EMPTY_DICT, branch: str | None = MAIN_BRANCH + self, df: pa.Table, snapshot_properties: dict[str, str] = EMPTY_DICT, branch: str | None = MAIN_BRANCH ) -> None: """ Shorthand for overwriting existing partitions with a PyArrow table. 
@@ -556,7 +549,7 @@ def dynamic_partition_overwrite( return append_snapshot_commit_uuid = uuid.uuid4() - data_files: List[DataFile] = list( + data_files: list[DataFile] = list( _dataframe_to_data_files( table_metadata=self._table.metadata, write_uuid=append_snapshot_commit_uuid, df=df, io=self._table.io ) @@ -575,7 +568,7 @@ def overwrite( self, df: pa.Table, overwrite_filter: BooleanExpression | str = ALWAYS_TRUE, - snapshot_properties: Dict[str, str] = EMPTY_DICT, + snapshot_properties: dict[str, str] = EMPTY_DICT, case_sensitive: bool = True, branch: str | None = MAIN_BRANCH, ) -> None: @@ -635,7 +628,7 @@ def overwrite( def delete( self, delete_filter: str | BooleanExpression, - snapshot_properties: Dict[str, str] = EMPTY_DICT, + snapshot_properties: dict[str, str] = EMPTY_DICT, case_sensitive: bool = True, branch: str | None = MAIN_BRANCH, ) -> None: @@ -663,7 +656,7 @@ def delete( self.table_metadata.properties.get(TableProperties.DELETE_MODE, TableProperties.DELETE_MODE_DEFAULT) == TableProperties.DELETE_MODE_MERGE_ON_READ ): - warnings.warn("Merge on read is not yet supported, falling back to copy-on-write") + warnings.warn("Merge on read is not yet supported, falling back to copy-on-write", stacklevel=2) if isinstance(delete_filter, str): delete_filter = _parse_row_filter(delete_filter) @@ -684,7 +677,7 @@ def delete( commit_uuid = uuid.uuid4() counter = itertools.count(0) - replaced_files: List[Tuple[DataFile, List[DataFile]]] = [] + replaced_files: list[tuple[DataFile, list[DataFile]]] = [] # This will load the Parquet file into memory, including: # - Filter out the rows based on the delete filter # - Projecting it to the current schema @@ -731,12 +724,12 @@ def delete( overwrite_snapshot.append_data_file(replaced_data_file) if not delete_snapshot.files_affected and not delete_snapshot.rewrites_needed: - warnings.warn("Delete operation did not match any records") + warnings.warn("Delete operation did not match any records", stacklevel=2) def upsert( 
self, df: pa.Table, - join_cols: List[str] | None = None, + join_cols: list[str] | None = None, when_matched_update_all: bool = True, when_not_matched_insert_all: bool = True, case_sensitive: bool = True, @@ -879,8 +872,8 @@ def upsert( def add_files( self, - file_paths: List[str], - snapshot_properties: Dict[str, str] = EMPTY_DICT, + file_paths: list[str], + snapshot_properties: dict[str, str] = EMPTY_DICT, check_duplicate_files: bool = True, branch: str | None = MAIN_BRANCH, ) -> None: @@ -1025,10 +1018,10 @@ def commit_transaction(self) -> Table: return self._table -class Namespace(IcebergRootModel[List[str]]): +class Namespace(IcebergRootModel[list[str]]): """Reference to one or more levels of a namespace.""" - root: List[str] = Field( + root: list[str] = Field( ..., description="Reference to one or more levels of a namespace", ) @@ -1045,8 +1038,8 @@ class CommitTableRequest(IcebergBaseModel): """A pydantic BaseModel for a table commit request.""" identifier: TableIdentifier = Field() - requirements: Tuple[TableRequirement, ...] = Field(default_factory=tuple) - updates: Tuple[TableUpdate, ...] = Field(default_factory=tuple) + requirements: tuple[TableRequirement, ...] = Field(default_factory=tuple) + updates: tuple[TableUpdate, ...] = Field(default_factory=tuple) class CommitTableResponse(IcebergBaseModel): @@ -1064,7 +1057,7 @@ class Table: metadata_location: str = Field() io: FileIO catalog: Catalog - config: Dict[str, str] + config: dict[str, str] def __init__( self, @@ -1073,7 +1066,7 @@ def __init__( metadata_location: str, io: FileIO, catalog: Catalog, - config: Dict[str, str] = EMPTY_DICT, + config: dict[str, str] = EMPTY_DICT, ) -> None: self._identifier = identifier self.metadata = metadata @@ -1131,7 +1124,7 @@ def name(self) -> Identifier: def scan( self, row_filter: str | BooleanExpression = ALWAYS_TRUE, - selected_fields: Tuple[str, ...] = ("*",), + selected_fields: tuple[str, ...] 
= ("*",), case_sensitive: bool = True, snapshot_id: int | None = None, options: Properties = EMPTY_DICT, @@ -1185,7 +1178,7 @@ def schema(self) -> Schema: """Return the schema for this table.""" return next(schema for schema in self.metadata.schemas if schema.schema_id == self.metadata.current_schema_id) - def schemas(self) -> Dict[int, Schema]: + def schemas(self) -> dict[int, Schema]: """Return a dict of the schema of this table.""" return {schema.schema_id: schema for schema in self.metadata.schemas} @@ -1193,7 +1186,7 @@ def spec(self) -> PartitionSpec: """Return the partition spec of this table.""" return next(spec for spec in self.metadata.partition_specs if spec.spec_id == self.metadata.default_spec_id) - def specs(self) -> Dict[int, PartitionSpec]: + def specs(self) -> dict[int, PartitionSpec]: """Return a dict the partition specs this table.""" return {spec.spec_id: spec for spec in self.metadata.partition_specs} @@ -1203,7 +1196,7 @@ def sort_order(self) -> SortOrder: sort_order for sort_order in self.metadata.sort_orders if sort_order.order_id == self.metadata.default_sort_order_id ) - def sort_orders(self) -> Dict[int, SortOrder]: + def sort_orders(self) -> dict[int, SortOrder]: """Return a dict of the sort orders of this table.""" return {sort_order.order_id: sort_order for sort_order in self.metadata.sort_orders} @@ -1214,7 +1207,7 @@ def last_partition_id(self) -> int: return PARTITION_FIELD_ID_START - 1 @property - def properties(self) -> Dict[str, str]: + def properties(self) -> dict[str, str]: """Properties of the table.""" return self.metadata.properties @@ -1236,7 +1229,7 @@ def current_snapshot(self) -> Snapshot | None: return self.snapshot_by_id(self.metadata.current_snapshot_id) return None - def snapshots(self) -> List[Snapshot]: + def snapshots(self) -> list[Snapshot]: return self.metadata.snapshots def snapshot_by_id(self, snapshot_id: int) -> Snapshot | None: @@ -1261,7 +1254,7 @@ def snapshot_as_of_timestamp(self, timestamp_ms: int, 
inclusive: bool = True) -> return self.snapshot_by_id(log_entry.snapshot_id) return None - def history(self) -> List[SnapshotLogEntry]: + def history(self) -> list[SnapshotLogEntry]: """Get the snapshot history of this table.""" return self.metadata.snapshot_log @@ -1329,7 +1322,7 @@ def name_mapping(self) -> NameMapping | None: def upsert( self, df: pa.Table, - join_cols: List[str] | None = None, + join_cols: list[str] | None = None, when_matched_update_all: bool = True, when_not_matched_insert_all: bool = True, case_sensitive: bool = True, @@ -1380,7 +1373,7 @@ def upsert( branch=branch, ) - def append(self, df: pa.Table, snapshot_properties: Dict[str, str] = EMPTY_DICT, branch: str | None = MAIN_BRANCH) -> None: + def append(self, df: pa.Table, snapshot_properties: dict[str, str] = EMPTY_DICT, branch: str | None = MAIN_BRANCH) -> None: """ Shorthand API for appending a PyArrow table to the table. @@ -1393,7 +1386,7 @@ def append(self, df: pa.Table, snapshot_properties: Dict[str, str] = EMPTY_DICT, tx.append(df=df, snapshot_properties=snapshot_properties, branch=branch) def dynamic_partition_overwrite( - self, df: pa.Table, snapshot_properties: Dict[str, str] = EMPTY_DICT, branch: str | None = MAIN_BRANCH + self, df: pa.Table, snapshot_properties: dict[str, str] = EMPTY_DICT, branch: str | None = MAIN_BRANCH ) -> None: """Shorthand for dynamic overwriting the table with a PyArrow table. 
@@ -1410,7 +1403,7 @@ def overwrite( self, df: pa.Table, overwrite_filter: BooleanExpression | str = ALWAYS_TRUE, - snapshot_properties: Dict[str, str] = EMPTY_DICT, + snapshot_properties: dict[str, str] = EMPTY_DICT, case_sensitive: bool = True, branch: str | None = MAIN_BRANCH, ) -> None: @@ -1443,7 +1436,7 @@ def overwrite( def delete( self, delete_filter: BooleanExpression | str = ALWAYS_TRUE, - snapshot_properties: Dict[str, str] = EMPTY_DICT, + snapshot_properties: dict[str, str] = EMPTY_DICT, case_sensitive: bool = True, branch: str | None = MAIN_BRANCH, ) -> None: @@ -1463,8 +1456,8 @@ def delete( def add_files( self, - file_paths: List[str], - snapshot_properties: Dict[str, str] = EMPTY_DICT, + file_paths: list[str], + snapshot_properties: dict[str, str] = EMPTY_DICT, check_duplicate_files: bool = True, branch: str | None = MAIN_BRANCH, ) -> None: @@ -1488,11 +1481,11 @@ def add_files( def update_spec(self, case_sensitive: bool = True) -> UpdateSpec: return UpdateSpec(Transaction(self, autocommit=True), case_sensitive=case_sensitive) - def refs(self) -> Dict[str, SnapshotRef]: + def refs(self) -> dict[str, SnapshotRef]: """Return the snapshot references in the table.""" return self.metadata.refs - def _do_commit(self, updates: Tuple[TableUpdate, ...], requirements: Tuple[TableRequirement, ...]) -> None: + def _do_commit(self, updates: tuple[TableUpdate, ...], requirements: tuple[TableRequirement, ...]) -> None: response = self.catalog.commit_table(self, requirements, updates) # https://github.com/apache/iceberg/blob/f6faa58/core/src/main/java/org/apache/iceberg/CatalogUtil.java#L527 @@ -1502,7 +1495,7 @@ def _do_commit(self, updates: Tuple[TableUpdate, ...], requirements: Tuple[Table try: self.catalog._delete_old_metadata(self.io, self.metadata, response.metadata) except Exception as e: - warnings.warn(f"Failed to delete old metadata after commit: {e}") + warnings.warn(f"Failed to delete old metadata after commit: {e}", stacklevel=2) self.metadata = 
response.metadata self.metadata_location = response.metadata_location @@ -1555,7 +1548,7 @@ def to_polars(self) -> pl.LazyFrame: return pl.scan_iceberg(self) - def __datafusion_table_provider__(self) -> "IcebergDataFusionTable": + def __datafusion_table_provider__(self) -> IcebergDataFusionTable: """Return the DataFusion table provider PyCapsule interface. To support DataFusion features such as push down filtering, this function will return a PyCapsule @@ -1654,7 +1647,7 @@ def refresh(self) -> Table: def scan( self, row_filter: str | BooleanExpression = ALWAYS_TRUE, - selected_fields: Tuple[str, ...] = ("*",), + selected_fields: tuple[str, ...] = ("*",), case_sensitive: bool = True, snapshot_id: int | None = None, options: Properties = EMPTY_DICT, @@ -1686,7 +1679,7 @@ class TableScan(ABC): table_metadata: TableMetadata io: FileIO row_filter: BooleanExpression - selected_fields: Tuple[str, ...] + selected_fields: tuple[str, ...] case_sensitive: bool snapshot_id: int | None options: Properties @@ -1697,7 +1690,7 @@ def __init__( table_metadata: TableMetadata, io: FileIO, row_filter: str | BooleanExpression = ALWAYS_TRUE, - selected_fields: Tuple[str, ...] = ("*",), + selected_fields: tuple[str, ...] = ("*",), case_sensitive: bool = True, snapshot_id: int | None = None, options: Properties = EMPTY_DICT, @@ -1728,7 +1721,7 @@ def projection(self) -> Schema: schema for schema in self.table_metadata.schemas if schema.schema_id == snapshot.schema_id ) except StopIteration: - warnings.warn(f"Metadata does not contain schema with id: {snapshot.schema_id}") + warnings.warn(f"Metadata does not contain schema with id: {snapshot.schema_id}", stacklevel=2) else: raise ValueError(f"Snapshot not found: {self.snapshot_id}") @@ -1783,7 +1776,7 @@ def with_case_sensitive(self: S, case_sensitive: bool = True) -> S: def count(self) -> int: ... 
-class ScanTask(ABC): +class ScanTask: pass @@ -1792,23 +1785,17 @@ class FileScanTask(ScanTask): """Task representing a data file and its corresponding delete files.""" file: DataFile - delete_files: Set[DataFile] - start: int - length: int + delete_files: set[DataFile] residual: BooleanExpression def __init__( self, data_file: DataFile, - delete_files: Set[DataFile] | None = None, - start: int | None = None, - length: int | None = None, + delete_files: set[DataFile] | None = None, residual: BooleanExpression = ALWAYS_TRUE, ) -> None: self.file = data_file self.delete_files = delete_files or set() - self.start = start or 0 - self.length = length or data_file.file_size_in_bytes self.residual = residual @@ -1817,7 +1804,7 @@ def _open_manifest( manifest: ManifestFile, partition_filter: Callable[[DataFile], bool], metrics_evaluator: Callable[[DataFile], bool], -) -> List[ManifestEntry]: +) -> list[ManifestEntry]: """Open a manifest file and return matching manifest entries. Returns: @@ -1830,7 +1817,7 @@ def _open_manifest( ] -def _min_sequence_number(manifests: List[ManifestFile]) -> int: +def _min_sequence_number(manifests: list[ManifestFile]) -> int: try: return min( manifest.min_sequence_number or INITIAL_SEQUENCE_NUMBER @@ -1842,7 +1829,7 @@ def _min_sequence_number(manifests: List[ManifestFile]) -> int: return INITIAL_SEQUENCE_NUMBER -def _match_deletes_to_data_file(data_entry: ManifestEntry, positional_delete_entries: SortedList[ManifestEntry]) -> Set[DataFile]: +def _match_deletes_to_data_file(data_entry: ManifestEntry, positional_delete_entries: SortedList[ManifestEntry]) -> set[DataFile]: """Check if the delete file is relevant for the data file. Using the column metrics to see if the filename is in the lower and upper bound. 
@@ -1939,7 +1926,7 @@ def _check_sequence_number(min_sequence_number: int, manifest: ManifestFile) -> and (manifest.sequence_number or INITIAL_SEQUENCE_NUMBER) >= min_sequence_number ) - def scan_plan_helper(self) -> Iterator[List[ManifestEntry]]: + def scan_plan_helper(self) -> Iterator[list[ManifestEntry]]: """Filter and return manifest entries based on partition and metrics evaluators. Returns: @@ -1952,7 +1939,7 @@ def scan_plan_helper(self) -> Iterator[List[ManifestEntry]]: # step 1: filter manifests using partition summaries # the filter depends on the partition spec used to write the manifest file, so create a cache of filters for each spec id - manifest_evaluators: Dict[int, Callable[[ManifestFile], bool]] = KeyDefaultDict(self._build_manifest_evaluator) + manifest_evaluators: dict[int, Callable[[ManifestFile], bool]] = KeyDefaultDict(self._build_manifest_evaluator) manifests = [ manifest_file @@ -1963,7 +1950,7 @@ def scan_plan_helper(self) -> Iterator[List[ManifestEntry]]: # step 2: filter the data files in each manifest # this filter depends on the partition spec used to write the manifest file - partition_evaluators: Dict[int, Callable[[DataFile], bool]] = KeyDefaultDict(self._build_partition_evaluator) + partition_evaluators: dict[int, Callable[[DataFile], bool]] = KeyDefaultDict(self._build_partition_evaluator) min_sequence_number = _min_sequence_number(manifests) @@ -1989,10 +1976,10 @@ def plan_files(self) -> Iterable[FileScanTask]: Returns: List of FileScanTasks that contain both data and delete files. 
""" - data_entries: List[ManifestEntry] = [] + data_entries: list[ManifestEntry] = [] positional_delete_entries = SortedList(key=lambda entry: entry.sequence_number or INITIAL_SEQUENCE_NUMBER) - residual_evaluators: Dict[int, Callable[[DataFile], ResidualEvaluator]] = KeyDefaultDict(self._build_residual_evaluator) + residual_evaluators: dict[int, Callable[[DataFile], ResidualEvaluator]] = KeyDefaultDict(self._build_residual_evaluator) for manifest_entry in chain.from_iterable(self.scan_plan_helper()): data_file = manifest_entry.data_file @@ -2138,7 +2125,7 @@ class WriteTask: write_uuid: uuid.UUID task_id: int schema: Schema - record_batches: List[pa.RecordBatch] + record_batches: list[pa.RecordBatch] sort_order_id: int | None = None partition_key: PartitionKey | None = None @@ -2148,7 +2135,7 @@ def generate_data_file_filename(self, extension: str) -> str: return f"00000-{self.task_id}-{self.write_uuid}.{extension}" -def _parquet_files_to_data_files(table_metadata: TableMetadata, file_paths: List[str], io: FileIO) -> Iterable[DataFile]: +def _parquet_files_to_data_files(table_metadata: TableMetadata, file_paths: list[str], io: FileIO) -> Iterable[DataFile]: """Convert a list files into DataFiles. 
Returns: diff --git a/pyiceberg/table/inspect.py b/pyiceberg/table/inspect.py index c4591a40e9..bfe2fffa56 100644 --- a/pyiceberg/table/inspect.py +++ b/pyiceberg/table/inspect.py @@ -17,8 +17,9 @@ from __future__ import annotations import itertools +from collections.abc import Iterator from datetime import datetime, timezone -from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Set, Tuple +from typing import TYPE_CHECKING, Any from pyiceberg.conversions import from_bytes from pyiceberg.expressions import AlwaysTrue, BooleanExpression @@ -60,7 +61,7 @@ def _get_snapshot(self, snapshot_id: int | None = None) -> Snapshot: else: raise ValueError("Cannot get a snapshot as the table does not have any.") - def snapshots(self) -> "pa.Table": + def snapshots(self) -> pa.Table: import pyarrow as pa snapshots_schema = pa.schema( @@ -98,7 +99,7 @@ def snapshots(self) -> "pa.Table": schema=snapshots_schema, ) - def entries(self, snapshot_id: int | None = None) -> "pa.Table": + def entries(self, snapshot_id: int | None = None) -> pa.Table: import pyarrow as pa from pyiceberg.io.pyarrow import schema_to_pyarrow @@ -229,7 +230,7 @@ def _readable_metrics_struct(bound_type: PrimitiveType) -> pa.StructType: schema=entries_schema, ) - def refs(self) -> "pa.Table": + def refs(self) -> pa.Table: import pyarrow as pa ref_schema = pa.schema( @@ -264,7 +265,7 @@ def partitions( snapshot_id: int | None = None, row_filter: str | BooleanExpression = ALWAYS_TRUE, case_sensitive: bool = True, - ) -> "pa.Table": + ) -> pa.Table: import pyarrow as pa from pyiceberg.io.pyarrow import schema_to_pyarrow @@ -308,7 +309,7 @@ def partitions( snapshot_id=snapshot.snapshot_id, ) - partitions_map: Dict[Tuple[str, Any], Any] = {} + partitions_map: dict[tuple[str, Any], Any] = {} for entry in itertools.chain.from_iterable(scan.scan_plan_helper()): partition = entry.data_file.partition @@ -327,9 +328,9 @@ def partitions( def _update_partitions_map_from_manifest_entry( self, - partitions_map: 
Dict[Tuple[str, Any], Any], + partitions_map: dict[tuple[str, Any], Any], file: DataFile, - partition_record_dict: Dict[str, Any], + partition_record_dict: dict[str, Any], snapshot: Snapshot | None, ) -> None: partition_record_key = _convert_to_hashable_type(partition_record_dict) @@ -368,7 +369,7 @@ def _update_partitions_map_from_manifest_entry( else: raise ValueError(f"Unknown DataFileContent ({file.content})") - def _get_manifests_schema(self) -> "pa.Schema": + def _get_manifests_schema(self) -> pa.Schema: import pyarrow as pa partition_summary_schema = pa.struct( @@ -398,19 +399,19 @@ def _get_manifests_schema(self) -> "pa.Schema": ) return manifest_schema - def _get_all_manifests_schema(self) -> "pa.Schema": + def _get_all_manifests_schema(self) -> pa.Schema: import pyarrow as pa all_manifests_schema = self._get_manifests_schema() all_manifests_schema = all_manifests_schema.append(pa.field("reference_snapshot_id", pa.int64(), nullable=False)) return all_manifests_schema - def _generate_manifests_table(self, snapshot: Snapshot | None, is_all_manifests_table: bool = False) -> "pa.Table": + def _generate_manifests_table(self, snapshot: Snapshot | None, is_all_manifests_table: bool = False) -> pa.Table: import pyarrow as pa def _partition_summaries_to_rows( - spec: PartitionSpec, partition_summaries: List[PartitionFieldSummary] - ) -> List[Dict[str, Any]]: + spec: PartitionSpec, partition_summaries: list[PartitionFieldSummary] + ) -> list[dict[str, Any]]: rows = [] for i, field_summary in enumerate(partition_summaries): field = spec.fields[i] @@ -474,10 +475,10 @@ def _partition_summaries_to_rows( schema=self._get_all_manifests_schema() if is_all_manifests_table else self._get_manifests_schema(), ) - def manifests(self) -> "pa.Table": + def manifests(self) -> pa.Table: return self._generate_manifests_table(self.tbl.current_snapshot()) - def metadata_log_entries(self) -> "pa.Table": + def metadata_log_entries(self) -> pa.Table: import pyarrow as pa from 
pyiceberg.table.snapshots import MetadataLogEntry @@ -492,7 +493,7 @@ def metadata_log_entries(self) -> "pa.Table": ] ) - def metadata_log_entry_to_row(metadata_entry: MetadataLogEntry) -> Dict[str, Any]: + def metadata_log_entry_to_row(metadata_entry: MetadataLogEntry) -> dict[str, Any]: latest_snapshot = self.tbl.snapshot_as_of_timestamp(metadata_entry.timestamp_ms) return { "timestamp": metadata_entry.timestamp_ms, @@ -513,7 +514,7 @@ def metadata_log_entry_to_row(metadata_entry: MetadataLogEntry) -> Dict[str, Any schema=table_schema, ) - def history(self) -> "pa.Table": + def history(self) -> pa.Table: import pyarrow as pa history_schema = pa.schema( @@ -545,8 +546,8 @@ def history(self) -> "pa.Table": return pa.Table.from_pylist(history, schema=history_schema) def _get_files_from_manifest( - self, manifest_list: ManifestFile, data_file_filter: Set[DataFileContent] | None = None - ) -> "pa.Table": + self, manifest_list: ManifestFile, data_file_filter: set[DataFileContent] | None = None + ) -> pa.Table: import pyarrow as pa files: list[dict[str, Any]] = [] @@ -610,7 +611,7 @@ def _get_files_from_manifest( schema=self._get_files_schema(), ) - def _get_files_schema(self) -> "pa.Schema": + def _get_files_schema(self) -> pa.Schema: import pyarrow as pa from pyiceberg.io.pyarrow import schema_to_pyarrow @@ -663,7 +664,7 @@ def _readable_metrics_struct(bound_type: PrimitiveType) -> pa.StructType: ) return files_schema - def _files(self, snapshot_id: int | None = None, data_file_filter: Set[DataFileContent] | None = None) -> "pa.Table": + def _files(self, snapshot_id: int | None = None, data_file_filter: set[DataFileContent] | None = None) -> pa.Table: import pyarrow as pa if not snapshot_id and not self.tbl.metadata.current_snapshot(): @@ -680,16 +681,16 @@ def _files(self, snapshot_id: int | None = None, data_file_filter: Set[DataFileC ) return pa.concat_tables(results) - def files(self, snapshot_id: int | None = None) -> "pa.Table": + def files(self, snapshot_id: 
int | None = None) -> pa.Table: return self._files(snapshot_id) - def data_files(self, snapshot_id: int | None = None) -> "pa.Table": + def data_files(self, snapshot_id: int | None = None) -> pa.Table: return self._files(snapshot_id, {DataFileContent.DATA}) - def delete_files(self, snapshot_id: int | None = None) -> "pa.Table": + def delete_files(self, snapshot_id: int | None = None) -> pa.Table: return self._files(snapshot_id, {DataFileContent.POSITION_DELETES, DataFileContent.EQUALITY_DELETES}) - def all_manifests(self) -> "pa.Table": + def all_manifests(self) -> pa.Table: import pyarrow as pa snapshots = self.tbl.snapshots() @@ -697,12 +698,12 @@ def all_manifests(self) -> "pa.Table": return pa.Table.from_pylist([], schema=self._get_all_manifests_schema()) executor = ExecutorFactory.get_or_create() - manifests_by_snapshots: Iterator["pa.Table"] = executor.map( + manifests_by_snapshots: Iterator[pa.Table] = executor.map( lambda args: self._generate_manifests_table(*args), [(snapshot, True) for snapshot in snapshots] ) return pa.concat_tables(manifests_by_snapshots) - def _all_files(self, data_file_filter: Set[DataFileContent] | None = None) -> "pa.Table": + def _all_files(self, data_file_filter: set[DataFileContent] | None = None) -> pa.Table: import pyarrow as pa snapshots = self.tbl.snapshots() @@ -720,11 +721,11 @@ def _all_files(self, data_file_filter: Set[DataFileContent] | None = None) -> "p return pa.concat_tables(file_lists) - def all_files(self) -> "pa.Table": + def all_files(self) -> pa.Table: return self._all_files() - def all_data_files(self) -> "pa.Table": + def all_data_files(self) -> pa.Table: return self._all_files({DataFileContent.DATA}) - def all_delete_files(self) -> "pa.Table": + def all_delete_files(self) -> pa.Table: return self._all_files({DataFileContent.POSITION_DELETES, DataFileContent.EQUALITY_DELETES}) diff --git a/pyiceberg/table/metadata.py b/pyiceberg/table/metadata.py index 201aaee511..8ae930375a 100644 --- 
a/pyiceberg/table/metadata.py +++ b/pyiceberg/table/metadata.py @@ -19,7 +19,7 @@ import datetime import uuid from copy import copy -from typing import Annotated, Any, Dict, List, Literal +from typing import Annotated, Any, Literal from pydantic import Field, field_serializer, field_validator, model_validator from pydantic import ValidationError as PydanticValidationError @@ -68,7 +68,7 @@ SUPPORTED_TABLE_FORMAT_VERSION = 2 -def cleanup_snapshot_id(data: Dict[str, Any]) -> Dict[str, Any]: +def cleanup_snapshot_id(data: dict[str, Any]) -> dict[str, Any]: """Run before validation.""" if CURRENT_SNAPSHOT_ID in data and data[CURRENT_SNAPSHOT_ID] == -1: # We treat -1 and None the same, by cleaning this up @@ -92,7 +92,7 @@ def check_partition_specs(table_metadata: TableMetadata) -> TableMetadata: """Check if the default-spec-id is present in partition-specs.""" default_spec_id = table_metadata.default_spec_id - partition_specs: List[PartitionSpec] = table_metadata.partition_specs + partition_specs: list[PartitionSpec] = table_metadata.partition_specs for spec in partition_specs: if spec.spec_id == default_spec_id: return table_metadata @@ -105,7 +105,7 @@ def check_sort_orders(table_metadata: TableMetadata) -> TableMetadata: default_sort_order_id: int = table_metadata.default_sort_order_id if default_sort_order_id != UNSORTED_SORT_ORDER_ID: - sort_orders: List[SortOrder] = table_metadata.sort_orders + sort_orders: list[SortOrder] = table_metadata.sort_orders for sort_order in sort_orders: if sort_order.order_id == default_sort_order_id: return table_metadata @@ -151,13 +151,13 @@ class TableMetadataCommonFields(IcebergBaseModel): This is used to ensure fields are always assigned an unused ID when evolving schemas.""" - schemas: List[Schema] = Field(default_factory=list) + schemas: list[Schema] = Field(default_factory=list) """A list of schemas, stored as objects with schema-id.""" current_schema_id: int = Field(alias="current-schema-id", default=DEFAULT_SCHEMA_ID) """ID 
of the table’s current schema.""" - partition_specs: List[PartitionSpec] = Field(alias="partition-specs", default_factory=list) + partition_specs: list[PartitionSpec] = Field(alias="partition-specs", default_factory=list) """A list of partition specs, stored as full partition spec objects.""" default_spec_id: int = Field(alias="default-spec-id", default=INITIAL_SPEC_ID) @@ -168,7 +168,7 @@ class TableMetadataCommonFields(IcebergBaseModel): partition specs for the table. This is used to ensure partition fields are always assigned an unused ID when evolving specs.""" - properties: Dict[str, str] = Field(default_factory=dict) + properties: dict[str, str] = Field(default_factory=dict) """A string to string map of table properties. This is used to control settings that affect reading and writing and is not intended to be used for arbitrary metadata. For example, commit.retry.num-retries @@ -177,13 +177,13 @@ class TableMetadataCommonFields(IcebergBaseModel): current_snapshot_id: int | None = Field(alias="current-snapshot-id", default=None) """ID of the current table snapshot.""" - snapshots: List[Snapshot] = Field(default_factory=list) + snapshots: list[Snapshot] = Field(default_factory=list) """A list of valid snapshots. Valid snapshots are snapshots for which all data files exist in the file system. A data file must not be deleted from the file system until the last snapshot in which it was listed is garbage collected.""" - snapshot_log: List[SnapshotLogEntry] = Field(alias="snapshot-log", default_factory=list) + snapshot_log: list[SnapshotLogEntry] = Field(alias="snapshot-log", default_factory=list) """A list (optional) of timestamp and snapshot ID pairs that encodes changes to the current snapshot for the table. 
Each time the current-snapshot-id is changed, a new entry should be added with the @@ -191,7 +191,7 @@ class TableMetadataCommonFields(IcebergBaseModel): expired from the list of valid snapshots, all entries before a snapshot that has expired should be removed.""" - metadata_log: List[MetadataLogEntry] = Field(alias="metadata-log", default_factory=list) + metadata_log: list[MetadataLogEntry] = Field(alias="metadata-log", default_factory=list) """A list (optional) of timestamp and metadata file location pairs that encodes changes to the previous metadata files for the table. Each time a new metadata file is created, a new entry of the previous metadata @@ -199,7 +199,7 @@ class TableMetadataCommonFields(IcebergBaseModel): remove oldest metadata log entries and keep a fixed-size log of the most recent entries after a commit.""" - sort_orders: List[SortOrder] = Field(alias="sort-orders", default_factory=list) + sort_orders: list[SortOrder] = Field(alias="sort-orders", default_factory=list) """A list of sort orders, stored as full sort order objects.""" default_sort_order_id: int = Field(alias="default-sort-order-id", default=UNSORTED_SORT_ORDER_ID) @@ -207,14 +207,14 @@ class TableMetadataCommonFields(IcebergBaseModel): writers, but is not used when reading because reads use the specs stored in manifest files.""" - refs: Dict[str, SnapshotRef] = Field(default_factory=dict) + refs: dict[str, SnapshotRef] = Field(default_factory=dict) """A map of snapshot references. The map keys are the unique snapshot reference names in the table, and the map values are snapshot reference objects. There is always a main branch reference pointing to the current-snapshot-id even if the refs map is null.""" - statistics: List[StatisticsFile] = Field(default_factory=list) + statistics: list[StatisticsFile] = Field(default_factory=list) """A optional list of table statistics files. Table statistics files are valid Puffin files. Statistics are informational. 
A reader can choose to ignore statistics @@ -222,7 +222,7 @@ class TableMetadataCommonFields(IcebergBaseModel): table correctly. A table can contain many statistics files associated with different table snapshots.""" - partition_statistics: List[PartitionStatisticsFile] = Field(alias="partition-statistics", default_factory=list) + partition_statistics: list[PartitionStatisticsFile] = Field(alias="partition-statistics", default_factory=list) """A optional list of partition statistics files. Partition statistics are not required for reading or planning and readers may ignore them. Each table snapshot may be associated @@ -232,7 +232,7 @@ class TableMetadataCommonFields(IcebergBaseModel): # validators @field_validator("properties", mode="before") - def transform_properties_dict_value_to_str(cls, properties: Properties) -> Dict[str, str]: + def transform_properties_dict_value_to_str(cls, properties: Properties) -> dict[str, str]: return transform_dict_value_to_str(properties) def snapshot_by_id(self, snapshot_id: int) -> Snapshot | None: @@ -258,7 +258,7 @@ def spec(self) -> PartitionSpec: """Return the partition spec of this table.""" return next(spec for spec in self.partition_specs if spec.spec_id == self.default_spec_id) - def specs(self) -> Dict[int, PartitionSpec]: + def specs(self) -> dict[int, PartitionSpec]: """Return a dict the partition specs this table.""" return {spec.spec_id: spec for spec in self.partition_specs} @@ -323,7 +323,7 @@ def serialize_current_snapshot_id(self, current_snapshot_id: int | None) -> int return current_snapshot_id @field_serializer("snapshots") - def serialize_snapshots(self, snapshots: List[Snapshot]) -> List[Snapshot]: + def serialize_snapshots(self, snapshots: list[Snapshot]) -> list[Snapshot]: # Snapshot field `sequence-number` should not be written for v1 metadata if self.format_version == 1: return [snapshot.model_copy(update={"sequence_number": None}) for snapshot in snapshots] @@ -361,7 +361,7 @@ class 
TableMetadataV1(TableMetadataCommonFields, IcebergBaseModel): # to the owner of the table. @model_validator(mode="before") - def cleanup_snapshot_id(cls, data: Dict[str, Any]) -> Dict[str, Any]: + def cleanup_snapshot_id(cls, data: dict[str, Any]) -> dict[str, Any]: return cleanup_snapshot_id(data) @model_validator(mode="after") @@ -369,7 +369,7 @@ def construct_refs(self) -> TableMetadataV1: return construct_refs(self) @model_validator(mode="before") - def set_v2_compatible_defaults(cls, data: Dict[str, Any]) -> Dict[str, Any]: + def set_v2_compatible_defaults(cls, data: dict[str, Any]) -> dict[str, Any]: """Set default values to be compatible with the format v2. Args: @@ -387,7 +387,7 @@ def set_v2_compatible_defaults(cls, data: Dict[str, Any]) -> Dict[str, Any]: return data @model_validator(mode="before") - def construct_schemas(cls, data: Dict[str, Any]) -> Dict[str, Any]: + def construct_schemas(cls, data: dict[str, Any]) -> dict[str, Any]: """Convert the schema into schemas. For V1 schemas is optional, and if they aren't set, we'll set them @@ -406,7 +406,7 @@ def construct_schemas(cls, data: Dict[str, Any]) -> Dict[str, Any]: return data @model_validator(mode="before") - def construct_partition_specs(cls, data: Dict[str, Any]) -> Dict[str, Any]: + def construct_partition_specs(cls, data: dict[str, Any]) -> dict[str, Any]: """Convert the partition_spec into partition_specs. For V1 partition_specs is optional, and if they aren't set, we'll set them @@ -441,7 +441,7 @@ def construct_partition_specs(cls, data: Dict[str, Any]) -> Dict[str, Any]: return data @model_validator(mode="before") - def set_sort_orders(cls, data: Dict[str, Any]) -> Dict[str, Any]: + def set_sort_orders(cls, data: dict[str, Any]) -> dict[str, Any]: """Set the sort_orders if not provided. For V1 sort_orders is optional, and if they aren't set, we'll set them @@ -470,7 +470,7 @@ def to_v2(self) -> TableMetadataV2: """The table’s current schema. 
(Deprecated: use schemas and current-schema-id instead).""" - partition_spec: List[Dict[str, Any]] = Field(alias="partition-spec", default_factory=list) + partition_spec: list[dict[str, Any]] = Field(alias="partition-spec", default_factory=list) """The table’s current partition spec, stored as only fields. Note that this is used by writers to partition data, but is not used when reading because reads use the specs stored in @@ -490,7 +490,7 @@ class TableMetadataV2(TableMetadataCommonFields, IcebergBaseModel): """ @model_validator(mode="before") - def cleanup_snapshot_id(cls, data: Dict[str, Any]) -> Dict[str, Any]: + def cleanup_snapshot_id(cls, data: dict[str, Any]) -> dict[str, Any]: return cleanup_snapshot_id(data) @model_validator(mode="after") @@ -534,7 +534,7 @@ class TableMetadataV3(TableMetadataCommonFields, IcebergBaseModel): """ @model_validator(mode="before") - def cleanup_snapshot_id(cls, data: Dict[str, Any]) -> Dict[str, Any]: + def cleanup_snapshot_id(cls, data: dict[str, Any]) -> dict[str, Any]: return cleanup_snapshot_id(data) @model_validator(mode="after") @@ -655,7 +655,7 @@ def parse_raw(data: str) -> TableMetadata: raise ValidationError(e) from e @staticmethod - def parse_obj(data: Dict[str, Any]) -> TableMetadata: + def parse_obj(data: dict[str, Any]) -> TableMetadata: if "format-version" not in data: raise ValidationError(f"Missing format-version in TableMetadata: {data}") format_version = data["format-version"] diff --git a/pyiceberg/table/name_mapping.py b/pyiceberg/table/name_mapping.py index 1216daa2a4..1cf6bc0106 100644 --- a/pyiceberg/table/name_mapping.py +++ b/pyiceberg/table/name_mapping.py @@ -23,10 +23,12 @@ from __future__ import annotations +import builtins from abc import ABC, abstractmethod from collections import ChainMap +from collections.abc import Iterator from functools import cached_property, singledispatch -from typing import Any, Dict, Generic, Iterator, List, TypeVar +from typing import Any, Generic, TypeVar from 
pydantic import Field, conlist, field_validator, model_serializer @@ -37,8 +39,8 @@ class MappedField(IcebergBaseModel): field_id: int | None = Field(alias="field-id", default=None) - names: List[str] = conlist(str) - fields: List[MappedField] = Field(default_factory=list) + names: list[str] = conlist(str) + fields: list[MappedField] = Field(default_factory=list) @field_validator("fields", mode="before") @classmethod @@ -46,9 +48,9 @@ def convert_null_to_empty_List(cls, v: Any) -> Any: return v or [] @model_serializer - def ser_model(self) -> Dict[str, Any]: + def ser_model(self) -> dict[str, Any]: """Set custom serializer to leave out the field when it is empty.""" - serialized: Dict[str, Any] = {"names": self.names} + serialized: dict[str, Any] = {"names": self.names} if self.field_id is not None: serialized["field-id"] = self.field_id if len(self.fields) > 0: @@ -68,11 +70,11 @@ def __str__(self) -> str: return "([" + ", ".join(self.names) + "] -> " + field_id + fields_str + ")" -class NameMapping(IcebergRootModel[List[MappedField]]): - root: List[MappedField] +class NameMapping(IcebergRootModel[list[MappedField]]): + root: list[MappedField] @cached_property - def _field_by_name(self) -> Dict[str, MappedField]: + def _field_by_name(self) -> dict[str, MappedField]: return visit_name_mapping(self, _IndexByName()) def __len__(self) -> int: @@ -101,7 +103,7 @@ def mapping(self, nm: NameMapping, field_results: S) -> S: """Visit a NameMapping.""" @abstractmethod - def fields(self, struct: List[MappedField], field_results: List[T]) -> S: + def fields(self, struct: list[MappedField], field_results: list[T]) -> S: """Visit a List[MappedField].""" @abstractmethod @@ -109,15 +111,15 @@ def field(self, field: MappedField, field_result: S) -> T: """Visit a MappedField.""" -class _IndexByName(NameMappingVisitor[Dict[str, MappedField], Dict[str, MappedField]]): - def mapping(self, nm: NameMapping, field_results: Dict[str, MappedField]) -> Dict[str, MappedField]: +class 
_IndexByName(NameMappingVisitor[dict[str, MappedField], dict[str, MappedField]]): + def mapping(self, nm: NameMapping, field_results: dict[str, MappedField]) -> dict[str, MappedField]: return field_results - def fields(self, struct: List[MappedField], field_results: List[Dict[str, MappedField]]) -> Dict[str, MappedField]: + def fields(self, struct: list[MappedField], field_results: list[dict[str, MappedField]]) -> dict[str, MappedField]: return dict(ChainMap(*field_results)) - def field(self, field: MappedField, field_result: Dict[str, MappedField]) -> Dict[str, MappedField]: - result: Dict[str, MappedField] = { + def field(self, field: MappedField, field_result: dict[str, MappedField]) -> dict[str, MappedField]: + result: dict[str, MappedField] = { f"{field_name}.{key}": result_field for key, result_field in field_result.items() for field_name in field.names } @@ -128,7 +130,7 @@ def field(self, field: MappedField, field_result: Dict[str, MappedField]) -> Dic @singledispatch -def visit_name_mapping(obj: NameMapping | List[MappedField] | MappedField, visitor: NameMappingVisitor[S, T]) -> S: +def visit_name_mapping(obj: NameMapping | list[MappedField] | MappedField, visitor: NameMappingVisitor[S, T]) -> S: """Traverse the name mapping in post-order traversal.""" raise NotImplementedError(f"Cannot visit non-type: {obj}") @@ -139,7 +141,7 @@ def _(obj: NameMapping, visitor: NameMappingVisitor[S, T]) -> S: @visit_name_mapping.register(list) -def _(fields: List[MappedField], visitor: NameMappingVisitor[S, T]) -> S: +def _(fields: list[MappedField], visitor: NameMappingVisitor[S, T]) -> S: results = [visitor.field(field, visit_name_mapping(field.fields, visitor)) for field in fields] return visitor.fields(fields, results) @@ -148,42 +150,44 @@ def parse_mapping_from_json(mapping: str) -> NameMapping: return NameMapping.model_validate_json(mapping) -class _CreateMapping(SchemaVisitor[List[MappedField]]): - def schema(self, schema: Schema, struct_result: List[MappedField]) 
-> List[MappedField]: +class _CreateMapping(SchemaVisitor[list[MappedField]]): + def schema(self, schema: Schema, struct_result: builtins.list[MappedField]) -> builtins.list[MappedField]: return struct_result - def struct(self, struct: StructType, field_results: List[List[MappedField]]) -> List[MappedField]: + def struct(self, struct: StructType, field_results: builtins.list[builtins.list[MappedField]]) -> builtins.list[MappedField]: return [ MappedField(field_id=field.field_id, names=[field.name], fields=result) for field, result in zip(struct.fields, field_results, strict=True) ] - def field(self, field: NestedField, field_result: List[MappedField]) -> List[MappedField]: + def field(self, field: NestedField, field_result: builtins.list[MappedField]) -> builtins.list[MappedField]: return field_result - def list(self, list_type: ListType, element_result: List[MappedField]) -> List[MappedField]: + def list(self, list_type: ListType, element_result: builtins.list[MappedField]) -> builtins.list[MappedField]: return [MappedField(field_id=list_type.element_id, names=["element"], fields=element_result)] - def map(self, map_type: MapType, key_result: List[MappedField], value_result: List[MappedField]) -> List[MappedField]: + def map( + self, map_type: MapType, key_result: builtins.list[MappedField], value_result: builtins.list[MappedField] + ) -> builtins.list[MappedField]: return [ MappedField(field_id=map_type.key_id, names=["key"], fields=key_result), MappedField(field_id=map_type.value_id, names=["value"], fields=value_result), ] - def primitive(self, primitive: PrimitiveType) -> List[MappedField]: + def primitive(self, primitive: PrimitiveType) -> builtins.list[MappedField]: return [] -class _UpdateMapping(NameMappingVisitor[List[MappedField], MappedField]): - _updates: Dict[int, NestedField] - _adds: Dict[int, List[NestedField]] +class _UpdateMapping(NameMappingVisitor[list[MappedField], MappedField]): + _updates: dict[int, NestedField] + _adds: dict[int, 
list[NestedField]] - def __init__(self, updates: Dict[int, NestedField], adds: Dict[int, List[NestedField]]): + def __init__(self, updates: dict[int, NestedField], adds: dict[int, list[NestedField]]): self._updates = updates self._adds = adds @staticmethod - def _remove_reassigned_names(field: MappedField, assignments: Dict[str, int]) -> MappedField | None: + def _remove_reassigned_names(field: MappedField, assignments: dict[str, int]) -> MappedField | None: removed_names = set() for name in field.names: if (assigned_id := assignments.get(name)) and assigned_id != field.field_id: @@ -195,10 +199,10 @@ def _remove_reassigned_names(field: MappedField, assignments: Dict[str, int]) -> else: return None - def _add_new_fields(self, mapped_fields: List[MappedField], parent_id: int) -> List[MappedField]: + def _add_new_fields(self, mapped_fields: list[MappedField], parent_id: int) -> list[MappedField]: if fields_to_add := self._adds.get(parent_id): - fields: List[MappedField] = [] - new_fields: List[MappedField] = [] + fields: list[MappedField] = [] + new_fields: list[MappedField] = [] for add in fields_to_add: new_fields.append( @@ -215,11 +219,11 @@ def _add_new_fields(self, mapped_fields: List[MappedField], parent_id: int) -> L else: return mapped_fields - def mapping(self, nm: NameMapping, field_results: List[MappedField]) -> List[MappedField]: + def mapping(self, nm: NameMapping, field_results: list[MappedField]) -> list[MappedField]: return self._add_new_fields(field_results, -1) - def fields(self, struct: List[MappedField], field_results: List[MappedField]) -> List[MappedField]: - reassignments: Dict[str, int] = { + def fields(self, struct: list[MappedField], field_results: list[MappedField]) -> list[MappedField]: + reassignments: dict[str, int] = { update.name: update.field_id for f in field_results if f.field_id is not None and (update := self._updates.get(f.field_id)) @@ -230,7 +234,7 @@ def fields(self, struct: List[MappedField], field_results: 
List[MappedField]) -> if (updated_field := self._remove_reassigned_names(field, reassignments)) is not None ] - def field(self, field: MappedField, field_result: List[MappedField]) -> MappedField: + def field(self, field: MappedField, field_result: list[MappedField]) -> MappedField: if field.field_id is None: return field field_names = field.names @@ -244,7 +248,7 @@ def create_mapping_from_schema(schema: Schema) -> NameMapping: return NameMapping(visit(schema, _CreateMapping())) -def update_mapping(mapping: NameMapping, updates: Dict[int, NestedField], adds: Dict[int, List[NestedField]]) -> NameMapping: +def update_mapping(mapping: NameMapping, updates: dict[int, NestedField], adds: dict[int, list[NestedField]]) -> NameMapping: return NameMapping(visit_name_mapping(mapping, _UpdateMapping(updates, adds))) @@ -253,7 +257,7 @@ def schema_partner(self, partner: MappedField | None) -> MappedField | None: return partner def field_partner( - self, partner_struct: List[MappedField] | MappedField | None, _: int, field_name: str + self, partner_struct: list[MappedField] | MappedField | None, _: int, field_name: str ) -> MappedField | None: if partner_struct is not None: if isinstance(partner_struct, MappedField): @@ -288,7 +292,7 @@ def map_value_partner(self, partner_map: MappedField | None) -> MappedField | No class NameMappingProjectionVisitor(SchemaWithPartnerVisitor[MappedField, IcebergType]): - current_path: List[str] + current_path: builtins.list[str] def __init__(self) -> None: # For keeping track where we are in case when a field cannot be found @@ -321,7 +325,9 @@ def after_map_value(self, value: NestedField, value_partner: P | None) -> None: def schema(self, schema: Schema, schema_partner: MappedField | None, struct_result: StructType) -> IcebergType: return Schema(*struct_result.fields, schema_id=schema.schema_id) - def struct(self, struct: StructType, struct_partner: MappedField | None, field_results: List[NestedField]) -> IcebergType: + def struct( + self, 
struct: StructType, struct_partner: MappedField | None, field_results: builtins.list[NestedField] + ) -> IcebergType: return StructType(*field_results) def field(self, field: NestedField, field_partner: MappedField | None, field_result: IcebergType) -> IcebergType: diff --git a/pyiceberg/table/puffin.py b/pyiceberg/table/puffin.py index 326fe3e37a..917d387f45 100644 --- a/pyiceberg/table/puffin.py +++ b/pyiceberg/table/puffin.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. import math -from typing import TYPE_CHECKING, Dict, List, Literal +from typing import TYPE_CHECKING, Literal from pydantic import Field from pyroaring import BitMap, FrozenBitMap @@ -32,7 +32,7 @@ PROPERTY_REFERENCED_DATA_FILE = "referenced-data-file" -def _deserialize_bitmap(pl: bytes) -> List[BitMap]: +def _deserialize_bitmap(pl: bytes) -> list[BitMap]: number_of_bitmaps = int.from_bytes(pl[0:8], byteorder="little") pl = pl[8:] @@ -64,21 +64,21 @@ def _deserialize_bitmap(pl: bytes) -> List[BitMap]: class PuffinBlobMetadata(IcebergBaseModel): type: Literal["deletion-vector-v1"] = Field() - fields: List[int] = Field() + fields: list[int] = Field() snapshot_id: int = Field(alias="snapshot-id") sequence_number: int = Field(alias="sequence-number") offset: int = Field() length: int = Field() compression_codec: str | None = Field(alias="compression-codec", default=None) - properties: Dict[str, str] = Field(default_factory=dict) + properties: dict[str, str] = Field(default_factory=dict) class Footer(IcebergBaseModel): - blobs: List[PuffinBlobMetadata] = Field() - properties: Dict[str, str] = Field(default_factory=dict) + blobs: list[PuffinBlobMetadata] = Field() + properties: dict[str, str] = Field(default_factory=dict) -def _bitmaps_to_chunked_array(bitmaps: List[BitMap]) -> "pa.ChunkedArray": +def _bitmaps_to_chunked_array(bitmaps: list[BitMap]) -> "pa.ChunkedArray": import pyarrow as pa return pa.chunked_array([(key_pos << 32) + pos for pos in 
bitmap] for key_pos, bitmap in enumerate(bitmaps)) @@ -86,7 +86,7 @@ def _bitmaps_to_chunked_array(bitmaps: List[BitMap]) -> "pa.ChunkedArray": class PuffinFile: footer: Footer - _deletion_vectors: Dict[str, List[BitMap]] + _deletion_vectors: dict[str, list[BitMap]] def __init__(self, puffin: bytes) -> None: for magic_bytes in [puffin[:4], puffin[-4:]]: @@ -112,5 +112,5 @@ def __init__(self, puffin: bytes) -> None: for blob in self.footer.blobs } - def to_vector(self) -> Dict[str, "pa.ChunkedArray"]: + def to_vector(self) -> dict[str, "pa.ChunkedArray"]: return {path: _bitmaps_to_chunked_array(bitmaps) for path, bitmaps in self._deletion_vectors.items()} diff --git a/pyiceberg/table/snapshots.py b/pyiceberg/table/snapshots.py index 14b5fa833c..4ef1645df6 100644 --- a/pyiceberg/table/snapshots.py +++ b/pyiceberg/table/snapshots.py @@ -19,8 +19,9 @@ import time import warnings from collections import defaultdict +from collections.abc import Iterable, Mapping from enum import Enum -from typing import TYPE_CHECKING, Any, DefaultDict, Dict, Iterable, List, Mapping +from typing import TYPE_CHECKING, Any from pydantic import Field, PrivateAttr, model_serializer @@ -154,8 +155,8 @@ def remove_file(self, data_file: DataFile) -> None: else: raise ValueError(f"Unknown data file content: {data_file.content}") - def to_dict(self) -> Dict[str, str]: - properties: Dict[str, str] = {} + def to_dict(self) -> dict[str, str]: + properties: dict[str, str] = {} set_when_positive(properties, self.added_file_size, ADDED_FILE_SIZE) set_when_positive(properties, self.removed_file_size, REMOVED_FILE_SIZE) set_when_positive(properties, self.added_data_files, ADDED_DATA_FILES) @@ -183,11 +184,11 @@ class Summary(IcebergBaseModel, Mapping[str, str]): """ operation: Operation = Field() - _additional_properties: Dict[str, str] = PrivateAttr() + _additional_properties: dict[str, str] = PrivateAttr() def __init__(self, operation: Operation | None = None, **data: Any) -> None: if operation is None: 
- warnings.warn("Encountered invalid snapshot summary: operation is missing, defaulting to overwrite") + warnings.warn("Encountered invalid snapshot summary: operation is missing, defaulting to overwrite", stacklevel=2) operation = Operation.OVERWRITE super().__init__(operation=operation, **data) self._additional_properties = data @@ -212,14 +213,14 @@ def __len__(self) -> int: return 1 + len(self._additional_properties) @model_serializer - def ser_model(self) -> Dict[str, str]: + def ser_model(self) -> dict[str, str]: return { "operation": str(self.operation.value), **self._additional_properties, } @property - def additional_properties(self) -> Dict[str, str]: + def additional_properties(self) -> dict[str, str]: return self._additional_properties def __repr__(self) -> str: @@ -275,7 +276,7 @@ def __repr__(self) -> str: filtered_fields = [field for field in fields if field is not None] return f"Snapshot({', '.join(filtered_fields)})" - def manifests(self, io: FileIO) -> List[ManifestFile]: + def manifests(self, io: FileIO) -> list[ManifestFile]: """Return the manifests for the given snapshot.""" return list(_manifests(io, self.manifest_list)) @@ -292,7 +293,7 @@ class SnapshotLogEntry(IcebergBaseModel): class SnapshotSummaryCollector: metrics: UpdateMetrics - partition_metrics: DefaultDict[str, UpdateMetrics] + partition_metrics: defaultdict[str, UpdateMetrics] max_changed_partitions_for_summaries: int def __init__(self, partition_summary_limit: int = 0) -> None: @@ -324,7 +325,7 @@ def update_partition_metrics(self, partition_spec: PartitionSpec, file: DataFile else: partition_metrics.remove_file(file) - def build(self) -> Dict[str, str]: + def build(self) -> dict[str, str]: properties = self.metrics.to_dict() changed_partitions_size = len(self.partition_metrics) set_when_positive(properties, changed_partitions_size, CHANGED_PARTITION_COUNT_PROP) @@ -447,7 +448,7 @@ def _update_totals(total_property: str, added_property: str, removed_property: s return summary 
-def set_when_positive(properties: Dict[str, str], num: int, property_name: str) -> None: +def set_when_positive(properties: dict[str, str], num: int, property_name: str) -> None: if num > 0: properties[property_name] = str(num) diff --git a/pyiceberg/table/sorting.py b/pyiceberg/table/sorting.py index 8bd9a08176..5243d7b184 100644 --- a/pyiceberg/table/sorting.py +++ b/pyiceberg/table/sorting.py @@ -15,8 +15,9 @@ # specific language governing permissions and limitations # under the License. # pylint: disable=keyword-arg-before-vararg +from collections.abc import Callable from enum import Enum -from typing import Annotated, Any, Callable, Dict, List +from typing import Annotated, Any from pydantic import ( BeforeValidator, @@ -88,7 +89,7 @@ def __init__( super().__init__(**data) @model_validator(mode="before") - def set_null_order(cls, values: Dict[str, Any]) -> Dict[str, Any]: + def set_null_order(cls, values: dict[str, Any]) -> dict[str, Any]: values["direction"] = values["direction"] if values.get("direction") else SortDirection.ASC if not values.get("null-order"): values["null-order"] = NullOrder.NULLS_FIRST if values["direction"] == SortDirection.ASC else NullOrder.NULLS_LAST @@ -144,7 +145,7 @@ class SortOrder(IcebergBaseModel): """ order_id: int = Field(alias="order-id", default=INITIAL_SORT_ORDER_ID) - fields: List[SortField] = Field(default_factory=list) + fields: list[SortField] = Field(default_factory=list) def __init__(self, *fields: SortField, **data: Any): if fields: diff --git a/pyiceberg/table/statistics.py b/pyiceberg/table/statistics.py index 25654d0c27..34185b980a 100644 --- a/pyiceberg/table/statistics.py +++ b/pyiceberg/table/statistics.py @@ -14,7 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
-from typing import Dict, List, Literal +from typing import Literal from pydantic import Field @@ -25,8 +25,8 @@ class BlobMetadata(IcebergBaseModel): type: Literal["apache-datasketches-theta-v1", "deletion-vector-v1"] snapshot_id: int = Field(alias="snapshot-id") sequence_number: int = Field(alias="sequence-number") - fields: List[int] - properties: Dict[str, str] | None = None + fields: list[int] + properties: dict[str, str] | None = None class StatisticsCommonFields(IcebergBaseModel): @@ -40,7 +40,7 @@ class StatisticsCommonFields(IcebergBaseModel): class StatisticsFile(StatisticsCommonFields): file_footer_size_in_bytes: int = Field(alias="file-footer-size-in-bytes") key_metadata: str | None = Field(alias="key-metadata", default=None) - blob_metadata: List[BlobMetadata] = Field(alias="blob-metadata") + blob_metadata: list[BlobMetadata] = Field(alias="blob-metadata") class PartitionStatisticsFile(StatisticsCommonFields): @@ -48,7 +48,7 @@ class PartitionStatisticsFile(StatisticsCommonFields): def filter_statistics_by_snapshot_id( - statistics: List[StatisticsFile | PartitionStatisticsFile], + statistics: list[StatisticsFile | PartitionStatisticsFile], reject_snapshot_id: int, -) -> List[StatisticsFile | PartitionStatisticsFile]: +) -> list[StatisticsFile | PartitionStatisticsFile]: return [stat for stat in statistics if stat.snapshot_id != reject_snapshot_id] diff --git a/pyiceberg/table/update/__init__.py b/pyiceberg/table/update/__init__.py index 6533719b05..a79e2cb468 100644 --- a/pyiceberg/table/update/__init__.py +++ b/pyiceberg/table/update/__init__.py @@ -21,7 +21,7 @@ from abc import ABC, abstractmethod from datetime import datetime from functools import singledispatch -from typing import TYPE_CHECKING, Annotated, Any, Dict, Generic, List, Literal, Tuple, TypeVar, cast +from typing import TYPE_CHECKING, Annotated, Any, Generic, Literal, TypeVar, cast from pydantic import Field, field_validator, model_serializer, model_validator @@ -143,7 +143,7 @@ class 
SetSnapshotRefUpdate(IcebergBaseModel): class RemoveSnapshotsUpdate(IcebergBaseModel): action: Literal["remove-snapshots"] = Field(default="remove-snapshots") - snapshot_ids: List[int] = Field(alias="snapshot-ids") + snapshot_ids: list[int] = Field(alias="snapshot-ids") class RemoveSnapshotRefUpdate(IcebergBaseModel): @@ -158,16 +158,16 @@ class SetLocationUpdate(IcebergBaseModel): class SetPropertiesUpdate(IcebergBaseModel): action: Literal["set-properties"] = Field(default="set-properties") - updates: Dict[str, str] + updates: dict[str, str] @field_validator("updates", mode="before") - def transform_properties_dict_value_to_str(cls, properties: Properties) -> Dict[str, str]: + def transform_properties_dict_value_to_str(cls, properties: Properties) -> dict[str, str]: return transform_dict_value_to_str(properties) class RemovePropertiesUpdate(IcebergBaseModel): action: Literal["remove-properties"] = Field(default="remove-properties") - removals: List[str] + removals: list[str] class SetStatisticsUpdate(IcebergBaseModel): @@ -180,7 +180,7 @@ class SetStatisticsUpdate(IcebergBaseModel): ) @model_validator(mode="before") - def validate_snapshot_id(cls, data: Dict[str, Any]) -> Dict[str, Any]: + def validate_snapshot_id(cls, data: dict[str, Any]) -> dict[str, Any]: stats = cast(StatisticsFile, data["statistics"]) data["snapshot_id"] = stats.snapshot_id @@ -195,12 +195,12 @@ class RemoveStatisticsUpdate(IcebergBaseModel): class RemovePartitionSpecsUpdate(IcebergBaseModel): action: Literal["remove-partition-specs"] = Field(default="remove-partition-specs") - spec_ids: List[int] = Field(alias="spec-ids") + spec_ids: list[int] = Field(alias="spec-ids") class RemoveSchemasUpdate(IcebergBaseModel): action: Literal["remove-schemas"] = Field(default="remove-schemas") - schema_ids: List[int] = Field(alias="schema-ids") + schema_ids: list[int] = Field(alias="schema-ids") class SetPartitionStatisticsUpdate(IcebergBaseModel): @@ -240,7 +240,7 @@ class 
RemovePartitionStatisticsUpdate(IcebergBaseModel): class _TableMetadataUpdateContext: - _updates: List[TableUpdate] + _updates: list[TableUpdate] def __init__(self) -> None: self._updates = [] @@ -345,7 +345,7 @@ def _(update: RemovePropertiesUpdate, base_metadata: TableMetadata, context: _Ta @_apply_table_update.register(AddSchemaUpdate) def _(update: AddSchemaUpdate, base_metadata: TableMetadata, context: _TableMetadataUpdateContext) -> TableMetadata: - metadata_updates: Dict[str, Any] = { + metadata_updates: dict[str, Any] = { "last_column_id": max(base_metadata.last_column_id, update.schema_.highest_field_id), "schemas": base_metadata.schemas + [update.schema_], } @@ -381,7 +381,7 @@ def _(update: AddPartitionSpecUpdate, base_metadata: TableMetadata, context: _Ta if spec.spec_id == update.spec.spec_id and spec != update.spec: raise ValueError(f"Partition spec with id {spec.spec_id} already exists: {spec}") - metadata_updates: Dict[str, Any] = { + metadata_updates: dict[str, Any] = { "partition_specs": base_metadata.partition_specs + [update.spec], "last_partition_id": max( max([field.field_id for field in update.spec.fields], default=0), @@ -480,7 +480,7 @@ def _(update: SetSnapshotRefUpdate, base_metadata: TableMetadata, context: _Tabl if snapshot is None: raise ValueError(f"Cannot set {update.ref_name} to unknown snapshot {snapshot_ref.snapshot_id}") - metadata_updates: Dict[str, Any] = {} + metadata_updates: dict[str, Any] = {} if context.is_added_snapshot(snapshot_ref.snapshot_id): metadata_updates["last_updated_ms"] = snapshot.timestamp_ms @@ -672,7 +672,7 @@ def _( def update_table_metadata( base_metadata: TableMetadata, - updates: Tuple[TableUpdate, ...], + updates: tuple[TableUpdate, ...], enforce_validation: bool = False, metadata_location: str | None = None, ) -> TableMetadata: @@ -732,7 +732,7 @@ def _update_table_metadata_log(base_metadata: TableMetadata, metadata_location: if len(base_metadata.metadata_log) >= max_metadata_log_entries: # type: 
ignore remove_index = len(base_metadata.metadata_log) - max_metadata_log_entries + 1 # type: ignore previous_metadata_log = base_metadata.metadata_log[remove_index:] - metadata_updates: Dict[str, Any] = { + metadata_updates: dict[str, Any] = { "metadata_log": previous_metadata_log + [MetadataLogEntry(metadata_file=metadata_location, timestamp_ms=last_updated_ms)] } return base_metadata.model_copy(update=metadata_updates) @@ -899,4 +899,4 @@ def validate(self, base_metadata: TableMetadata | None) -> None: Field(discriminator="type"), ] -UpdatesAndRequirements = Tuple[Tuple[TableUpdate, ...], Tuple[TableRequirement, ...]] +UpdatesAndRequirements = tuple[tuple[TableUpdate, ...], tuple[TableRequirement, ...]] diff --git a/pyiceberg/table/update/schema.py b/pyiceberg/table/update/schema.py index f28e0aa2ac..c2d99f6980 100644 --- a/pyiceberg/table/update/schema.py +++ b/pyiceberg/table/update/schema.py @@ -16,11 +16,12 @@ # under the License. from __future__ import annotations +import builtins import itertools from copy import copy from dataclasses import dataclass from enum import Enum -from typing import TYPE_CHECKING, Any, Dict, List, Set, Tuple +from typing import TYPE_CHECKING, Any from pyiceberg.exceptions import ResolveError, ValidationError from pyiceberg.expressions import literal # type: ignore @@ -76,16 +77,16 @@ class _Move: class UpdateSchema(UpdateTableMetadata["UpdateSchema"]): _schema: Schema _last_column_id: itertools.count[int] - _identifier_field_names: Set[str] + _identifier_field_names: set[str] - _adds: Dict[int, List[NestedField]] = {} - _updates: Dict[int, NestedField] = {} - _deletes: Set[int] = set() - _moves: Dict[int, List[_Move]] = {} + _adds: dict[int, list[NestedField]] = {} + _updates: dict[int, NestedField] = {} + _deletes: set[int] = set() + _moves: dict[int, list[_Move]] = {} - _added_name_to_id: Dict[str, int] = {} + _added_name_to_id: dict[str, int] = {} # Part of https://github.com/apache/iceberg/pull/8393 - _id_to_parent: Dict[int, 
str] = {} + _id_to_parent: dict[int, str] = {} _allow_incompatible_changes: bool _case_sensitive: bool @@ -145,7 +146,7 @@ def case_sensitive(self, case_sensitive: bool) -> UpdateSchema: def union_by_name( # TODO: Move TableProperties.DEFAULT_FORMAT_VERSION to separate file and set that as format_version default. self, - new_schema: Schema | "pa.Schema", + new_schema: Schema | pa.Schema, format_version: TableVersion = 2, ) -> UpdateSchema: from pyiceberg.catalog import Catalog @@ -161,7 +162,7 @@ def union_by_name( def add_column( self, - path: str | Tuple[str, ...], + path: str | tuple[str, ...], field_type: IcebergType, doc: str | None = None, required: bool = False, @@ -257,7 +258,7 @@ def add_column( return self - def delete_column(self, path: str | Tuple[str, ...]) -> UpdateSchema: + def delete_column(self, path: str | tuple[str, ...]) -> UpdateSchema: """Delete a column from a table. Args: @@ -280,7 +281,7 @@ def delete_column(self, path: str | Tuple[str, ...]) -> UpdateSchema: return self - def set_default_value(self, path: str | Tuple[str, ...], default_value: L | None) -> UpdateSchema: + def set_default_value(self, path: str | tuple[str, ...], default_value: L | None) -> UpdateSchema: """Set the default value of a column. Args: @@ -293,7 +294,7 @@ def set_default_value(self, path: str | Tuple[str, ...], default_value: L | None return self - def rename_column(self, path_from: str | Tuple[str, ...], new_name: str) -> UpdateSchema: + def rename_column(self, path_from: str | tuple[str, ...], new_name: str) -> UpdateSchema: """Update the name of a column. Args: @@ -339,7 +340,7 @@ def rename_column(self, path_from: str | Tuple[str, ...], new_name: str) -> Upda return self - def make_column_optional(self, path: str | Tuple[str, ...]) -> UpdateSchema: + def make_column_optional(self, path: str | tuple[str, ...]) -> UpdateSchema: """Make a column optional. 
Args: @@ -354,7 +355,7 @@ def make_column_optional(self, path: str | Tuple[str, ...]) -> UpdateSchema: def set_identifier_fields(self, *fields: str) -> None: self._identifier_field_names = set(fields) - def _set_column_requirement(self, path: str | Tuple[str, ...], required: bool) -> None: + def _set_column_requirement(self, path: str | tuple[str, ...], required: bool) -> None: path = (path,) if isinstance(path, str) else path name = ".".join(path) @@ -391,7 +392,7 @@ def _set_column_requirement(self, path: str | Tuple[str, ...], required: bool) - write_default=field.write_default, ) - def _set_column_default_value(self, path: str | Tuple[str, ...], default_value: Any) -> None: + def _set_column_default_value(self, path: str | tuple[str, ...], default_value: Any) -> None: path = (path,) if isinstance(path, str) else path name = ".".join(path) @@ -437,7 +438,7 @@ def _set_column_default_value(self, path: str | Tuple[str, ...], default_value: def update_column( self, - path: str | Tuple[str, ...], + path: str | tuple[str, ...], field_type: IcebergType | None = None, required: bool | None = None, doc: str | None = None, @@ -534,7 +535,7 @@ def _move(self, move: _Move) -> None: self._moves[TABLE_ROOT_ID] = self._moves.get(TABLE_ROOT_ID, []) + [move] - def move_first(self, path: str | Tuple[str, ...]) -> UpdateSchema: + def move_first(self, path: str | tuple[str, ...]) -> UpdateSchema: """Move the field to the first position of the parent struct. Args: @@ -554,7 +555,7 @@ def move_first(self, path: str | Tuple[str, ...]) -> UpdateSchema: return self - def move_before(self, path: str | Tuple[str, ...], before_path: str | Tuple[str, ...]) -> UpdateSchema: + def move_before(self, path: str | tuple[str, ...], before_path: str | tuple[str, ...]) -> UpdateSchema: """Move the field to before another field. 
Args: @@ -588,7 +589,7 @@ def move_before(self, path: str | Tuple[str, ...], before_path: str | Tuple[str, return self - def move_after(self, path: str | Tuple[str, ...], after_name: str | Tuple[str, ...]) -> UpdateSchema: + def move_after(self, path: str | tuple[str, ...], after_name: str | tuple[str, ...]) -> UpdateSchema: """Move the field to after another field. Args: @@ -627,8 +628,8 @@ def _commit(self) -> UpdatesAndRequirements: (schema.schema_id for schema in self._transaction.table_metadata.schemas if schema == new_schema), None ) - requirements: Tuple[TableRequirement, ...] = () - updates: Tuple[TableUpdate, ...] = () + requirements: tuple[TableRequirement, ...] = () + updates: tuple[TableUpdate, ...] = () # Check if it is different current schema ID if existing_schema_id != self._schema.schema_id: @@ -694,17 +695,17 @@ def assign_new_column_id(self) -> int: class _ApplyChanges(SchemaVisitor[IcebergType | None]): - _adds: Dict[int, List[NestedField]] - _updates: Dict[int, NestedField] - _deletes: Set[int] - _moves: Dict[int, List[_Move]] + _adds: dict[int, builtins.list[NestedField]] + _updates: dict[int, NestedField] + _deletes: set[int] + _moves: dict[int, builtins.list[_Move]] def __init__( self, - adds: Dict[int, List[NestedField]], - updates: Dict[int, NestedField], - deletes: Set[int], - moves: Dict[int, List[_Move]], + adds: dict[int, builtins.list[NestedField]], + updates: dict[int, NestedField], + deletes: set[int], + moves: dict[int, builtins.list[_Move]], ) -> None: self._adds = adds self._updates = updates @@ -724,7 +725,7 @@ def schema(self, schema: Schema, struct_result: IcebergType | None) -> IcebergTy return struct_result - def struct(self, struct: StructType, field_results: List[IcebergType | None]) -> IcebergType | None: + def struct(self, struct: StructType, field_results: builtins.list[IcebergType | None]) -> IcebergType | None: has_changes = False new_fields = [] @@ -851,7 +852,7 @@ def __init__(self, update_schema: UpdateSchema, 
existing_schema: Schema, case_se def schema(self, schema: Schema, partner_id: int | None, struct_result: bool) -> bool: return struct_result - def struct(self, struct: StructType, partner_id: int | None, missing_positions: List[bool]) -> bool: + def struct(self, struct: StructType, partner_id: int | None, missing_positions: builtins.list[bool]) -> bool: if partner_id is None: return True @@ -873,7 +874,7 @@ def struct(self, struct: StructType, partner_id: int | None, missing_positions: def _add_column(self, parent_id: int, field: NestedField) -> None: if parent_name := self.existing_schema.find_column_name(parent_id): - path: Tuple[str, ...] = (parent_name, field.name) + path: tuple[str, ...] = (parent_name, field.name) else: path = (field.name,) @@ -997,12 +998,12 @@ def map_value_partner(self, partner_map_id: int | None) -> int | None: return None -def _add_fields(fields: Tuple[NestedField, ...], adds: List[NestedField] | None) -> Tuple[NestedField, ...]: +def _add_fields(fields: tuple[NestedField, ...], adds: list[NestedField] | None) -> tuple[NestedField, ...]: adds = adds or [] return fields + tuple(adds) -def _move_fields(fields: Tuple[NestedField, ...], moves: List[_Move]) -> Tuple[NestedField, ...]: +def _move_fields(fields: tuple[NestedField, ...], moves: list[_Move]) -> tuple[NestedField, ...]: reordered = list(copy(fields)) for move in moves: # Find the field that we're about to move @@ -1026,8 +1027,8 @@ def _move_fields(fields: Tuple[NestedField, ...], moves: List[_Move]) -> Tuple[N def _add_and_move_fields( - fields: Tuple[NestedField, ...], adds: List[NestedField], moves: List[_Move] -) -> Tuple[NestedField, ...] | None: + fields: tuple[NestedField, ...], adds: list[NestedField], moves: list[_Move] +) -> tuple[NestedField, ...] 
| None: if len(adds) > 0: # always apply adds first so that added fields can be moved added = _add_fields(fields, adds) diff --git a/pyiceberg/table/update/snapshot.py b/pyiceberg/table/update/snapshot.py index 191e4a9bff..e89cd45d34 100644 --- a/pyiceberg/table/update/snapshot.py +++ b/pyiceberg/table/update/snapshot.py @@ -21,10 +21,11 @@ import uuid from abc import abstractmethod from collections import defaultdict +from collections.abc import Callable from concurrent.futures import Future from datetime import datetime from functools import cached_property -from typing import TYPE_CHECKING, Callable, Dict, Generic, List, Set, Tuple +from typing import TYPE_CHECKING, Generic from sortedcontainers import SortedList @@ -106,9 +107,9 @@ class _SnapshotProducer(UpdateTableMetadata[U], Generic[U]): _operation: Operation _snapshot_id: int _parent_snapshot_id: int | None - _added_data_files: List[DataFile] + _added_data_files: list[DataFile] _manifest_num_counter: itertools.count[int] - _deleted_data_files: Set[DataFile] + _deleted_data_files: set[DataFile] _compression: AvroCompressionCodec _target_branch: str | None @@ -118,7 +119,7 @@ def __init__( transaction: Transaction, io: FileIO, commit_uuid: uuid.UUID | None = None, - snapshot_properties: Dict[str, str] = EMPTY_DICT, + snapshot_properties: dict[str, str] = EMPTY_DICT, branch: str | None = MAIN_BRANCH, ) -> None: super().__init__(transaction) @@ -157,7 +158,7 @@ def delete_data_file(self, data_file: DataFile) -> _SnapshotProducer[U]: self._deleted_data_files.add(data_file) return self - def _calculate_added_rows(self, manifests: List[ManifestFile]) -> int: + def _calculate_added_rows(self, manifests: list[ManifestFile]) -> int: """Calculate the number of added rows from a list of manifest files.""" added_rows = 0 for manifest in manifests: @@ -171,17 +172,17 @@ def _calculate_added_rows(self, manifests: List[ManifestFile]) -> int: return added_rows @abstractmethod - def _deleted_entries(self) -> 
List[ManifestEntry]: ... + def _deleted_entries(self) -> list[ManifestEntry]: ... @abstractmethod - def _existing_manifests(self) -> List[ManifestFile]: ... + def _existing_manifests(self) -> list[ManifestFile]: ... - def _process_manifests(self, manifests: List[ManifestFile]) -> List[ManifestFile]: + def _process_manifests(self, manifests: list[ManifestFile]) -> list[ManifestFile]: """To perform any post-processing on the manifests before writing them to the new snapshot.""" return manifests - def _manifests(self) -> List[ManifestFile]: - def _write_added_manifest() -> List[ManifestFile]: + def _manifests(self) -> list[ManifestFile]: + def _write_added_manifest() -> list[ManifestFile]: if self._added_data_files: with write_manifest( format_version=self._transaction.table_metadata.format_version, @@ -205,12 +206,12 @@ def _write_added_manifest() -> List[ManifestFile]: else: return [] - def _write_delete_manifest() -> List[ManifestFile]: + def _write_delete_manifest() -> list[ManifestFile]: # Check if we need to mark the files as deleted deleted_entries = self._deleted_entries() if len(deleted_entries) > 0: deleted_manifests = [] - partition_groups: Dict[int, List[ManifestEntry]] = defaultdict(list) + partition_groups: dict[int, list[ManifestEntry]] = defaultdict(list) for deleted_entry in deleted_entries: partition_groups[deleted_entry.data_file.spec_id].append(deleted_entry) for spec_id, entries in partition_groups.items(): @@ -237,7 +238,7 @@ def _write_delete_manifest() -> List[ManifestFile]: return self._process_manifests(added_manifests.result() + delete_manifests.result() + existing_manifests.result()) - def _summary(self, snapshot_properties: Dict[str, str] = EMPTY_DICT) -> Summary: + def _summary(self, snapshot_properties: dict[str, str] = EMPTY_DICT) -> Summary: from pyiceberg.table import TableProperties # avoid copying metadata for each data file @@ -364,7 +365,7 @@ def new_manifest_output(self) -> OutputFile: file_path = 
location_provider.new_metadata_location(file_name) return self._io.new_output(file_path) - def fetch_manifest_entry(self, manifest: ManifestFile, discard_deleted: bool = True) -> List[ManifestEntry]: + def fetch_manifest_entry(self, manifest: ManifestFile, discard_deleted: bool = True) -> list[ManifestEntry]: return manifest.fetch_manifest_entry(io=self._io, discard_deleted=discard_deleted) @@ -388,7 +389,7 @@ def __init__( io: FileIO, branch: str | None = MAIN_BRANCH, commit_uuid: uuid.UUID | None = None, - snapshot_properties: Dict[str, str] = EMPTY_DICT, + snapshot_properties: dict[str, str] = EMPTY_DICT, ): super().__init__(operation, transaction, io, commit_uuid, snapshot_properties, branch) self._predicate = AlwaysFalse() @@ -421,7 +422,7 @@ def delete_by_predicate(self, predicate: BooleanExpression, case_sensitive: bool self._case_sensitive = case_sensitive @cached_property - def _compute_deletes(self) -> Tuple[List[ManifestFile], List[ManifestEntry], bool]: + def _compute_deletes(self) -> tuple[list[ManifestFile], list[ManifestEntry], bool]: """Computes all the delete operation and cache it when nothing changes. 
Returns: @@ -441,7 +442,7 @@ def _copy_with_new_status(entry: ManifestEntry, status: ManifestEntryStatus) -> data_file=entry.data_file, ) - manifest_evaluators: Dict[int, Callable[[ManifestFile], bool]] = KeyDefaultDict(self._build_manifest_evaluator) + manifest_evaluators: dict[int, Callable[[ManifestFile], bool]] = KeyDefaultDict(self._build_manifest_evaluator) strict_metrics_evaluator = _StrictMetricsEvaluator(schema, self._predicate, case_sensitive=self._case_sensitive).eval inclusive_metrics_evaluator = _InclusiveMetricsEvaluator( schema, self._predicate, case_sensitive=self._case_sensitive @@ -501,10 +502,10 @@ def _copy_with_new_status(entry: ManifestEntry, status: ManifestEntryStatus) -> return existing_manifests, total_deleted_entries, partial_rewrites_needed - def _existing_manifests(self) -> List[ManifestFile]: + def _existing_manifests(self) -> list[ManifestFile]: return self._compute_deletes[0] - def _deleted_entries(self) -> List[ManifestEntry]: + def _deleted_entries(self) -> list[ManifestEntry]: return self._compute_deletes[1] @property @@ -519,7 +520,7 @@ def files_affected(self) -> bool: class _FastAppendFiles(_SnapshotProducer["_FastAppendFiles"]): - def _existing_manifests(self) -> List[ManifestFile]: + def _existing_manifests(self) -> list[ManifestFile]: """To determine if there are any existing manifest files. A fast append will add another ManifestFile to the ManifestList. @@ -539,7 +540,7 @@ def _existing_manifests(self) -> List[ManifestFile]: return existing_manifests - def _deleted_entries(self) -> List[ManifestEntry]: + def _deleted_entries(self) -> list[ManifestEntry]: """To determine if we need to record any deleted manifest entries. In case of an append, nothing is deleted. 
@@ -559,7 +560,7 @@ def __init__( io: FileIO, branch: str | None = MAIN_BRANCH, commit_uuid: uuid.UUID | None = None, - snapshot_properties: Dict[str, str] = EMPTY_DICT, + snapshot_properties: dict[str, str] = EMPTY_DICT, ) -> None: from pyiceberg.table import TableProperties @@ -580,7 +581,7 @@ def __init__( TableProperties.MANIFEST_MERGE_ENABLED_DEFAULT, ) - def _process_manifests(self, manifests: List[ManifestFile]) -> List[ManifestFile]: + def _process_manifests(self, manifests: list[ManifestFile]) -> list[ManifestFile]: """To perform any post-processing on the manifests before writing them to the new snapshot. In _MergeAppendFiles, we merge manifests based on the target size and the minimum count to merge @@ -605,7 +606,7 @@ class _OverwriteFiles(_SnapshotProducer["_OverwriteFiles"]): Data and delete files were added and removed in a logical overwrite operation. """ - def _existing_manifests(self) -> List[ManifestFile]: + def _existing_manifests(self) -> list[ManifestFile]: """Determine if there are any existing manifest files.""" existing_files = [] @@ -641,7 +642,7 @@ def _existing_manifests(self) -> List[ManifestFile]: existing_files.append(writer.to_manifest_file()) return existing_files - def _deleted_entries(self) -> List[ManifestEntry]: + def _deleted_entries(self) -> list[ManifestEntry]: """To determine if we need to record any deleted entries. With a full overwrite all the entries are considered deleted. 
@@ -656,7 +657,7 @@ def _deleted_entries(self) -> List[ManifestEntry]: executor = ExecutorFactory.get_or_create() - def _get_entries(manifest: ManifestFile) -> List[ManifestEntry]: + def _get_entries(manifest: ManifestFile) -> list[ManifestEntry]: return [ ManifestEntry.from_args( status=ManifestEntryStatus.DELETED, @@ -679,14 +680,14 @@ class UpdateSnapshot: _transaction: Transaction _io: FileIO _branch: str | None - _snapshot_properties: Dict[str, str] + _snapshot_properties: dict[str, str] def __init__( self, transaction: Transaction, io: FileIO, branch: str | None = MAIN_BRANCH, - snapshot_properties: Dict[str, str] = EMPTY_DICT, + snapshot_properties: dict[str, str] = EMPTY_DICT, ) -> None: self._transaction = transaction self._io = io @@ -747,13 +748,13 @@ def __init__( self._merge_enabled = merge_enabled self._snapshot_producer = snapshot_producer - def _group_by_spec(self, manifests: List[ManifestFile]) -> Dict[int, List[ManifestFile]]: + def _group_by_spec(self, manifests: list[ManifestFile]) -> dict[int, list[ManifestFile]]: groups = defaultdict(list) for manifest in manifests: groups[manifest.partition_spec_id].append(manifest) return groups - def _create_manifest(self, spec_id: int, manifest_bin: List[ManifestFile]) -> ManifestFile: + def _create_manifest(self, spec_id: int, manifest_bin: list[ManifestFile]) -> ManifestFile: with self._snapshot_producer.new_manifest_writer(spec=self._snapshot_producer.spec(spec_id)) as writer: for manifest in manifest_bin: for entry in self._snapshot_producer.fetch_manifest_entry(manifest=manifest, discard_deleted=False): @@ -769,11 +770,11 @@ def _create_manifest(self, spec_id: int, manifest_bin: List[ManifestFile]) -> Ma return writer.to_manifest_file() - def _merge_group(self, first_manifest: ManifestFile, spec_id: int, manifests: List[ManifestFile]) -> List[ManifestFile]: + def _merge_group(self, first_manifest: ManifestFile, spec_id: int, manifests: list[ManifestFile]) -> list[ManifestFile]: packer: 
ListPacker[ManifestFile] = ListPacker(target_weight=self._target_size_bytes, lookback=1, largest_bin_first=False) - bins: List[List[ManifestFile]] = packer.pack_end(manifests, lambda m: m.manifest_length) + bins: list[list[ManifestFile]] = packer.pack_end(manifests, lambda m: m.manifest_length) - def merge_bin(manifest_bin: List[ManifestFile]) -> List[ManifestFile]: + def merge_bin(manifest_bin: list[ManifestFile]) -> list[ManifestFile]: output_manifests = [] if len(manifest_bin) == 1: output_manifests.append(manifest_bin[0]) @@ -792,15 +793,15 @@ def merge_bin(manifest_bin: List[ManifestFile]) -> List[ManifestFile]: # for consistent ordering, we need to maintain future order futures_index = {f: i for i, f in enumerate(futures)} - completed_futures: SortedList[Future[List[ManifestFile]]] = SortedList(iterable=[], key=lambda f: futures_index[f]) + completed_futures: SortedList[Future[list[ManifestFile]]] = SortedList(iterable=[], key=lambda f: futures_index[f]) for future in concurrent.futures.as_completed(futures): completed_futures.add(future) - bin_results: List[List[ManifestFile]] = [f.result() for f in completed_futures if f.result()] + bin_results: list[list[ManifestFile]] = [f.result() for f in completed_futures if f.result()] return [manifest for bin_result in bin_results for manifest in bin_result] - def merge_manifests(self, manifests: List[ManifestFile]) -> List[ManifestFile]: + def merge_manifests(self, manifests: list[ManifestFile]) -> list[ManifestFile]: if not self._merge_enabled or len(manifests) == 0: return manifests @@ -830,8 +831,8 @@ class ManageSnapshots(UpdateTableMetadata["ManageSnapshots"]): ms.create_tag(snapshot_id1, "Tag_A").create_tag(snapshot_id2, "Tag_B") """ - _updates: Tuple[TableUpdate, ...] - _requirements: Tuple[TableRequirement, ...] + _updates: tuple[TableUpdate, ...] + _requirements: tuple[TableRequirement, ...] 
def __init__(self, transaction: Transaction) -> None: super().__init__(transaction) @@ -949,9 +950,9 @@ class ExpireSnapshots(UpdateTableMetadata["ExpireSnapshots"]): Pending changes are applied on commit. """ - _updates: Tuple[TableUpdate, ...] - _requirements: Tuple[TableRequirement, ...] - _snapshot_ids_to_expire: Set[int] + _updates: tuple[TableUpdate, ...] + _requirements: tuple[TableRequirement, ...] + _snapshot_ids_to_expire: set[int] def __init__(self, transaction: Transaction) -> None: super().__init__(transaction) @@ -976,7 +977,7 @@ def _commit(self) -> UpdatesAndRequirements: self._updates += (update,) return self._updates, self._requirements - def _get_protected_snapshot_ids(self) -> Set[int]: + def _get_protected_snapshot_ids(self) -> set[int]: """ Get the IDs of protected snapshots. @@ -1012,7 +1013,7 @@ def by_id(self, snapshot_id: int) -> ExpireSnapshots: return self - def by_ids(self, snapshot_ids: List[int]) -> "ExpireSnapshots": + def by_ids(self, snapshot_ids: list[int]) -> ExpireSnapshots: """ Expire multiple snapshots by their IDs. @@ -1027,7 +1028,7 @@ def by_ids(self, snapshot_ids: List[int]) -> "ExpireSnapshots": self.by_id(snapshot_id) return self - def older_than(self, dt: datetime) -> "ExpireSnapshots": + def older_than(self, dt: datetime) -> ExpireSnapshots: """ Expire all unprotected snapshots with a timestamp older than a given value. diff --git a/pyiceberg/table/update/sorting.py b/pyiceberg/table/update/sorting.py index 7e931b1a33..e03d2264df 100644 --- a/pyiceberg/table/update/sorting.py +++ b/pyiceberg/table/update/sorting.py @@ -16,7 +16,7 @@ # under the License. 
from __future__ import annotations -from typing import TYPE_CHECKING, Any, List, Tuple +from typing import TYPE_CHECKING, Any from pyiceberg.table.sorting import INITIAL_SORT_ORDER_ID, UNSORTED_SORT_ORDER, NullOrder, SortDirection, SortField, SortOrder from pyiceberg.table.update import ( @@ -38,11 +38,11 @@ class UpdateSortOrder(UpdateTableMetadata["UpdateSortOrder"]): _transaction: Transaction _last_assigned_order_id: int | None _case_sensitive: bool - _fields: List[SortField] + _fields: list[SortField] def __init__(self, transaction: Transaction, case_sensitive: bool = True) -> None: super().__init__(transaction) - self._fields: List[SortField] = [] + self._fields: list[SortField] = [] self._case_sensitive: bool = case_sensitive self._last_assigned_order_id: int | None = None @@ -118,8 +118,8 @@ def _apply(self) -> SortOrder: def _commit(self) -> UpdatesAndRequirements: """Apply the pending changes and commit.""" new_sort_order = self._apply() - requirements: Tuple[TableRequirement, ...] = () - updates: Tuple[TableUpdate, ...] = () + requirements: tuple[TableRequirement, ...] = () + updates: tuple[TableUpdate, ...] = () if ( self._transaction.table_metadata.default_sort_order_id != new_sort_order.order_id diff --git a/pyiceberg/table/update/spec.py b/pyiceberg/table/update/spec.py index b1f5f83d8f..e060d6c261 100644 --- a/pyiceberg/table/update/spec.py +++ b/pyiceberg/table/update/spec.py @@ -16,7 +16,7 @@ # under the License. 
from __future__ import annotations -from typing import TYPE_CHECKING, Any, Dict, List, Set, Tuple +from typing import TYPE_CHECKING, Any from pyiceberg.expressions import ( Reference, @@ -48,15 +48,15 @@ class UpdateSpec(UpdateTableMetadata["UpdateSpec"]): _transaction: Transaction - _name_to_field: Dict[str, PartitionField] = {} - _name_to_added_field: Dict[str, PartitionField] = {} - _transform_to_field: Dict[Tuple[int, str], PartitionField] = {} - _transform_to_added_field: Dict[Tuple[int, str], PartitionField] = {} - _renames: Dict[str, str] = {} - _added_time_fields: Dict[int, PartitionField] = {} + _name_to_field: dict[str, PartitionField] = {} + _name_to_added_field: dict[str, PartitionField] = {} + _transform_to_field: dict[tuple[int, str], PartitionField] = {} + _transform_to_added_field: dict[tuple[int, str], PartitionField] = {} + _renames: dict[str, str] = {} + _added_time_fields: dict[int, PartitionField] = {} _case_sensitive: bool - _adds: List[PartitionField] - _deletes: Set[int] + _adds: list[PartitionField] + _deletes: set[int] _last_assigned_partition_id: int def __init__(self, transaction: Transaction, case_sensitive: bool = True) -> None: @@ -157,8 +157,8 @@ def rename_field(self, name: str, new_name: str) -> UpdateSpec: def _commit(self) -> UpdatesAndRequirements: new_spec = self._apply() - updates: Tuple[TableUpdate, ...] = () - requirements: Tuple[TableRequirement, ...] = () + updates: tuple[TableUpdate, ...] = () + requirements: tuple[TableRequirement, ...] 
= () if self._transaction.table_metadata.default_spec_id != new_spec.spec_id: if new_spec.spec_id not in self._transaction.table_metadata.specs(): @@ -180,7 +180,7 @@ def _commit(self) -> UpdatesAndRequirements: def _apply(self) -> PartitionSpec: def _check_and_add_partition_name( - schema: Schema, name: str, source_id: int, transform: Transform[Any, Any], partition_names: Set[str] + schema: Schema, name: str, source_id: int, transform: Transform[Any, Any], partition_names: set[str] ) -> None: from pyiceberg.partitioning import validate_partition_name @@ -188,13 +188,13 @@ def _check_and_add_partition_name( partition_names.add(name) def _add_new_field( - schema: Schema, source_id: int, field_id: int, name: str, transform: Transform[Any, Any], partition_names: Set[str] + schema: Schema, source_id: int, field_id: int, name: str, transform: Transform[Any, Any], partition_names: set[str] ) -> PartitionField: _check_and_add_partition_name(schema, name, source_id, transform, partition_names) return PartitionField(source_id, field_id, transform, name) partition_fields = [] - partition_names: Set[str] = set() + partition_names: set[str] = set() for field in self._transaction.table_metadata.spec().fields: if field.field_id not in self._deletes: renamed = self._renames.get(field.name) @@ -267,7 +267,7 @@ def _add_new_field( new_spec_id = spec.spec_id + 1 return PartitionSpec(*partition_fields, spec_id=new_spec_id) - def _partition_field(self, transform_key: Tuple[int, Transform[Any, Any]], name: str | None) -> PartitionField: + def _partition_field(self, transform_key: tuple[int, Transform[Any, Any]], name: str | None) -> PartitionField: if self._transaction.table_metadata.format_version == 2: source_id, transform = transform_key historical_fields = [] diff --git a/pyiceberg/table/update/statistics.py b/pyiceberg/table/update/statistics.py index 5ba712e13d..76fe2cb07b 100644 --- a/pyiceberg/table/update/statistics.py +++ b/pyiceberg/table/update/statistics.py @@ -14,7 +14,7 
@@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -from typing import TYPE_CHECKING, Tuple +from typing import TYPE_CHECKING from pyiceberg.table.statistics import StatisticsFile from pyiceberg.table.update import ( @@ -47,7 +47,7 @@ class UpdateStatistics(UpdateTableMetadata["UpdateStatistics"]): update.remove_statistics(snapshot_id=2) """ - _updates: Tuple[TableUpdate, ...] = () + _updates: tuple[TableUpdate, ...] = () def __init__(self, transaction: "Transaction") -> None: super().__init__(transaction) diff --git a/pyiceberg/table/update/validate.py b/pyiceberg/table/update/validate.py index 4ef3bcf160..0cda688f2a 100644 --- a/pyiceberg/table/update/validate.py +++ b/pyiceberg/table/update/validate.py @@ -14,7 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -from typing import Iterator, Set +from collections.abc import Iterator from pyiceberg.exceptions import ValidationException from pyiceberg.expressions import BooleanExpression @@ -25,8 +25,8 @@ from pyiceberg.table.snapshots import Operation, Snapshot, ancestors_between from pyiceberg.typedef import Record -VALIDATE_DATA_FILES_EXIST_OPERATIONS: Set[Operation] = {Operation.OVERWRITE, Operation.REPLACE, Operation.DELETE} -VALIDATE_ADDED_DATA_FILES_OPERATIONS: Set[Operation] = {Operation.APPEND, Operation.OVERWRITE} +VALIDATE_DATA_FILES_EXIST_OPERATIONS: set[Operation] = {Operation.OVERWRITE, Operation.REPLACE, Operation.DELETE} +VALIDATE_ADDED_DATA_FILES_OPERATIONS: set[Operation] = {Operation.APPEND, Operation.OVERWRITE} def _validation_history( diff --git a/pyiceberg/transforms.py b/pyiceberg/transforms.py index 98cfac1146..739e18a6e6 100644 --- a/pyiceberg/transforms.py +++ b/pyiceberg/transforms.py @@ -21,9 +21,10 @@ import struct import types from abc import ABC, abstractmethod +from collections.abc import Callable 
from enum import IntEnum from functools import singledispatch -from typing import TYPE_CHECKING, Any, Callable, Generic, Optional, TypeVar +from typing import TYPE_CHECKING, Any, Generic, Optional, TypeVar from typing import Literal as LiteralType from uuid import UUID @@ -183,10 +184,10 @@ def result_type(self, source: IcebergType) -> IcebergType: ... @abstractmethod - def project(self, name: str, pred: BoundPredicate[L]) -> UnboundPredicate[Any] | None: ... + def project(self, name: str, pred: BoundPredicate) -> UnboundPredicate | None: ... @abstractmethod - def strict_project(self, name: str, pred: BoundPredicate[Any]) -> UnboundPredicate[Any] | None: ... + def strict_project(self, name: str, pred: BoundPredicate) -> UnboundPredicate | None: ... @property def preserves_order(self) -> bool: @@ -269,7 +270,7 @@ def apply(self, value: S | None) -> int | None: def result_type(self, source: IcebergType) -> IcebergType: return IntegerType() - def project(self, name: str, pred: BoundPredicate[L]) -> UnboundPredicate[Any] | None: + def project(self, name: str, pred: BoundPredicate) -> UnboundPredicate | None: transformer = self.transform(pred.term.ref().field.field_type) if isinstance(pred.term, BoundTransform): @@ -279,14 +280,14 @@ def project(self, name: str, pred: BoundPredicate[L]) -> UnboundPredicate[Any] | elif isinstance(pred, BoundEqualTo): return pred.as_unbound(Reference(name), _transform_literal(transformer, pred.literal)) elif isinstance(pred, BoundIn): # NotIn can't be projected - return pred.as_unbound(Reference(name), {_transform_literal(transformer, literal) for literal in pred.literals}) + return pred.as_unbound(Reference(name), {_transform_literal(transformer, literal) for literal in pred.literals}) # type: ignore else: # - Comparison predicates can't be projected, notEq can't be projected # - Small ranges can be projected: # For example, (x > 0) and (x < 3) can be turned into in({1, 2}) and projected. 
- return None + return None # type: ignore - def strict_project(self, name: str, pred: BoundPredicate[Any]) -> UnboundPredicate[Any] | None: + def strict_project(self, name: str, pred: BoundPredicate) -> UnboundPredicate | None: transformer = self.transform(pred.term.ref().field.field_type) if isinstance(pred.term, BoundTransform): @@ -296,10 +297,10 @@ def strict_project(self, name: str, pred: BoundPredicate[Any]) -> UnboundPredica elif isinstance(pred, BoundNotEqualTo): return pred.as_unbound(Reference(name), _transform_literal(transformer, pred.literal)) elif isinstance(pred, BoundNotIn): - return pred.as_unbound(Reference(name), {_transform_literal(transformer, literal) for literal in pred.literals}) + return pred.as_unbound(Reference(name), {_transform_literal(transformer, literal) for literal in pred.literals}) # type: ignore else: # no strict projection for comparison or equality - return None + return None # type: ignore def can_transform(self, source: IcebergType) -> bool: return isinstance( @@ -420,7 +421,7 @@ def result_type(self, source: IcebergType) -> IntegerType: @abstractmethod def transform(self, source: IcebergType) -> Callable[[Any | None], int | None]: ... 
- def project(self, name: str, pred: BoundPredicate[L]) -> UnboundPredicate[Any] | None: + def project(self, name: str, pred: BoundPredicate) -> UnboundPredicate | None: transformer = self.transform(pred.term.ref().field.field_type) if isinstance(pred.term, BoundTransform): return _project_transform_predicate(self, name, pred) @@ -430,10 +431,8 @@ def project(self, name: str, pred: BoundPredicate[L]) -> UnboundPredicate[Any] | return _truncate_number(name, pred, transformer) elif isinstance(pred, BoundIn): # NotIn can't be projected return _set_apply_transform(name, pred, transformer) - else: - return None - def strict_project(self, name: str, pred: BoundPredicate[Any]) -> UnboundPredicate[Any] | None: + def strict_project(self, name: str, pred: BoundPredicate) -> UnboundPredicate | None: transformer = self.transform(pred.term.ref().field.field_type) if isinstance(pred.term, BoundTransform): return _project_transform_predicate(self, name, pred) @@ -443,8 +442,6 @@ def strict_project(self, name: str, pred: BoundPredicate[Any]) -> UnboundPredica return _truncate_number_strict(name, pred, transformer) elif isinstance(pred, BoundNotIn): return _set_apply_transform(name, pred, transformer) - else: - return None @property def dedup_name(self) -> str: @@ -725,7 +722,7 @@ def can_transform(self, source: IcebergType) -> bool: def result_type(self, source: IcebergType) -> IcebergType: return source - def project(self, name: str, pred: BoundPredicate[L]) -> UnboundPredicate[Any] | None: + def project(self, name: str, pred: BoundPredicate) -> UnboundPredicate | None: if isinstance(pred.term, BoundTransform): return _project_transform_predicate(self, name, pred) elif isinstance(pred, BoundUnaryPredicate): @@ -737,7 +734,7 @@ def project(self, name: str, pred: BoundPredicate[L]) -> UnboundPredicate[Any] | else: return None - def strict_project(self, name: str, pred: BoundPredicate[Any]) -> UnboundPredicate[Any] | None: + def strict_project(self, name: str, pred: BoundPredicate) 
-> UnboundPredicate | None: if isinstance(pred, BoundUnaryPredicate): return pred.as_unbound(Reference(name)) elif isinstance(pred, BoundLiteralPredicate): @@ -801,7 +798,7 @@ def preserves_order(self) -> bool: def source_type(self) -> IcebergType: return self._source_type - def project(self, name: str, pred: BoundPredicate[L]) -> UnboundPredicate[Any] | None: + def project(self, name: str, pred: BoundPredicate) -> UnboundPredicate | None: field_type = pred.term.ref().field.field_type if isinstance(pred.term, BoundTransform): @@ -811,15 +808,14 @@ def project(self, name: str, pred: BoundPredicate[L]) -> UnboundPredicate[Any] | return pred.as_unbound(Reference(name)) elif isinstance(pred, BoundIn): return _set_apply_transform(name, pred, self.transform(field_type)) - elif isinstance(field_type, (IntegerType, LongType, DecimalType)): + elif isinstance(field_type, (IntegerType, LongType, DecimalType)): # type: ignore if isinstance(pred, BoundLiteralPredicate): return _truncate_number(name, pred, self.transform(field_type)) elif isinstance(field_type, (BinaryType, StringType)): if isinstance(pred, BoundLiteralPredicate): return _truncate_array(name, pred, self.transform(field_type)) - return None - def strict_project(self, name: str, pred: BoundPredicate[Any]) -> UnboundPredicate[Any] | None: + def strict_project(self, name: str, pred: BoundPredicate) -> UnboundPredicate | None: field_type = pred.term.ref().field.field_type if isinstance(pred.term, BoundTransform): @@ -834,7 +830,7 @@ def strict_project(self, name: str, pred: BoundPredicate[Any]) -> UnboundPredica elif isinstance(pred, BoundNotIn): return _set_apply_transform(name, pred, self.transform(field_type)) else: - return None + return None # type: ignore if isinstance(pred, BoundLiteralPredicate): if isinstance(pred, BoundStartsWith): @@ -859,7 +855,7 @@ def strict_project(self, name: str, pred: BoundPredicate[Any]) -> UnboundPredica elif isinstance(pred, BoundNotIn): return _set_apply_transform(name, pred, 
self.transform(field_type)) else: - return None + return None # type: ignore @property def width(self) -> int: @@ -987,10 +983,10 @@ def can_transform(self, source: IcebergType) -> bool: def result_type(self, source: IcebergType) -> StringType: return StringType() - def project(self, name: str, pred: BoundPredicate[L]) -> UnboundPredicate[Any] | None: + def project(self, name: str, pred: BoundPredicate) -> UnboundPredicate | None: return None - def strict_project(self, name: str, pred: BoundPredicate[Any]) -> UnboundPredicate[Any] | None: + def strict_project(self, name: str, pred: BoundPredicate) -> UnboundPredicate | None: return None def __repr__(self) -> str: @@ -1015,10 +1011,10 @@ def can_transform(self, _: IcebergType) -> bool: def result_type(self, source: IcebergType) -> IcebergType: return source - def project(self, name: str, pred: BoundPredicate[L]) -> UnboundPredicate[Any] | None: + def project(self, name: str, pred: BoundPredicate) -> UnboundPredicate | None: return None - def strict_project(self, name: str, pred: BoundPredicate[L]) -> UnboundPredicate[Any] | None: + def strict_project(self, name: str, pred: BoundPredicate) -> UnboundPredicate | None: return None def to_human_string(self, _: IcebergType, value: S | None) -> str: @@ -1038,8 +1034,8 @@ def pyarrow_transform(self, source: IcebergType) -> "Callable[[pa.Array], pa.Arr def _truncate_number( - name: str, pred: BoundLiteralPredicate[L], transform: Callable[[L | None], L | None] -) -> UnboundPredicate[Any] | None: + name: str, pred: BoundLiteralPredicate, transform: Callable[[Any | None], Any | None] +) -> UnboundPredicate | None: boundary = pred.literal if not isinstance(boundary, (LongLiteral, DecimalLiteral, DateLiteral, TimestampLiteral)): @@ -1060,8 +1056,8 @@ def _truncate_number( def _truncate_number_strict( - name: str, pred: BoundLiteralPredicate[L], transform: Callable[[L | None], L | None] -) -> UnboundPredicate[Any] | None: + name: str, pred: BoundLiteralPredicate, transform: 
Callable[[Any | None], Any | None] +) -> UnboundPredicate | None: boundary = pred.literal if not isinstance(boundary, (LongLiteral, DecimalLiteral, DateLiteral, TimestampLiteral)): @@ -1086,8 +1082,8 @@ def _truncate_number_strict( def _truncate_array_strict( - name: str, pred: BoundLiteralPredicate[L], transform: Callable[[L | None], L | None] -) -> UnboundPredicate[Any] | None: + name: str, pred: BoundLiteralPredicate, transform: Callable[[Any | None], Any | None] +) -> UnboundPredicate | None: boundary = pred.literal if isinstance(pred, (BoundLessThan, BoundLessThanOrEqual)): @@ -1101,8 +1097,8 @@ def _truncate_array_strict( def _truncate_array( - name: str, pred: BoundLiteralPredicate[L], transform: Callable[[L | None], L | None] -) -> UnboundPredicate[Any] | None: + name: str, pred: BoundLiteralPredicate, transform: Callable[[Any | None], Any | None] +) -> UnboundPredicate | None: boundary = pred.literal if isinstance(pred, (BoundLessThan, BoundLessThanOrEqual)): @@ -1120,26 +1116,26 @@ def _truncate_array( def _project_transform_predicate( - transform: Transform[Any, Any], partition_name: str, pred: BoundPredicate[L] -) -> UnboundPredicate[Any] | None: + transform: Transform[Any, Any], partition_name: str, pred: BoundPredicate +) -> UnboundPredicate | None: term = pred.term if isinstance(term, BoundTransform) and transform == term.transform: return _remove_transform(partition_name, pred) return None -def _remove_transform(partition_name: str, pred: BoundPredicate[L]) -> UnboundPredicate[Any]: +def _remove_transform(partition_name: str, pred: BoundPredicate) -> UnboundPredicate: if isinstance(pred, BoundUnaryPredicate): return pred.as_unbound(Reference(partition_name)) elif isinstance(pred, BoundLiteralPredicate): return pred.as_unbound(Reference(partition_name), pred.literal) elif isinstance(pred, (BoundIn, BoundNotIn)): - return pred.as_unbound(Reference(partition_name), pred.literals) + return pred.as_unbound(Reference(partition_name), pred.literals) # 
type: ignore else: raise ValueError(f"Cannot replace transform in unknown predicate: {pred}") -def _set_apply_transform(name: str, pred: BoundSetPredicate[L], transform: Callable[[L], L]) -> UnboundPredicate[Any]: +def _set_apply_transform(name: str, pred: BoundSetPredicate, transform: Callable[[Any], Any]) -> UnboundPredicate: literals = pred.literals if isinstance(pred, BoundSetPredicate): transformed_literals = {_transform_literal(transform, literal) for literal in literals} @@ -1148,11 +1144,11 @@ def _set_apply_transform(name: str, pred: BoundSetPredicate[L], transform: Calla raise ValueError(f"Unknown BoundSetPredicate: {pred}") -class BoundTransform(BoundTerm[L]): +class BoundTransform(BoundTerm): """A transform expression.""" - transform: Transform[L, Any] + transform: Transform[Any, Any] - def __init__(self, term: BoundTerm[L], transform: Transform[L, Any]): - self.term: BoundTerm[L] = term + def __init__(self, term: BoundTerm, transform: Transform[Any, Any]): + self.term: BoundTerm = term self.transform = transform diff --git a/pyiceberg/typedef.py b/pyiceberg/typedef.py index 3480adcb32..80fc5303ad 100644 --- a/pyiceberg/typedef.py +++ b/pyiceberg/typedef.py @@ -17,19 +17,16 @@ from __future__ import annotations from abc import abstractmethod +from collections.abc import Callable from datetime import date, datetime, time from decimal import Decimal from typing import ( TYPE_CHECKING, Any, - Callable, - Dict, Generic, - List, Literal, Protocol, - Set, - Tuple, + TypeAlias, TypeVar, Union, runtime_checkable, @@ -37,13 +34,19 @@ from uuid import UUID from pydantic import BaseModel, ConfigDict, RootModel -from typing_extensions import Self, TypeAlias +from typing_extensions import Self if TYPE_CHECKING: + from pyiceberg.expressions.literals import Literal as IcebergLiteral from pyiceberg.types import StructType + LiteralValue = IcebergLiteral[Any] +else: + # Use Any for runtime to avoid circular import - type checkers will use TYPE_CHECKING version + 
LiteralValue = Any # type: ignore[assignment,misc] -class FrozenDict(Dict[Any, Any]): + +class FrozenDict(dict[Any, Any]): def __setitem__(self, instance: Any, value: Any) -> None: """Assign a value to a FrozenDict.""" raise AttributeError("FrozenDict does not support assignment") @@ -61,7 +64,7 @@ def update(self, *args: Any, **kwargs: Any) -> None: # from https://stackoverflow.com/questions/2912231/is-there-a-clever-way-to-pass-the-key-to-defaultdicts-default-factory -class KeyDefaultDict(Dict[K, V]): +class KeyDefaultDict(dict[K, V]): def __init__(self, default_factory: Callable[[K], V]): super().__init__() self.default_factory = default_factory @@ -73,7 +76,7 @@ def __missing__(self, key: K) -> V: return val -Identifier = Tuple[str, ...] +Identifier = tuple[str, ...] """A tuple of strings representing a table identifier. Each string in the tuple represents a part of the table's unique path. For example, @@ -85,11 +88,11 @@ def __missing__(self, key: K) -> V: >>> identifier: Identifier = ("namespace", "table_name") """ -Properties = Dict[str, Any] +Properties = dict[str, Any] """A dictionary type for properties in PyIceberg.""" -RecursiveDict = Dict[str, Union[str, "RecursiveDict"]] +RecursiveDict = dict[str, Union[str, "RecursiveDict"]] """A recursive dictionary type for nested structures in PyIceberg.""" # Represents the literal value @@ -126,7 +129,7 @@ class IcebergBaseModel(BaseModel): model_config = ConfigDict(populate_by_name=True, frozen=True) - def _exclude_private_properties(self, exclude: Set[str] | None = None) -> Set[str]: + def _exclude_private_properties(self, exclude: set[str] | None = None) -> set[str]: # A small trick to exclude private properties. Properties are serialized by pydantic, # regardless if they start with an underscore. 
# This will look at the dict, and find the fields and exclude them @@ -135,14 +138,14 @@ def _exclude_private_properties(self, exclude: Set[str] | None = None) -> Set[st ) def model_dump( - self, exclude_none: bool = True, exclude: Set[str] | None = None, by_alias: bool = True, **kwargs: Any - ) -> Dict[str, Any]: + self, exclude_none: bool = True, exclude: set[str] | None = None, by_alias: bool = True, **kwargs: Any + ) -> dict[str, Any]: return super().model_dump( exclude_none=exclude_none, exclude=self._exclude_private_properties(exclude), by_alias=by_alias, **kwargs ) def model_dump_json( - self, exclude_none: bool = True, exclude: Set[str] | None = None, by_alias: bool = True, **kwargs: Any + self, exclude_none: bool = True, exclude: set[str] | None = None, by_alias: bool = True, **kwargs: Any ) -> str: return super().model_dump_json( exclude_none=exclude_none, exclude=self._exclude_private_properties(exclude), by_alias=by_alias, **kwargs @@ -172,7 +175,7 @@ class IcebergRootModel(RootModel[T], Generic[T]): class Record(StructProtocol): __slots__ = ("_data",) - _data: List[Any] + _data: list[Any] @classmethod def _bind(cls, struct: StructType, **arguments: Any) -> Self: diff --git a/pyiceberg/types.py b/pyiceberg/types.py index c22bee092f..742da00f57 100644 --- a/pyiceberg/types.py +++ b/pyiceberg/types.py @@ -38,9 +38,7 @@ Annotated, Any, ClassVar, - Dict, Literal, - Tuple, ) from pydantic import ( @@ -64,7 +62,7 @@ FIXED_PARSER = ParseNumberFromBrackets(FIXED) -def transform_dict_value_to_str(dict: Dict[str, Any]) -> Dict[str, str]: +def transform_dict_value_to_str(dict: dict[str, Any]) -> dict[str, str]: """Transform all values in the dictionary to string. 
Raise an error if any value is None.""" for key, value in dict.items(): if value is None: @@ -72,7 +70,7 @@ def transform_dict_value_to_str(dict: Dict[str, Any]) -> Dict[str, str]: return {k: str(v).lower() if isinstance(v, bool) else str(v) for k, v in dict.items()} -def _parse_decimal_type(decimal: Any) -> Tuple[int, int]: +def _parse_decimal_type(decimal: Any) -> tuple[int, int]: if isinstance(decimal, str): matches = DECIMAL_REGEX.search(decimal) if matches: @@ -250,7 +248,7 @@ class DecimalType(PrimitiveType): True """ - root: Tuple[int, int] + root: tuple[int, int] def __init__(self, precision: int, scale: int) -> None: super().__init__(root=(precision, scale)) @@ -282,7 +280,7 @@ def __hash__(self) -> int: """Return the hash of the tuple.""" return hash(self.root) - def __getnewargs__(self) -> Tuple[int, int]: + def __getnewargs__(self) -> tuple[int, int]: """Pickle the DecimalType class.""" return self.precision, self.scale @@ -375,7 +373,7 @@ def __init__( super().__init__(**data) @model_serializer() - def serialize_model(self) -> Dict[str, Any]: + def serialize_model(self) -> dict[str, Any]: from pyiceberg.conversions import to_json fields = { @@ -415,7 +413,7 @@ def __repr__(self) -> str: return f"NestedField({', '.join(parts)})" - def __getnewargs__(self) -> Tuple[int, str, IcebergType, bool, str | None]: + def __getnewargs__(self) -> tuple[int, str, IcebergType, bool, str | None]: """Pickle the NestedField class.""" return (self.field_id, self.name, self.field_type, self.required, self.doc) @@ -436,7 +434,7 @@ class StructType(IcebergType): """ type: Literal["struct"] = Field(default="struct") - fields: Tuple[NestedField, ...] = Field(default_factory=tuple) + fields: tuple[NestedField, ...] 
= Field(default_factory=tuple) _hash: int = PrivateAttr() def __init__(self, *fields: NestedField, **data: Any): @@ -476,7 +474,7 @@ def __len__(self) -> int: """Return the length of an instance of the StructType class.""" return len(self.fields) - def __getnewargs__(self) -> Tuple[NestedField, ...]: + def __getnewargs__(self) -> tuple[NestedField, ...]: """Pickle the StructType class.""" return self.fields @@ -526,7 +524,7 @@ def __str__(self) -> str: """Return the string representation of the ListType class.""" return f"list<{self.element_type}>" - def __getnewargs__(self) -> Tuple[int, IcebergType, bool]: + def __getnewargs__(self) -> tuple[int, IcebergType, bool]: """Pickle the ListType class.""" return (self.element_id, self.element_type, self.element_required) @@ -594,7 +592,7 @@ def __str__(self) -> str: """Return the string representation of the MapType class.""" return f"map<{self.key_type}, {self.value_type}>" - def __getnewargs__(self) -> Tuple[int, IcebergType, int, IcebergType, bool]: + def __getnewargs__(self) -> tuple[int, IcebergType, int, IcebergType, bool]: """Pickle the MapType class.""" return (self.key_id, self.key_type, self.value_id, self.value_type, self.value_required) diff --git a/pyiceberg/utils/bin_packing.py b/pyiceberg/utils/bin_packing.py index 825420d8b7..dc2987c3a4 100644 --- a/pyiceberg/utils/bin_packing.py +++ b/pyiceberg/utils/bin_packing.py @@ -16,11 +16,9 @@ # under the License. 
from __future__ import annotations +from collections.abc import Callable, Iterable from typing import ( - Callable, Generic, - Iterable, - List, TypeVar, ) @@ -31,7 +29,7 @@ class Bin(Generic[T]): def __init__(self, target_weight: int) -> None: self.bin_weight = 0 self.target_weight = target_weight - self.items: List[T] = [] + self.items: list[T] = [] def weight(self) -> int: return self.bin_weight @@ -45,7 +43,7 @@ def add(self, item: T, weight: int) -> None: class PackingIterator(Generic[T]): - bins: List[Bin[T]] + bins: list[Bin[T]] def __init__( self, @@ -66,7 +64,7 @@ def __iter__(self) -> PackingIterator[T]: """Return an iterator for the PackingIterator class.""" return self - def __next__(self) -> List[T]: + def __next__(self) -> list[T]: """Return the next item when iterating over the PackingIterator class.""" while True: try: @@ -115,7 +113,7 @@ def __init__(self, target_weight: int, lookback: int, largest_bin_first: bool) - self._lookback = lookback self._largest_bin_first = largest_bin_first - def pack(self, items: List[T], weight_func: Callable[[T], int]) -> List[List[T]]: + def pack(self, items: list[T], weight_func: Callable[[T], int]) -> list[list[T]]: return list( PackingIterator( items=items, @@ -126,6 +124,6 @@ def pack(self, items: List[T], weight_func: Callable[[T], int]) -> List[List[T]] ) ) - def pack_end(self, items: List[T], weight_func: Callable[[T], int]) -> List[List[T]]: + def pack_end(self, items: list[T], weight_func: Callable[[T], int]) -> list[list[T]]: packed = self.pack(items=list(reversed(items)), weight_func=weight_func) return [list(reversed(bin_items)) for bin_items in reversed(packed)] diff --git a/pyiceberg/utils/config.py b/pyiceberg/utils/config.py index 98fb292369..ab9b549d25 100644 --- a/pyiceberg/utils/config.py +++ b/pyiceberg/utils/config.py @@ -16,7 +16,6 @@ # under the License. 
import logging import os -from typing import List import strictyaml @@ -106,7 +105,7 @@ def _from_environment_variables(config: RecursiveDict) -> RecursiveDict: Amended configuration. """ - def set_property(_config: RecursiveDict, path: List[str], config_value: str) -> None: + def set_property(_config: RecursiveDict, path: list[str], config_value: str) -> None: while len(path) > 0: element = path.pop(0) if len(path) == 0: @@ -159,7 +158,7 @@ def get_catalog_config(self, catalog_name: str) -> RecursiveDict | None: return catalog_conf return None - def get_known_catalogs(self) -> List[str]: + def get_known_catalogs(self) -> list[str]: catalogs = self.config.get(CATALOG, {}) if not isinstance(catalogs, dict): raise ValueError("Catalog configurations needs to be an object") diff --git a/pyiceberg/utils/deprecated.py b/pyiceberg/utils/deprecated.py index accbd9d5fe..b5bb6d01cd 100644 --- a/pyiceberg/utils/deprecated.py +++ b/pyiceberg/utils/deprecated.py @@ -16,7 +16,8 @@ # under the License. import functools import warnings -from typing import Any, Callable +from collections.abc import Callable +from typing import Any def deprecated(deprecated_in: str, removed_in: str, help_message: str | None = None) -> Callable: # type: ignore diff --git a/pyiceberg/utils/lazydict.py b/pyiceberg/utils/lazydict.py index 4b616c5c27..b110b6291b 100644 --- a/pyiceberg/utils/lazydict.py +++ b/pyiceberg/utils/lazydict.py @@ -15,11 +15,8 @@ # specific language governing permissions and limitations # under the License. +from collections.abc import Iterator, Mapping, Sequence from typing import ( - Dict, - Iterator, - Mapping, - Sequence, TypeVar, cast, ) @@ -41,9 +38,9 @@ class LazyDict(Mapping[K, V]): # that the developer has correctly used the class and that the contents are valid. 
def __init__(self, contents: Sequence[Sequence[K | V]]): self._contents = contents - self._dict: Dict[K, V] | None = None + self._dict: dict[K, V] | None = None - def _build_dict(self) -> Dict[K, V]: + def _build_dict(self) -> dict[K, V]: self._dict = {} for item in self._contents: self._dict.update(dict(zip(cast(Sequence[K], item[::2]), cast(Sequence[V], item[1::2]), strict=True))) @@ -65,6 +62,6 @@ def __len__(self) -> int: source = self._dict or self._build_dict() return len(source) - def __dict__(self) -> Dict[K, V]: # type: ignore + def __dict__(self) -> dict[K, V]: # type: ignore """Convert the lazy dict in a dict.""" return self._dict or self._build_dict() diff --git a/pyiceberg/utils/properties.py b/pyiceberg/utils/properties.py index 11241e485c..2a95b39a50 100644 --- a/pyiceberg/utils/properties.py +++ b/pyiceberg/utils/properties.py @@ -17,7 +17,6 @@ from typing import ( Any, - Dict, ) from pyiceberg.typedef import Properties @@ -27,7 +26,7 @@ def property_as_int( - properties: Dict[str, str], + properties: dict[str, str], property_name: str, default: int | None = None, ) -> int | None: @@ -41,7 +40,7 @@ def property_as_int( def property_as_float( - properties: Dict[str, str], + properties: dict[str, str], property_name: str, default: float | None = None, ) -> float | None: @@ -55,7 +54,7 @@ def property_as_float( def property_as_bool( - properties: Dict[str, str], + properties: dict[str, str], property_name: str, default: bool, ) -> bool: diff --git a/pyiceberg/utils/schema_conversion.py b/pyiceberg/utils/schema_conversion.py index 0ec8dce084..66e57d5d9f 100644 --- a/pyiceberg/utils/schema_conversion.py +++ b/pyiceberg/utils/schema_conversion.py @@ -19,9 +19,6 @@ import logging from typing import ( Any, - Dict, - List, - Tuple, ) from pyiceberg.schema import ( @@ -59,7 +56,7 @@ logger = logging.getLogger(__name__) -PRIMITIVE_FIELD_TYPE_MAPPING: Dict[str, PrimitiveType] = { +PRIMITIVE_FIELD_TYPE_MAPPING: dict[str, PrimitiveType] = { "boolean": 
BooleanType(), "bytes": BinaryType(), "double": DoubleType(), @@ -71,7 +68,7 @@ "null": UnknownType(), } -LOGICAL_FIELD_TYPE_MAPPING: Dict[Tuple[str, str], PrimitiveType] = { +LOGICAL_FIELD_TYPE_MAPPING: dict[tuple[str, str], PrimitiveType] = { ("date", "int"): DateType(), ("time-micros", "long"): TimeType(), ("timestamp-micros", "long"): TimestampType(), @@ -83,7 +80,7 @@ class AvroSchemaConversion: - def avro_to_iceberg(self, avro_schema: Dict[str, Any]) -> Schema: + def avro_to_iceberg(self, avro_schema: dict[str, Any]) -> Schema: """Convert an Apache Avro into an Apache Iceberg schema equivalent. This expects to have field id's to be encoded in the Avro schema: @@ -132,7 +129,7 @@ def iceberg_to_avro(self, schema: Schema, schema_name: str | None = None) -> Avr """Convert an Iceberg schema into an Avro dictionary that can be serialized to JSON.""" return visit(schema, ConvertSchemaToAvro(schema_name)) - def _resolve_union(self, type_union: Dict[str, str] | List[str | Dict[str, str]] | str) -> Tuple[str | Dict[str, Any], bool]: + def _resolve_union(self, type_union: dict[str, str] | list[str | dict[str, str]] | str) -> tuple[str | dict[str, Any], bool]: """ Convert Unions into their type and resolves if the field is required. @@ -155,7 +152,7 @@ def _resolve_union(self, type_union: Dict[str, str] | List[str | Dict[str, str]] Raises: TypeError: In the case non-optional union types are encountered. 
""" - avro_types: Dict[str, str] | List[Dict[str, str] | str] + avro_types: dict[str, str] | list[dict[str, str] | str] if isinstance(type_union, str): # It is a primitive and required return type_union, True @@ -181,7 +178,7 @@ def _resolve_union(self, type_union: Dict[str, str] | List[str | Dict[str, str]] # Filter the null value and return the type return list(filter(lambda t: t != "null", avro_types))[0], False - def _convert_schema(self, avro_type: str | Dict[str, Any]) -> IcebergType: + def _convert_schema(self, avro_type: str | dict[str, Any]) -> IcebergType: """ Resolve the Avro type. @@ -219,7 +216,7 @@ def _convert_schema(self, avro_type: str | Dict[str, Any]) -> IcebergType: else: raise TypeError(f"Type not recognized: {avro_type}") - def _convert_field(self, field: Dict[str, Any]) -> NestedField: + def _convert_field(self, field: dict[str, Any]) -> NestedField: """Convert an Avro field into an Iceberg equivalent field. Args: @@ -241,7 +238,7 @@ def _convert_field(self, field: Dict[str, Any]) -> NestedField: doc=field.get("doc"), ) - def _convert_record_type(self, record_type: Dict[str, Any]) -> StructType: + def _convert_record_type(self, record_type: dict[str, Any]) -> StructType: """ Convert the fields from a record into an Iceberg struct. 
@@ -295,7 +292,7 @@ def _convert_record_type(self, record_type: Dict[str, Any]) -> StructType: return StructType(*[self._convert_field(field) for field in record_type["fields"]]) - def _convert_array_type(self, array_type: Dict[str, Any]) -> ListType: + def _convert_array_type(self, array_type: dict[str, Any]) -> ListType: if "element-id" not in array_type: raise ValueError(f"Cannot convert array-type, missing element-id: {array_type}") @@ -307,7 +304,7 @@ def _convert_array_type(self, array_type: Dict[str, Any]) -> ListType: element_required=element_required, ) - def _convert_map_type(self, map_type: Dict[str, Any]) -> MapType: + def _convert_map_type(self, map_type: dict[str, Any]) -> MapType: """Convert an avro map type into an Iceberg MapType. Args: @@ -344,7 +341,7 @@ def _convert_map_type(self, map_type: Dict[str, Any]) -> MapType: value_required=value_required, ) - def _convert_logical_type(self, avro_logical_type: Dict[str, Any]) -> IcebergType: + def _convert_logical_type(self, avro_logical_type: dict[str, Any]) -> IcebergType: """Convert a schema with a logical type annotation into an IcebergType. For the decimal and map we need to fetch more keys from the dict, and for @@ -385,7 +382,7 @@ def _convert_logical_type(self, avro_logical_type: Dict[str, Any]) -> IcebergTyp else: raise ValueError(f"Unknown logical/physical type combination: {avro_logical_type}") - def _convert_logical_decimal_type(self, avro_type: Dict[str, Any]) -> DecimalType: + def _convert_logical_decimal_type(self, avro_type: dict[str, Any]) -> DecimalType: """Convert an avro type to an Iceberg DecimalType. 
Args: @@ -412,7 +409,7 @@ def _convert_logical_decimal_type(self, avro_type: Dict[str, Any]) -> DecimalTyp """ return DecimalType(precision=avro_type["precision"], scale=avro_type["scale"]) - def _convert_logical_map_type(self, avro_type: Dict[str, Any]) -> MapType: + def _convert_logical_map_type(self, avro_type: dict[str, Any]) -> MapType: """Convert an avro map type to an Iceberg MapType. In the case where a map hasn't a key as a type you can use a logical map to still encode this in Avro. @@ -464,7 +461,7 @@ def _convert_logical_map_type(self, avro_type: Dict[str, Any]) -> MapType: value_required=value.required, ) - def _convert_fixed_type(self, avro_type: Dict[str, Any]) -> FixedType: + def _convert_fixed_type(self, avro_type: dict[str, Any]) -> FixedType: """ Convert Avro Type to the equivalent Iceberg fixed type. @@ -519,7 +516,7 @@ def before_map_key(self, key: NestedField) -> None: def before_map_value(self, value: NestedField) -> None: self.last_map_value_field_id = value.field_id - def struct(self, struct: StructType, field_results: List[AvroType]) -> AvroType: + def struct(self, struct: StructType, field_results: list[AvroType]) -> AvroType: return {"type": "record", "fields": field_results} def field(self, field: NestedField, field_result: AvroType) -> AvroType: diff --git a/pyiceberg/utils/singleton.py b/pyiceberg/utils/singleton.py index 06ee62febe..b59f43fbcd 100644 --- a/pyiceberg/utils/singleton.py +++ b/pyiceberg/utils/singleton.py @@ -28,7 +28,7 @@ More information on metaclasses: https://docs.python.org/3/reference/datamodel.html#metaclasses """ -from typing import Any, ClassVar, Dict +from typing import Any, ClassVar def _convert_to_hashable_type(element: Any) -> Any: @@ -40,7 +40,7 @@ def _convert_to_hashable_type(element: Any) -> Any: class Singleton: - _instances: ClassVar[Dict] = {} # type: ignore + _instances: ClassVar[dict] = {} # type: ignore def __new__(cls, *args, **kwargs): # type: ignore key = (cls, tuple(args), 
_convert_to_hashable_type(kwargs)) @@ -48,7 +48,7 @@ def __new__(cls, *args, **kwargs): # type: ignore cls._instances[key] = super().__new__(cls) return cls._instances[key] - def __deepcopy__(self, memo: Dict[int, Any]) -> Any: + def __deepcopy__(self, memo: dict[int, Any]) -> Any: """ Prevent deep copy operations for singletons. diff --git a/pyproject.toml b/pyproject.toml index 6d35803ac6..05f4af5f5c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,25 +45,29 @@ dependencies = [ "zstandard>=0.13.0,<1.0.0" ] +[project.urls] +Homepage = "https://py.iceberg.apache.org/" +Repository = "https://github.com/apache/iceberg-python" + +[project.scripts] +pyiceberg = "pyiceberg.cli.console:run" + [project.optional-dependencies] pyarrow = [ "pyarrow>=17.0.0", "pyiceberg-core>=0.5.1,<0.8.0", ] pandas = [ - "pyarrow>=17.0.0", - "pyiceberg-core>=0.5.1,<0.8.0", "pandas>=1.0.0,<3.0.0", + "pyarrow>=17.0.0", ] duckdb = [ - "pyarrow>=17.0.0", - "pyiceberg-core>=0.5.1,<0.8.0", "duckdb>=0.5.0,<2.0.0", + "pyarrow>=17.0.0", ] ray = [ - "pyarrow>=17.0.0", - "pyiceberg-core>=0.5.1,<0.8.0", "ray>=2.10.0,<=2.44.0", + "pyarrow>=17.0.0", "pandas>=1.0.0,<3.0.0", ] bodo = ["bodo>=2025.7.4"] @@ -93,13 +97,6 @@ pyiceberg-core = ["pyiceberg-core>=0.5.1,<0.8.0"] datafusion = ["datafusion>=45,<49"] gcp-auth = ["google-auth>=2.4.0"] -[project.urls] -Homepage = "https://py.iceberg.apache.org/" -Repository = "https://github.com/apache/iceberg-python" - -[project.scripts] -pyiceberg = "pyiceberg.cli.console:run" - [dependency-groups] dev = [ "pytest==7.4.4", @@ -113,12 +110,13 @@ dev = [ "typing-extensions==4.15.0", "pytest-mock==3.15.1", "pyspark[connect]==4.0.1", - "protobuf==6.33.0", # match Spark Connect's gencode - "cython==3.2.0", + "protobuf==6.33.1", # match Spark Connect's gencode + "cython==3.2.1", "deptry>=0.14,<0.24", "docutils!=0.21.post1", "mypy-boto3-glue>=1.28.18", "mypy-boto3-dynamodb>=1.28.18", + "pyarrow-stubs>=20.0.0.20251107", # Remove when pyarrow >= 23.0.0 
https://github.com/apache/arrow/pull/47609 ] # for mkdocs docs = [ @@ -130,7 +128,7 @@ docs = [ "mkdocs-literate-nav==0.6.2", "mkdocs-autorefs==1.4.3", "mkdocs-gen-files==0.5.0", - "mkdocs-material==9.6.23", + "mkdocs-material==9.7.0", "mkdocs-material-extensions==1.3.1", "mkdocs-section-index==0.3.10", ] diff --git a/ruff.toml b/ruff.toml index ab0ef4f90c..1ce052288e 100644 --- a/ruff.toml +++ b/ruff.toml @@ -58,12 +58,7 @@ select = [ "UP", # pyupgrade ] ignore = [ - "E501", - "B024", - "B028", - "UP037", - "UP035", - "UP006" + "E501" ] # Allow autofix for all enabled rules (when `--fix`) is provided. diff --git a/setup.py b/setup.py index 6f05a300fd..34eee94bbd 100644 --- a/setup.py +++ b/setup.py @@ -18,6 +18,23 @@ import os from setuptools import Extension, find_packages, setup +from setuptools.command.sdist import sdist as _sdist + + +class sdist(_sdist): + """Custom sdist that excludes .egg-info and setup.cfg.""" + + def make_release_tree(self, base_dir: str, files: list[str]) -> None: + # Filter egg-info from the file manifest + files = [f for f in files if ".egg-info" not in f] + + super().make_release_tree(base_dir, files) + + # Remove setup.cfg after setuptools creates it + setup_cfg = os.path.join(base_dir, "setup.cfg") + if os.path.exists(setup_cfg): + os.remove(setup_cfg) + allowed_to_fail = os.environ.get("CIBUILDWHEEL", "0") != "1" @@ -69,4 +86,5 @@ }, include_package_data=True, ext_modules=ext_modules, + cmdclass={"sdist": sdist}, ) diff --git a/tests/avro/test_decoder.py b/tests/avro/test_decoder.py index c7c64ea096..163ad8405e 100644 --- a/tests/avro/test_decoder.py +++ b/tests/avro/test_decoder.py @@ -18,9 +18,9 @@ import itertools import struct +from collections.abc import Callable from io import SEEK_SET from types import TracebackType -from typing import Callable, Type from unittest.mock import MagicMock, patch import pytest @@ -129,7 +129,7 @@ def close(self) -> None: def __enter__(self) -> OneByteAtATimeInputStream: return self - def 
__exit__(self, exctype: Type[BaseException] | None, excinst: BaseException | None, exctb: TracebackType | None) -> None: + def __exit__(self, exctype: type[BaseException] | None, excinst: BaseException | None, exctb: TracebackType | None) -> None: self.close() diff --git a/tests/avro/test_reader.py b/tests/avro/test_reader.py index c47b27e581..320fc33793 100644 --- a/tests/avro/test_reader.py +++ b/tests/avro/test_reader.py @@ -16,7 +16,7 @@ # under the License. # pylint:disable=protected-access import json -from typing import Callable +from collections.abc import Callable import pytest diff --git a/tests/catalog/integration_test_dynamodb.py b/tests/catalog/integration_test_dynamodb.py index 895f233c45..6ae14bca06 100644 --- a/tests/catalog/integration_test_dynamodb.py +++ b/tests/catalog/integration_test_dynamodb.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. -from typing import Generator, List +from collections.abc import Generator import boto3 import pytest @@ -115,7 +115,7 @@ def test_load_table(test_catalog: Catalog, table_schema_nested: Schema, database assert table.metadata == loaded_table.metadata -def test_list_tables(test_catalog: Catalog, table_schema_nested: Schema, database_name: str, table_list: List[str]) -> None: +def test_list_tables(test_catalog: Catalog, table_schema_nested: Schema, database_name: str, table_list: list[str]) -> None: test_catalog.create_namespace(database_name) for table_name in table_list: test_catalog.create_table((database_name, table_name), table_schema_nested) @@ -204,7 +204,7 @@ def test_create_namespace_with_comment_and_location(test_catalog: Catalog, datab assert properties["location"] == test_location -def test_list_namespaces(test_catalog: Catalog, database_list: List[str]) -> None: +def test_list_namespaces(test_catalog: Catalog, database_list: list[str]) -> None: for database_name in database_list: test_catalog.create_namespace(database_name) db_list = 
test_catalog.list_namespaces() diff --git a/tests/catalog/integration_test_glue.py b/tests/catalog/integration_test_glue.py index 475fc07ead..c429770268 100644 --- a/tests/catalog/integration_test_glue.py +++ b/tests/catalog/integration_test_glue.py @@ -16,7 +16,8 @@ # under the License. import time -from typing import Any, Dict, Generator, List +from collections.abc import Generator +from typing import Any from uuid import uuid4 import boto3 @@ -70,7 +71,7 @@ def __init__(self) -> None: self._output_bucket = get_bucket_name() self._output_path = f"athena_results_{uuid4()}" - def get_query_results(self, query: str) -> List[Dict[str, Any]]: + def get_query_results(self, query: str) -> list[dict[str, Any]]: query_execution_id = self._athena_client.start_query_execution( QueryString=query, ResultConfiguration={"OutputLocation": f"s3://{self._output_bucket}/{self._output_path}"} )["QueryExecutionId"] @@ -222,7 +223,7 @@ def test_load_table(test_catalog: Catalog, table_schema_nested: Schema, table_na assert MetastoreCatalog._parse_metadata_version(table.metadata_location) == 0 -def test_list_tables(test_catalog: Catalog, table_schema_nested: Schema, database_name: str, table_list: List[str]) -> None: +def test_list_tables(test_catalog: Catalog, table_schema_nested: Schema, database_name: str, table_list: list[str]) -> None: test_catalog.create_namespace(database_name) for table_name in table_list: test_catalog.create_table((database_name, table_name), table_schema_nested) @@ -312,7 +313,7 @@ def test_create_namespace_with_comment_and_location(test_catalog: Catalog, datab assert properties["location"] == test_location -def test_list_namespaces(test_catalog: Catalog, database_list: List[str]) -> None: +def test_list_namespaces(test_catalog: Catalog, database_list: list[str]) -> None: for database_name in database_list: test_catalog.create_namespace(database_name) db_list = test_catalog.list_namespaces() diff --git a/tests/catalog/test_dynamodb.py 
b/tests/catalog/test_dynamodb.py index c7c39a600d..5933e7d472 100644 --- a/tests/catalog/test_dynamodb.py +++ b/tests/catalog/test_dynamodb.py @@ -14,7 +14,6 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -from typing import List from unittest import mock import boto3 @@ -393,7 +392,7 @@ def test_fail_on_rename_non_iceberg_table( @mock_aws def test_list_tables( - _bucket_initialize: None, moto_endpoint_url: str, table_schema_nested: Schema, database_name: str, table_list: List[str] + _bucket_initialize: None, moto_endpoint_url: str, table_schema_nested: Schema, database_name: str, table_list: list[str] ) -> None: test_catalog = DynamoDbCatalog("test_ddb_catalog", **{"warehouse": f"s3://{BUCKET_NAME}", "s3.endpoint": moto_endpoint_url}) test_catalog.create_namespace(namespace=database_name) @@ -405,7 +404,7 @@ def test_list_tables( @mock_aws -def test_list_namespaces(_bucket_initialize: None, database_list: List[str]) -> None: +def test_list_namespaces(_bucket_initialize: None, database_list: list[str]) -> None: test_catalog = DynamoDbCatalog("test_ddb_catalog") for database_name in database_list: test_catalog.create_namespace(namespace=database_name) diff --git a/tests/catalog/test_glue.py b/tests/catalog/test_glue.py index 0ff43cd52b..5273db22f8 100644 --- a/tests/catalog/test_glue.py +++ b/tests/catalog/test_glue.py @@ -14,7 +14,6 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
-from typing import List from unittest import mock import boto3 @@ -439,7 +438,7 @@ def test_list_tables( moto_endpoint_url: str, table_schema_nested: Schema, database_name: str, - table_list: List[str], + table_list: list[str], ) -> None: test_catalog = GlueCatalog("glue", **{"s3.endpoint": moto_endpoint_url, "warehouse": f"s3://{BUCKET_NAME}/"}) test_catalog.create_namespace(namespace=database_name) @@ -475,7 +474,7 @@ def test_list_tables( @mock_aws -def test_list_namespaces(_bucket_initialize: None, moto_endpoint_url: str, database_list: List[str]) -> None: +def test_list_namespaces(_bucket_initialize: None, moto_endpoint_url: str, database_list: list[str]) -> None: test_catalog = GlueCatalog("glue", **{"s3.endpoint": moto_endpoint_url}) for database_name in database_list: test_catalog.create_namespace(namespace=database_name) diff --git a/tests/catalog/test_rest.py b/tests/catalog/test_rest.py index d2ecd02aba..b8bee00225 100644 --- a/tests/catalog/test_rest.py +++ b/tests/catalog/test_rest.py @@ -17,7 +17,8 @@ # pylint: disable=redefined-outer-name,unused-argument import base64 import os -from typing import Any, Callable, Dict, cast +from collections.abc import Callable +from typing import Any, cast from unittest import mock import pytest @@ -69,7 +70,7 @@ @pytest.fixture -def example_table_metadata_with_snapshot_v1_rest_json(example_table_metadata_with_snapshot_v1: Dict[str, Any]) -> Dict[str, Any]: +def example_table_metadata_with_snapshot_v1_rest_json(example_table_metadata_with_snapshot_v1: dict[str, Any]) -> dict[str, Any]: return { "metadata-location": "s3://warehouse/database/table/metadata/00001-5f2f8166-244c-4eae-ac36-384ecdec81fc.gz.metadata.json", "metadata": example_table_metadata_with_snapshot_v1, @@ -81,7 +82,7 @@ def example_table_metadata_with_snapshot_v1_rest_json(example_table_metadata_wit @pytest.fixture -def example_table_metadata_with_no_location(example_table_metadata_with_snapshot_v1: Dict[str, Any]) -> Dict[str, Any]: +def 
example_table_metadata_with_no_location(example_table_metadata_with_snapshot_v1: dict[str, Any]) -> dict[str, Any]: return { "metadata": example_table_metadata_with_snapshot_v1, "config": { @@ -92,7 +93,7 @@ def example_table_metadata_with_no_location(example_table_metadata_with_snapshot @pytest.fixture -def example_table_metadata_no_snapshot_v1_rest_json(example_table_metadata_no_snapshot_v1: Dict[str, Any]) -> Dict[str, Any]: +def example_table_metadata_no_snapshot_v1_rest_json(example_table_metadata_no_snapshot_v1: dict[str, Any]) -> dict[str, Any]: return { "metadata-location": "s3://warehouse/database/table/metadata.json", "metadata": example_table_metadata_no_snapshot_v1, @@ -837,7 +838,7 @@ def test_update_namespace_properties_404(rest_mock: Mocker) -> None: assert "Namespace does not exist" in str(e.value) -def test_load_table_200(rest_mock: Mocker, example_table_metadata_with_snapshot_v1_rest_json: Dict[str, Any]) -> None: +def test_load_table_200(rest_mock: Mocker, example_table_metadata_with_snapshot_v1_rest_json: dict[str, Any]) -> None: rest_mock.get( f"{TEST_URI}v1/namespaces/fokko/tables/table", json=example_table_metadata_with_snapshot_v1_rest_json, @@ -859,7 +860,7 @@ def test_load_table_200(rest_mock: Mocker, example_table_metadata_with_snapshot_ def test_load_table_200_loading_mode( - rest_mock: Mocker, example_table_metadata_with_snapshot_v1_rest_json: Dict[str, Any] + rest_mock: Mocker, example_table_metadata_with_snapshot_v1_rest_json: dict[str, Any] ) -> None: rest_mock.get( f"{TEST_URI}v1/namespaces/fokko/tables/table?snapshots=refs", @@ -882,7 +883,7 @@ def test_load_table_200_loading_mode( def test_load_table_honor_access_delegation( - rest_mock: Mocker, example_table_metadata_with_snapshot_v1_rest_json: Dict[str, Any] + rest_mock: Mocker, example_table_metadata_with_snapshot_v1_rest_json: dict[str, Any] ) -> None: test_headers_with_remote_signing = {**TEST_HEADERS, "X-Iceberg-Access-Delegation": "remote-signing"} rest_mock.get( @@ -914,7 
+915,7 @@ def test_load_table_honor_access_delegation( def test_load_table_from_self_identifier_200( - rest_mock: Mocker, example_table_metadata_with_snapshot_v1_rest_json: Dict[str, Any] + rest_mock: Mocker, example_table_metadata_with_snapshot_v1_rest_json: dict[str, Any] ) -> None: rest_mock.get( f"{TEST_URI}v1/namespaces/pdames/tables/table", @@ -1017,7 +1018,7 @@ def test_drop_table_404(rest_mock: Mocker) -> None: def test_create_table_200( - rest_mock: Mocker, table_schema_simple: Schema, example_table_metadata_no_snapshot_v1_rest_json: Dict[str, Any] + rest_mock: Mocker, table_schema_simple: Schema, example_table_metadata_no_snapshot_v1_rest_json: dict[str, Any] ) -> None: rest_mock.post( f"{TEST_URI}v1/namespaces/fokko/tables", @@ -1047,7 +1048,7 @@ def test_create_table_200( def test_create_table_with_given_location_removes_trailing_slash_200( - rest_mock: Mocker, table_schema_simple: Schema, example_table_metadata_no_snapshot_v1_rest_json: Dict[str, Any] + rest_mock: Mocker, table_schema_simple: Schema, example_table_metadata_no_snapshot_v1_rest_json: dict[str, Any] ) -> None: rest_mock.post( f"{TEST_URI}v1/namespaces/fokko/tables", @@ -1074,8 +1075,8 @@ def test_create_table_with_given_location_removes_trailing_slash_200( def test_create_staged_table_200( rest_mock: Mocker, table_schema_simple: Schema, - example_table_metadata_with_no_location: Dict[str, Any], - example_table_metadata_no_snapshot_v1_rest_json: Dict[str, Any], + example_table_metadata_with_no_location: dict[str, Any], + example_table_metadata_no_snapshot_v1_rest_json: dict[str, Any], ) -> None: rest_mock.post( f"{TEST_URI}v1/namespaces/fokko/tables", @@ -1163,12 +1164,12 @@ def test_create_table_409(rest_mock: Mocker, table_schema_simple: Schema) -> Non def test_create_table_if_not_exists_200( - rest_mock: Mocker, table_schema_simple: Schema, example_table_metadata_no_snapshot_v1_rest_json: Dict[str, Any] + rest_mock: Mocker, table_schema_simple: Schema, 
example_table_metadata_no_snapshot_v1_rest_json: dict[str, Any] ) -> None: - def json_callback() -> Callable[[Any, Any], Dict[str, Any]]: + def json_callback() -> Callable[[Any, Any], dict[str, Any]]: call_count = 0 - def callback(request: Any, context: Any) -> Dict[str, Any]: + def callback(request: Any, context: Any) -> dict[str, Any]: nonlocal call_count call_count += 1 @@ -1250,7 +1251,7 @@ def test_create_table_419(rest_mock: Mocker, table_schema_simple: Schema) -> Non def test_register_table_200( - rest_mock: Mocker, table_schema_simple: Schema, example_table_metadata_no_snapshot_v1_rest_json: Dict[str, Any] + rest_mock: Mocker, table_schema_simple: Schema, example_table_metadata_no_snapshot_v1_rest_json: dict[str, Any] ) -> None: rest_mock.post( f"{TEST_URI}v1/namespaces/default/register", @@ -1318,7 +1319,7 @@ def test_delete_table_204(rest_mock: Mocker) -> None: def test_delete_table_from_self_identifier_204( - rest_mock: Mocker, example_table_metadata_with_snapshot_v1_rest_json: Dict[str, Any] + rest_mock: Mocker, example_table_metadata_with_snapshot_v1_rest_json: dict[str, Any] ) -> None: rest_mock.get( f"{TEST_URI}v1/namespaces/pdames/tables/table", @@ -1337,7 +1338,7 @@ def test_delete_table_from_self_identifier_204( catalog.drop_table(table.name()) -def test_rename_table_200(rest_mock: Mocker, example_table_metadata_with_snapshot_v1_rest_json: Dict[str, Any]) -> None: +def test_rename_table_200(rest_mock: Mocker, example_table_metadata_with_snapshot_v1_rest_json: dict[str, Any]) -> None: rest_mock.post( f"{TEST_URI}v1/tables/rename", json={ @@ -1374,7 +1375,7 @@ def test_rename_table_200(rest_mock: Mocker, example_table_metadata_with_snapsho def test_rename_table_from_self_identifier_200( - rest_mock: Mocker, example_table_metadata_with_snapshot_v1_rest_json: Dict[str, Any] + rest_mock: Mocker, example_table_metadata_with_snapshot_v1_rest_json: dict[str, Any] ) -> None: rest_mock.get( f"{TEST_URI}v1/namespaces/pdames/tables/source", @@ -1795,7 +1796,7 
@@ def test_catalog_from_parameters_empty_env(rest_mock: Mocker) -> None: def test_table_identifier_in_commit_table_request( - rest_mock: Mocker, table_schema_simple: Schema, example_table_metadata_v2: Dict[str, Any] + rest_mock: Mocker, table_schema_simple: Schema, example_table_metadata_v2: dict[str, Any] ) -> None: metadata_location = "s3://some_bucket/metadata.json" rest_mock.post( diff --git a/tests/catalog/test_sql.py b/tests/catalog/test_sql.py index 00868a5739..22b9883c6f 100644 --- a/tests/catalog/test_sql.py +++ b/tests/catalog/test_sql.py @@ -16,8 +16,9 @@ # under the License. import os +from collections.abc import Generator from pathlib import Path -from typing import Any, Generator, cast +from typing import Any, cast import pyarrow as pa import pytest diff --git a/tests/conftest.py b/tests/conftest.py index 706baea38d..85c15d3e0b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -31,6 +31,7 @@ import string import time import uuid +from collections.abc import Generator from datetime import date, datetime, timezone from pathlib import Path from random import choice, randint @@ -38,9 +39,6 @@ from typing import ( TYPE_CHECKING, Any, - Dict, - Generator, - List, ) import boto3 @@ -103,7 +101,7 @@ from pyiceberg.io.pyarrow import PyArrowFileIO -def pytest_collection_modifyitems(items: List[pytest.Item]) -> None: +def pytest_collection_modifyitems(items: list[pytest.Item]) -> None: for item in items: if not any(item.iter_markers()): item.add_marker("unmarked") @@ -546,7 +544,7 @@ def iceberg_schema_nested_no_ids() -> Schema: @pytest.fixture(scope="session") -def all_avro_types() -> Dict[str, Any]: +def all_avro_types() -> dict[str, Any]: return { "type": "record", "name": "all_avro_types", @@ -650,7 +648,7 @@ def all_avro_types() -> Dict[str, Any]: @pytest.fixture(scope="session") -def example_table_metadata_v1() -> Dict[str, Any]: +def example_table_metadata_v1() -> dict[str, Any]: return EXAMPLE_TABLE_METADATA_V1 @@ -724,7 +722,7 @@ def 
example_table_metadata_v1() -> Dict[str, Any]: @pytest.fixture -def example_table_metadata_with_snapshot_v1() -> Dict[str, Any]: +def example_table_metadata_with_snapshot_v1() -> dict[str, Any]: return EXAMPLE_TABLE_METADATA_WITH_SNAPSHOT_V1 @@ -777,18 +775,18 @@ def example_table_metadata_with_snapshot_v1() -> Dict[str, Any]: @pytest.fixture -def example_table_metadata_no_snapshot_v1() -> Dict[str, Any]: +def example_table_metadata_no_snapshot_v1() -> dict[str, Any]: return EXAMPLE_TABLE_METADATA_NO_SNAPSHOT_V1 @pytest.fixture -def example_table_metadata_v2_with_extensive_snapshots() -> Dict[str, Any]: +def example_table_metadata_v2_with_extensive_snapshots() -> dict[str, Any]: def generate_snapshot( snapshot_id: int, parent_snapshot_id: int | None = None, timestamp_ms: int | None = None, sequence_number: int = 0, - ) -> Dict[str, Any]: + ) -> dict[str, Any]: return { "snapshot-id": snapshot_id, "parent-snapshot-id": parent_snapshot_id, @@ -1116,22 +1114,22 @@ def generate_snapshot( @pytest.fixture -def example_table_metadata_v2() -> Dict[str, Any]: +def example_table_metadata_v2() -> dict[str, Any]: return EXAMPLE_TABLE_METADATA_V2 @pytest.fixture -def table_metadata_v2_with_fixed_and_decimal_types() -> Dict[str, Any]: +def table_metadata_v2_with_fixed_and_decimal_types() -> dict[str, Any]: return TABLE_METADATA_V2_WITH_FIXED_AND_DECIMAL_TYPES @pytest.fixture -def table_metadata_v2_with_statistics() -> Dict[str, Any]: +def table_metadata_v2_with_statistics() -> dict[str, Any]: return TABLE_METADATA_V2_WITH_STATISTICS @pytest.fixture -def example_table_metadata_v3() -> Dict[str, Any]: +def example_table_metadata_v3() -> dict[str, Any]: return EXAMPLE_TABLE_METADATA_V3 @@ -1487,7 +1485,7 @@ def metadata_location_gz(tmp_path_factory: pytest.TempPathFactory) -> str: @pytest.fixture(scope="session") -def avro_schema_manifest_file_v1() -> Dict[str, Any]: +def avro_schema_manifest_file_v1() -> dict[str, Any]: return { "type": "record", "name": "manifest_file", @@ 
-1589,7 +1587,7 @@ def avro_schema_manifest_file_v1() -> Dict[str, Any]: @pytest.fixture(scope="session") -def avro_schema_manifest_file_v2() -> Dict[str, Any]: +def avro_schema_manifest_file_v2() -> dict[str, Any]: return { "type": "record", "name": "manifest_file", @@ -1668,7 +1666,7 @@ def avro_schema_manifest_file_v2() -> Dict[str, Any]: @pytest.fixture(scope="session") -def avro_schema_manifest_entry() -> Dict[str, Any]: +def avro_schema_manifest_entry() -> dict[str, Any]: return { "type": "record", "name": "manifest_entry", @@ -1898,7 +1896,7 @@ def test_partition_spec() -> Schema: @pytest.fixture(scope="session") def generated_manifest_entry_file( - avro_schema_manifest_entry: Dict[str, Any], test_schema: Schema, test_partition_spec: PartitionSpec + avro_schema_manifest_entry: dict[str, Any], test_schema: Schema, test_partition_spec: PartitionSpec ) -> Generator[str, None, None]: from fastavro import parse_schema, writer @@ -1921,7 +1919,7 @@ def generated_manifest_entry_file( @pytest.fixture(scope="session") def generated_manifest_file_file_v1( - avro_schema_manifest_file_v1: Dict[str, Any], generated_manifest_entry_file: str + avro_schema_manifest_file_v1: dict[str, Any], generated_manifest_entry_file: str ) -> Generator[str, None, None]: from fastavro import parse_schema, writer @@ -1939,7 +1937,7 @@ def generated_manifest_file_file_v1( @pytest.fixture(scope="session") def generated_manifest_file_file_v2( - avro_schema_manifest_file_v2: Dict[str, Any], generated_manifest_entry_file: str + avro_schema_manifest_file_v2: dict[str, Any], generated_manifest_entry_file: str ) -> Generator[str, None, None]: from fastavro import parse_schema, writer @@ -2288,7 +2286,7 @@ def table_name() -> str: @pytest.fixture() -def table_list(table_name: str) -> List[str]: +def table_list(table_name: str) -> list[str]: return [f"{table_name}_{idx}" for idx in range(NUM_TABLES)] @@ -2307,7 +2305,7 @@ def gcp_dataset_name() -> str: @pytest.fixture() -def 
database_list(database_name: str) -> List[str]: +def database_list(database_name: str) -> list[str]: return [f"{database_name}_{idx}" for idx in range(NUM_TABLES)] @@ -2320,7 +2318,7 @@ def hierarchical_namespace_name() -> str: @pytest.fixture() -def hierarchical_namespace_list(hierarchical_namespace_name: str) -> List[str]: +def hierarchical_namespace_list(hierarchical_namespace_name: str) -> list[str]: return [f"{hierarchical_namespace_name}_{idx}" for idx in range(NUM_TABLES)] @@ -2466,7 +2464,7 @@ def warehouse(tmp_path_factory: pytest.TempPathFactory) -> Path: @pytest.fixture -def table_v1(example_table_metadata_v1: Dict[str, Any]) -> Table: +def table_v1(example_table_metadata_v1: dict[str, Any]) -> Table: table_metadata = TableMetadataV1(**example_table_metadata_v1) return Table( identifier=("database", "table"), @@ -2478,7 +2476,7 @@ def table_v1(example_table_metadata_v1: Dict[str, Any]) -> Table: @pytest.fixture -def table_v2(example_table_metadata_v2: Dict[str, Any]) -> Table: +def table_v2(example_table_metadata_v2: dict[str, Any]) -> Table: table_metadata = TableMetadataV2(**example_table_metadata_v2) return Table( identifier=("database", "table"), @@ -2490,7 +2488,7 @@ def table_v2(example_table_metadata_v2: Dict[str, Any]) -> Table: @pytest.fixture -def table_v3(example_table_metadata_v3: Dict[str, Any]) -> Table: +def table_v3(example_table_metadata_v3: dict[str, Any]) -> Table: table_metadata = TableMetadataV3(**example_table_metadata_v3) return Table( identifier=("database", "table"), @@ -2502,7 +2500,7 @@ def table_v3(example_table_metadata_v3: Dict[str, Any]) -> Table: @pytest.fixture -def table_v2_orc(example_table_metadata_v2: Dict[str, Any]) -> Table: +def table_v2_orc(example_table_metadata_v2: dict[str, Any]) -> Table: import copy metadata_dict = copy.deepcopy(example_table_metadata_v2) @@ -2521,7 +2519,7 @@ def table_v2_orc(example_table_metadata_v2: Dict[str, Any]) -> Table: @pytest.fixture def table_v2_with_fixed_and_decimal_types( - 
table_metadata_v2_with_fixed_and_decimal_types: Dict[str, Any], + table_metadata_v2_with_fixed_and_decimal_types: dict[str, Any], ) -> Table: table_metadata = TableMetadataV2( **table_metadata_v2_with_fixed_and_decimal_types, @@ -2536,7 +2534,7 @@ def table_v2_with_fixed_and_decimal_types( @pytest.fixture -def table_v2_with_extensive_snapshots(example_table_metadata_v2_with_extensive_snapshots: Dict[str, Any]) -> Table: +def table_v2_with_extensive_snapshots(example_table_metadata_v2_with_extensive_snapshots: dict[str, Any]) -> Table: table_metadata = TableMetadataV2(**example_table_metadata_v2_with_extensive_snapshots) return Table( identifier=("database", "table"), @@ -2548,7 +2546,7 @@ def table_v2_with_extensive_snapshots(example_table_metadata_v2_with_extensive_s @pytest.fixture -def table_v2_with_statistics(table_metadata_v2_with_statistics: Dict[str, Any]) -> Table: +def table_v2_with_statistics(table_metadata_v2_with_statistics: dict[str, Any]) -> Table: table_metadata = TableMetadataV2(**table_metadata_v2_with_statistics) return Table( identifier=("database", "table"), @@ -2560,17 +2558,17 @@ def table_v2_with_statistics(table_metadata_v2_with_statistics: Dict[str, Any]) @pytest.fixture -def bound_reference_str() -> BoundReference[str]: +def bound_reference_str() -> BoundReference: return BoundReference(field=NestedField(1, "field", StringType(), required=False), accessor=Accessor(position=0, inner=None)) @pytest.fixture -def bound_reference_binary() -> BoundReference[str]: +def bound_reference_binary() -> BoundReference: return BoundReference(field=NestedField(1, "field", BinaryType(), required=False), accessor=Accessor(position=0, inner=None)) @pytest.fixture -def bound_reference_uuid() -> BoundReference[str]: +def bound_reference_uuid() -> BoundReference: return BoundReference(field=NestedField(1, "field", UUIDType(), required=False), accessor=Accessor(position=0, inner=None)) diff --git a/tests/expressions/test_evaluator.py 
b/tests/expressions/test_evaluator.py index 07888dd41e..5be3e92be8 100644 --- a/tests/expressions/test_evaluator.py +++ b/tests/expressions/test_evaluator.py @@ -685,7 +685,7 @@ def data_file_nan() -> DataFile: def test_inclusive_metrics_evaluator_less_than_and_less_than_equal(schema_data_file_nan: Schema, data_file_nan: DataFile) -> None: - operators: tuple[type[LiteralPredicate[Any]], ...] = (LessThan, LessThanOrEqual) + operators: tuple[type[LiteralPredicate], ...] = (LessThan, LessThanOrEqual) for operator in operators: should_read = _InclusiveMetricsEvaluator(schema_data_file_nan, operator("all_nan", 1)).eval(data_file_nan) assert not should_read, "Should not match: all nan column doesn't contain number" @@ -714,7 +714,7 @@ def test_inclusive_metrics_evaluator_less_than_and_less_than_equal(schema_data_f def test_inclusive_metrics_evaluator_greater_than_and_greater_than_equal( schema_data_file_nan: Schema, data_file_nan: DataFile ) -> None: - operators: tuple[type[LiteralPredicate[Any]], ...] = (GreaterThan, GreaterThanOrEqual) + operators: tuple[type[LiteralPredicate], ...] 
= (GreaterThan, GreaterThanOrEqual) for operator in operators: should_read = _InclusiveMetricsEvaluator(schema_data_file_nan, operator("all_nan", 1)).eval(data_file_nan) assert not should_read, "Should not match: all nan column doesn't contain number" diff --git a/tests/expressions/test_expressions.py b/tests/expressions/test_expressions.py index dbee2ca045..157c1adaf1 100644 --- a/tests/expressions/test_expressions.py +++ b/tests/expressions/test_expressions.py @@ -19,7 +19,6 @@ import pickle import uuid from decimal import Decimal -from typing import Any import pytest from typing_extensions import assert_type @@ -65,7 +64,7 @@ from pyiceberg.expressions.literals import Literal, literal from pyiceberg.expressions.visitors import _from_byte_buffer from pyiceberg.schema import Accessor, Schema -from pyiceberg.typedef import L, Record +from pyiceberg.typedef import Record from pyiceberg.types import ( DecimalType, DoubleType, @@ -502,8 +501,8 @@ def test_less_than_or_equal_invert() -> None: LessThanOrEqual(Reference("foo"), "hello"), ], ) -def test_bind(pred: UnboundPredicate[Any], table_schema_simple: Schema) -> None: - assert pred.bind(table_schema_simple, case_sensitive=True).term.field == table_schema_simple.find_field( # type: ignore +def test_bind(pred: UnboundPredicate, table_schema_simple: Schema) -> None: + assert pred.bind(table_schema_simple, case_sensitive=True).term.field == table_schema_simple.find_field( pred.term.name, # type: ignore case_sensitive=True, ) @@ -522,8 +521,8 @@ def test_bind(pred: UnboundPredicate[Any], table_schema_simple: Schema) -> None: LessThanOrEqual(Reference("Bar"), 5), ], ) -def test_bind_case_insensitive(pred: UnboundPredicate[Any], table_schema_simple: Schema) -> None: - assert pred.bind(table_schema_simple, case_sensitive=False).term.field == table_schema_simple.find_field( # type: ignore +def test_bind_case_insensitive(pred: UnboundPredicate, table_schema_simple: Schema) -> None: + assert pred.bind(table_schema_simple, 
case_sensitive=False).term.field == table_schema_simple.find_field( pred.term.name, # type: ignore case_sensitive=False, ) @@ -683,7 +682,7 @@ def accessor() -> Accessor: @pytest.fixture -def term(field: NestedField, accessor: Accessor) -> BoundReference[Any]: +def term(field: NestedField, accessor: Accessor) -> BoundReference: return BoundReference( field=field, accessor=accessor, @@ -726,6 +725,14 @@ def test_and() -> None: null & "abc" +def test_and_serialization() -> None: + expr = And(EqualTo("x", 1), GreaterThan("y", 2)) + json_repr = '{"type":"and","left":{"term":"x","type":"eq","value":1},"right":{"term":"y","type":"gt","value":2}}' + + assert expr.model_dump_json() == json_repr + assert BooleanExpression.model_validate_json(json_repr) == expr + + def test_or() -> None: null = IsNull(Reference("a")) nan = IsNaN(Reference("b")) @@ -747,11 +754,10 @@ def test_or_serialization() -> None: left = EqualTo("a", 10) right = EqualTo("b", 20) or_ = Or(left, right) + json_repr = '{"type":"or","left":{"term":"a","type":"eq","value":10},"right":{"term":"b","type":"eq","value":20}}' - assert ( - or_.model_dump_json() - == '{"type":"or","left":{"term":"a","type":"eq","value":10},"right":{"term":"b","type":"eq","value":20}}' - ) + assert or_.model_dump_json() == json_repr + assert BooleanExpression.model_validate_json(json_repr) == or_ def test_not() -> None: @@ -771,7 +777,8 @@ def test_not_json_serialization_and_deserialization() -> None: def test_always_true() -> None: always_true = AlwaysTrue() - assert always_true.model_dump_json() == '"true"' + assert always_true.model_dump_json() == "true" + assert BooleanExpression.model_validate_json("true") == always_true assert str(always_true) == "AlwaysTrue()" assert repr(always_true) == "AlwaysTrue()" assert always_true == eval(repr(always_true)) @@ -780,7 +787,8 @@ def test_always_true() -> None: def test_always_false() -> None: always_false = AlwaysFalse() - assert always_false.model_dump_json() == '"false"' + assert 
always_false.model_dump_json() == "false" + assert BooleanExpression.model_validate_json("false") == always_false assert str(always_false) == "AlwaysFalse()" assert repr(always_false) == "AlwaysFalse()" assert always_false == eval(repr(always_false)) @@ -794,14 +802,14 @@ def test_bound_reference_field_property() -> None: assert bound_ref.field == NestedField(field_id=1, name="foo", field_type=StringType(), required=False) -def test_bound_is_null(term: BoundReference[Any]) -> None: +def test_bound_is_null(term: BoundReference) -> None: bound_is_null = BoundIsNull(term) assert str(bound_is_null) == f"BoundIsNull(term={str(term)})" assert repr(bound_is_null) == f"BoundIsNull(term={repr(term)})" assert bound_is_null == eval(repr(bound_is_null)) -def test_bound_is_not_null(term: BoundReference[Any]) -> None: +def test_bound_is_not_null(term: BoundReference) -> None: bound_not_null = BoundNotNull(term) assert str(bound_not_null) == f"BoundNotNull(term={str(term)})" assert repr(bound_not_null) == f"BoundNotNull(term={repr(term)})" @@ -815,6 +823,10 @@ def test_is_null() -> None: assert repr(is_null) == f"IsNull(term={repr(ref)})" assert is_null == eval(repr(is_null)) assert is_null == pickle.loads(pickle.dumps(is_null)) + pred = IsNull(term="foo") + json_repr = '{"term":"foo","type":"is-null"}' + assert pred.model_dump_json() == json_repr + assert BooleanExpression.model_validate_json(json_repr) == pred def test_not_null() -> None: @@ -824,21 +836,15 @@ def test_not_null() -> None: assert repr(non_null) == f"NotNull(term={repr(ref)})" assert non_null == eval(repr(non_null)) assert non_null == pickle.loads(pickle.dumps(non_null)) - - -def test_serialize_is_null() -> None: - pred = IsNull(term="foo") - assert pred.model_dump_json() == '{"term":"foo","type":"is-null"}' - - -def test_serialize_not_null() -> None: pred = NotNull(term="foo") - assert pred.model_dump_json() == '{"term":"foo","type":"not-null"}' + json_repr = '{"term":"foo","type":"not-null"}' + assert 
pred.model_dump_json() == json_repr + assert BooleanExpression.model_validate_json(json_repr) == pred def test_bound_is_nan(accessor: Accessor) -> None: # We need a FloatType here - term = BoundReference[float]( + term = BoundReference( field=NestedField(field_id=1, name="foo", field_type=FloatType(), required=False), accessor=accessor, ) @@ -851,7 +857,7 @@ def test_bound_is_nan(accessor: Accessor) -> None: def test_bound_is_not_nan(accessor: Accessor) -> None: # We need a FloatType here - term = BoundReference[float]( + term = BoundReference( field=NestedField(field_id=1, name="foo", field_type=FloatType(), required=False), accessor=accessor, ) @@ -869,6 +875,9 @@ def test_is_nan() -> None: assert repr(is_nan) == f"IsNaN(term={repr(ref)})" assert is_nan == eval(repr(is_nan)) assert is_nan == pickle.loads(pickle.dumps(is_nan)) + json_repr = '{"term":"a","type":"is-nan"}' + assert is_nan.model_dump_json() == json_repr + assert BooleanExpression.model_validate_json(json_repr) == is_nan def test_not_nan() -> None: @@ -878,9 +887,12 @@ def test_not_nan() -> None: assert repr(not_nan) == f"NotNaN(term={repr(ref)})" assert not_nan == eval(repr(not_nan)) assert not_nan == pickle.loads(pickle.dumps(not_nan)) + json_repr = '{"term":"a","type":"not-nan"}' + assert not_nan.model_dump_json() == json_repr + assert BooleanExpression.model_validate_json(json_repr) == not_nan -def test_bound_in(term: BoundReference[Any]) -> None: +def test_bound_in(term: BoundReference) -> None: bound_in = BoundIn(term, {literal("a"), literal("b"), literal("c")}) assert str(bound_in) == f"BoundIn({str(term)}, {{a, b, c}})" assert repr(bound_in) == f"BoundIn({repr(term)}, {{literal('a'), literal('b'), literal('c')}})" @@ -888,7 +900,7 @@ def test_bound_in(term: BoundReference[Any]) -> None: assert bound_in == pickle.loads(pickle.dumps(bound_in)) -def test_bound_not_in(term: BoundReference[Any]) -> None: +def test_bound_not_in(term: BoundReference) -> None: bound_not_in = BoundNotIn(term, 
{literal("a"), literal("b"), literal("c")}) assert str(bound_not_in) == f"BoundNotIn({str(term)}, {{a, b, c}})" assert repr(bound_not_in) == f"BoundNotIn({repr(term)}, {{literal('a'), literal('b'), literal('c')}})" @@ -898,7 +910,10 @@ def test_bound_not_in(term: BoundReference[Any]) -> None: def test_in() -> None: ref = Reference("a") - unbound_in = In(ref, {"a", "b", "c"}) + unbound_in = In(ref, ["a", "b", "c"]) + json_repr = unbound_in.model_dump_json() + assert json_repr.startswith('{"term":"a","type":"in","values":[') + assert BooleanExpression.model_validate_json(json_repr) == unbound_in assert str(unbound_in) == f"In({str(ref)}, {{a, b, c}})" assert repr(unbound_in) == f"In({repr(ref)}, {{literal('a'), literal('b'), literal('c')}})" assert unbound_in == eval(repr(unbound_in)) @@ -907,24 +922,17 @@ def test_in() -> None: def test_not_in() -> None: ref = Reference("a") - not_in = NotIn(ref, {"a", "b", "c"}) + not_in = NotIn(ref, ["a", "b", "c"]) + json_repr = not_in.model_dump_json() + assert not_in.model_dump_json().startswith('{"term":"a","type":"not-in","values":') + assert BooleanExpression.model_validate_json(json_repr) == not_in assert str(not_in) == f"NotIn({str(ref)}, {{a, b, c}})" assert repr(not_in) == f"NotIn({repr(ref)}, {{literal('a'), literal('b'), literal('c')}})" assert not_in == eval(repr(not_in)) assert not_in == pickle.loads(pickle.dumps(not_in)) -def test_serialize_in() -> None: - pred = In(term="foo", literals=[1, 2, 3]) - assert pred.model_dump_json() == '{"term":"foo","type":"in","items":[1,2,3]}' - - -def test_serialize_not_in() -> None: - pred = NotIn(term="foo", literals=[1, 2, 3]) - assert pred.model_dump_json() == '{"term":"foo","type":"not-in","items":[1,2,3]}' - - -def test_bound_equal_to(term: BoundReference[Any]) -> None: +def test_bound_equal_to(term: BoundReference) -> None: bound_equal_to = BoundEqualTo(term, literal("a")) assert str(bound_equal_to) == f"BoundEqualTo(term={str(term)}, literal=literal('a'))" assert 
repr(bound_equal_to) == f"BoundEqualTo(term={repr(term)}, literal=literal('a'))" @@ -932,7 +940,7 @@ def test_bound_equal_to(term: BoundReference[Any]) -> None: assert bound_equal_to == pickle.loads(pickle.dumps(bound_equal_to)) -def test_bound_not_equal_to(term: BoundReference[Any]) -> None: +def test_bound_not_equal_to(term: BoundReference) -> None: bound_not_equal_to = BoundNotEqualTo(term, literal("a")) assert str(bound_not_equal_to) == f"BoundNotEqualTo(term={str(term)}, literal=literal('a'))" assert repr(bound_not_equal_to) == f"BoundNotEqualTo(term={repr(term)}, literal=literal('a'))" @@ -940,7 +948,7 @@ def test_bound_not_equal_to(term: BoundReference[Any]) -> None: assert bound_not_equal_to == pickle.loads(pickle.dumps(bound_not_equal_to)) -def test_bound_greater_than_or_equal_to(term: BoundReference[Any]) -> None: +def test_bound_greater_than_or_equal_to(term: BoundReference) -> None: bound_greater_than_or_equal_to = BoundGreaterThanOrEqual(term, literal("a")) assert str(bound_greater_than_or_equal_to) == f"BoundGreaterThanOrEqual(term={str(term)}, literal=literal('a'))" assert repr(bound_greater_than_or_equal_to) == f"BoundGreaterThanOrEqual(term={repr(term)}, literal=literal('a'))" @@ -948,7 +956,7 @@ def test_bound_greater_than_or_equal_to(term: BoundReference[Any]) -> None: assert bound_greater_than_or_equal_to == pickle.loads(pickle.dumps(bound_greater_than_or_equal_to)) -def test_bound_greater_than(term: BoundReference[Any]) -> None: +def test_bound_greater_than(term: BoundReference) -> None: bound_greater_than = BoundGreaterThan(term, literal("a")) assert str(bound_greater_than) == f"BoundGreaterThan(term={str(term)}, literal=literal('a'))" assert repr(bound_greater_than) == f"BoundGreaterThan(term={repr(term)}, literal=literal('a'))" @@ -956,7 +964,7 @@ def test_bound_greater_than(term: BoundReference[Any]) -> None: assert bound_greater_than == pickle.loads(pickle.dumps(bound_greater_than)) -def test_bound_less_than(term: BoundReference[Any]) -> 
None: +def test_bound_less_than(term: BoundReference) -> None: bound_less_than = BoundLessThan(term, literal("a")) assert str(bound_less_than) == f"BoundLessThan(term={str(term)}, literal=literal('a'))" assert repr(bound_less_than) == f"BoundLessThan(term={repr(term)}, literal=literal('a'))" @@ -964,7 +972,7 @@ def test_bound_less_than(term: BoundReference[Any]) -> None: assert bound_less_than == pickle.loads(pickle.dumps(bound_less_than)) -def test_bound_less_than_or_equal(term: BoundReference[Any]) -> None: +def test_bound_less_than_or_equal(term: BoundReference) -> None: bound_less_than_or_equal = BoundLessThanOrEqual(term, literal("a")) assert str(bound_less_than_or_equal) == f"BoundLessThanOrEqual(term={str(term)}, literal=literal('a'))" assert repr(bound_less_than_or_equal) == f"BoundLessThanOrEqual(term={repr(term)}, literal=literal('a'))" @@ -974,7 +982,9 @@ def test_bound_less_than_or_equal(term: BoundReference[Any]) -> None: def test_equal_to() -> None: equal_to = EqualTo(Reference("a"), literal("a")) - assert equal_to.model_dump_json() == '{"term":"a","type":"eq","value":"a"}' + json_repr = '{"term":"a","type":"eq","value":"a"}' + assert equal_to.model_dump_json() == json_repr + assert BooleanExpression.model_validate_json(json_repr) == equal_to assert str(equal_to) == "EqualTo(term=Reference(name='a'), literal=literal('a'))" assert repr(equal_to) == "EqualTo(term=Reference(name='a'), literal=literal('a'))" assert equal_to == eval(repr(equal_to)) @@ -983,7 +993,9 @@ def test_equal_to() -> None: def test_not_equal_to() -> None: not_equal_to = NotEqualTo(Reference("a"), literal("a")) - assert not_equal_to.model_dump_json() == '{"term":"a","type":"not-eq","value":"a"}' + json_repr = '{"term":"a","type":"not-eq","value":"a"}' + assert not_equal_to.model_dump_json() == json_repr + assert BooleanExpression.model_validate_json(json_repr) == not_equal_to assert str(not_equal_to) == "NotEqualTo(term=Reference(name='a'), literal=literal('a'))" assert 
repr(not_equal_to) == "NotEqualTo(term=Reference(name='a'), literal=literal('a'))" assert not_equal_to == eval(repr(not_equal_to)) @@ -992,7 +1004,9 @@ def test_not_equal_to() -> None: def test_greater_than_or_equal_to() -> None: greater_than_or_equal_to = GreaterThanOrEqual(Reference("a"), literal("a")) - assert greater_than_or_equal_to.model_dump_json() == '{"term":"a","type":"gt-eq","value":"a"}' + json_repr = '{"term":"a","type":"gt-eq","value":"a"}' + assert greater_than_or_equal_to.model_dump_json() == json_repr + assert BooleanExpression.model_validate_json(json_repr) == greater_than_or_equal_to assert str(greater_than_or_equal_to) == "GreaterThanOrEqual(term=Reference(name='a'), literal=literal('a'))" assert repr(greater_than_or_equal_to) == "GreaterThanOrEqual(term=Reference(name='a'), literal=literal('a'))" assert greater_than_or_equal_to == eval(repr(greater_than_or_equal_to)) @@ -1001,7 +1015,9 @@ def test_greater_than_or_equal_to() -> None: def test_greater_than() -> None: greater_than = GreaterThan(Reference("a"), literal("a")) - assert greater_than.model_dump_json() == '{"term":"a","type":"gt","value":"a"}' + json_repr = '{"term":"a","type":"gt","value":"a"}' + assert greater_than.model_dump_json() == json_repr + assert BooleanExpression.model_validate_json(json_repr) == greater_than assert str(greater_than) == "GreaterThan(term=Reference(name='a'), literal=literal('a'))" assert repr(greater_than) == "GreaterThan(term=Reference(name='a'), literal=literal('a'))" assert greater_than == eval(repr(greater_than)) @@ -1010,7 +1026,9 @@ def test_greater_than() -> None: def test_less_than() -> None: less_than = LessThan(Reference("a"), literal("a")) - assert less_than.model_dump_json() == '{"term":"a","type":"lt","value":"a"}' + json_repr = '{"term":"a","type":"lt","value":"a"}' + assert less_than.model_dump_json() == json_repr + assert BooleanExpression.model_validate_json(json_repr) == less_than assert str(less_than) == "LessThan(term=Reference(name='a'), 
literal=literal('a'))" assert repr(less_than) == "LessThan(term=Reference(name='a'), literal=literal('a'))" assert less_than == eval(repr(less_than)) @@ -1019,7 +1037,9 @@ def test_less_than() -> None: def test_less_than_or_equal() -> None: less_than_or_equal = LessThanOrEqual(Reference("a"), literal("a")) - assert less_than_or_equal.model_dump_json() == '{"term":"a","type":"lt-eq","value":"a"}' + json_repr = '{"term":"a","type":"lt-eq","value":"a"}' + assert less_than_or_equal.model_dump_json() == json_repr + assert BooleanExpression.model_validate_json(json_repr) == less_than_or_equal assert str(less_than_or_equal) == "LessThanOrEqual(term=Reference(name='a'), literal=literal('a'))" assert repr(less_than_or_equal) == "LessThanOrEqual(term=Reference(name='a'), literal=literal('a'))" assert less_than_or_equal == eval(repr(less_than_or_equal)) @@ -1028,12 +1048,16 @@ def test_less_than_or_equal() -> None: def test_starts_with() -> None: starts_with = StartsWith(Reference("a"), literal("a")) - assert starts_with.model_dump_json() == '{"term":"a","type":"starts-with","value":"a"}' + json_repr = '{"term":"a","type":"starts-with","value":"a"}' + assert starts_with.model_dump_json() == json_repr + assert BooleanExpression.model_validate_json(json_repr) == starts_with def test_not_starts_with() -> None: not_starts_with = NotStartsWith(Reference("a"), literal("a")) - assert not_starts_with.model_dump_json() == '{"term":"a","type":"not-starts-with","value":"a"}' + json_repr = '{"term":"a","type":"not-starts-with","value":"a"}' + assert not_starts_with.model_dump_json() == json_repr + assert BooleanExpression.model_validate_json(json_repr) == not_starts_with def test_bound_reference_eval(table_schema_simple: Schema) -> None: @@ -1092,37 +1116,37 @@ def below_int_min() -> Literal[int]: def test_above_int_bounds_equal_to(int_schema: Schema, above_int_max: Literal[int], below_int_min: Literal[int]) -> None: - assert EqualTo[int]("a", above_int_max).bind(int_schema) is 
AlwaysFalse() - assert EqualTo[int]("a", below_int_min).bind(int_schema) is AlwaysFalse() + assert EqualTo("a", above_int_max).bind(int_schema) is AlwaysFalse() + assert EqualTo("a", below_int_min).bind(int_schema) is AlwaysFalse() def test_above_int_bounds_not_equal_to(int_schema: Schema, above_int_max: Literal[int], below_int_min: Literal[int]) -> None: - assert NotEqualTo[int]("a", above_int_max).bind(int_schema) is AlwaysTrue() - assert NotEqualTo[int]("a", below_int_min).bind(int_schema) is AlwaysTrue() + assert NotEqualTo("a", above_int_max).bind(int_schema) is AlwaysTrue() + assert NotEqualTo("a", below_int_min).bind(int_schema) is AlwaysTrue() def test_above_int_bounds_less_than(int_schema: Schema, above_int_max: Literal[int], below_int_min: Literal[int]) -> None: - assert LessThan[int]("a", above_int_max).bind(int_schema) is AlwaysTrue() - assert LessThan[int]("a", below_int_min).bind(int_schema) is AlwaysFalse() + assert LessThan("a", above_int_max).bind(int_schema) is AlwaysTrue() + assert LessThan("a", below_int_min).bind(int_schema) is AlwaysFalse() def test_above_int_bounds_less_than_or_equal( int_schema: Schema, above_int_max: Literal[int], below_int_min: Literal[int] ) -> None: - assert LessThanOrEqual[int]("a", above_int_max).bind(int_schema) is AlwaysTrue() - assert LessThanOrEqual[int]("a", below_int_min).bind(int_schema) is AlwaysFalse() + assert LessThanOrEqual("a", above_int_max).bind(int_schema) is AlwaysTrue() + assert LessThanOrEqual("a", below_int_min).bind(int_schema) is AlwaysFalse() def test_above_int_bounds_greater_than(int_schema: Schema, above_int_max: Literal[int], below_int_min: Literal[int]) -> None: - assert GreaterThan[int]("a", above_int_max).bind(int_schema) is AlwaysFalse() - assert GreaterThan[int]("a", below_int_min).bind(int_schema) is AlwaysTrue() + assert GreaterThan("a", above_int_max).bind(int_schema) is AlwaysFalse() + assert GreaterThan("a", below_int_min).bind(int_schema) is AlwaysTrue() def 
test_above_int_bounds_greater_than_or_equal( int_schema: Schema, above_int_max: Literal[int], below_int_min: Literal[int] ) -> None: - assert GreaterThanOrEqual[int]("a", above_int_max).bind(int_schema) is AlwaysFalse() - assert GreaterThanOrEqual[int]("a", below_int_min).bind(int_schema) is AlwaysTrue() + assert GreaterThanOrEqual("a", above_int_max).bind(int_schema) is AlwaysFalse() + assert GreaterThanOrEqual("a", below_int_min).bind(int_schema) is AlwaysTrue() @pytest.fixture @@ -1143,43 +1167,43 @@ def below_float_min() -> Literal[float]: def test_above_float_bounds_equal_to( float_schema: Schema, above_float_max: Literal[float], below_float_min: Literal[float] ) -> None: - assert EqualTo[float]("a", above_float_max).bind(float_schema) is AlwaysFalse() - assert EqualTo[float]("a", below_float_min).bind(float_schema) is AlwaysFalse() + assert EqualTo("a", above_float_max).bind(float_schema) is AlwaysFalse() + assert EqualTo("a", below_float_min).bind(float_schema) is AlwaysFalse() def test_above_float_bounds_not_equal_to( float_schema: Schema, above_float_max: Literal[float], below_float_min: Literal[float] ) -> None: - assert NotEqualTo[float]("a", above_float_max).bind(float_schema) is AlwaysTrue() - assert NotEqualTo[float]("a", below_float_min).bind(float_schema) is AlwaysTrue() + assert NotEqualTo("a", above_float_max).bind(float_schema) is AlwaysTrue() + assert NotEqualTo("a", below_float_min).bind(float_schema) is AlwaysTrue() def test_above_float_bounds_less_than( float_schema: Schema, above_float_max: Literal[float], below_float_min: Literal[float] ) -> None: - assert LessThan[float]("a", above_float_max).bind(float_schema) is AlwaysTrue() - assert LessThan[float]("a", below_float_min).bind(float_schema) is AlwaysFalse() + assert LessThan("a", above_float_max).bind(float_schema) is AlwaysTrue() + assert LessThan("a", below_float_min).bind(float_schema) is AlwaysFalse() def test_above_float_bounds_less_than_or_equal( float_schema: Schema, 
above_float_max: Literal[float], below_float_min: Literal[float] ) -> None: - assert LessThanOrEqual[float]("a", above_float_max).bind(float_schema) is AlwaysTrue() - assert LessThanOrEqual[float]("a", below_float_min).bind(float_schema) is AlwaysFalse() + assert LessThanOrEqual("a", above_float_max).bind(float_schema) is AlwaysTrue() + assert LessThanOrEqual("a", below_float_min).bind(float_schema) is AlwaysFalse() def test_above_float_bounds_greater_than( float_schema: Schema, above_float_max: Literal[float], below_float_min: Literal[float] ) -> None: - assert GreaterThan[float]("a", above_float_max).bind(float_schema) is AlwaysFalse() - assert GreaterThan[float]("a", below_float_min).bind(float_schema) is AlwaysTrue() + assert GreaterThan("a", above_float_max).bind(float_schema) is AlwaysFalse() + assert GreaterThan("a", below_float_min).bind(float_schema) is AlwaysTrue() def test_above_float_bounds_greater_than_or_equal( float_schema: Schema, above_float_max: Literal[float], below_float_min: Literal[float] ) -> None: - assert GreaterThanOrEqual[float]("a", above_float_max).bind(float_schema) is AlwaysFalse() - assert GreaterThanOrEqual[float]("a", below_float_min).bind(float_schema) is AlwaysTrue() + assert GreaterThanOrEqual("a", above_float_max).bind(float_schema) is AlwaysFalse() + assert GreaterThanOrEqual("a", below_float_min).bind(float_schema) is AlwaysTrue() @pytest.fixture @@ -1198,40 +1222,40 @@ def below_long_min() -> Literal[float]: def test_above_long_bounds_equal_to(long_schema: Schema, above_long_max: Literal[int], below_long_min: Literal[int]) -> None: - assert EqualTo[int]("a", above_long_max).bind(long_schema) is AlwaysFalse() - assert EqualTo[int]("a", below_long_min).bind(long_schema) is AlwaysFalse() + assert EqualTo("a", above_long_max).bind(long_schema) is AlwaysFalse() + assert EqualTo("a", below_long_min).bind(long_schema) is AlwaysFalse() def test_above_long_bounds_not_equal_to(long_schema: Schema, above_long_max: Literal[int], 
below_long_min: Literal[int]) -> None: - assert NotEqualTo[int]("a", above_long_max).bind(long_schema) is AlwaysTrue() - assert NotEqualTo[int]("a", below_long_min).bind(long_schema) is AlwaysTrue() + assert NotEqualTo("a", above_long_max).bind(long_schema) is AlwaysTrue() + assert NotEqualTo("a", below_long_min).bind(long_schema) is AlwaysTrue() def test_above_long_bounds_less_than(long_schema: Schema, above_long_max: Literal[int], below_long_min: Literal[int]) -> None: - assert LessThan[int]("a", above_long_max).bind(long_schema) is AlwaysTrue() - assert LessThan[int]("a", below_long_min).bind(long_schema) is AlwaysFalse() + assert LessThan("a", above_long_max).bind(long_schema) is AlwaysTrue() + assert LessThan("a", below_long_min).bind(long_schema) is AlwaysFalse() def test_above_long_bounds_less_than_or_equal( long_schema: Schema, above_long_max: Literal[int], below_long_min: Literal[int] ) -> None: - assert LessThanOrEqual[int]("a", above_long_max).bind(long_schema) is AlwaysTrue() - assert LessThanOrEqual[int]("a", below_long_min).bind(long_schema) is AlwaysFalse() + assert LessThanOrEqual("a", above_long_max).bind(long_schema) is AlwaysTrue() + assert LessThanOrEqual("a", below_long_min).bind(long_schema) is AlwaysFalse() def test_above_long_bounds_greater_than(long_schema: Schema, above_long_max: Literal[int], below_long_min: Literal[int]) -> None: - assert GreaterThan[int]("a", above_long_max).bind(long_schema) is AlwaysFalse() - assert GreaterThan[int]("a", below_long_min).bind(long_schema) is AlwaysTrue() + assert GreaterThan("a", above_long_max).bind(long_schema) is AlwaysFalse() + assert GreaterThan("a", below_long_min).bind(long_schema) is AlwaysTrue() def test_above_long_bounds_greater_than_or_equal( long_schema: Schema, above_long_max: Literal[int], below_long_min: Literal[int] ) -> None: - assert GreaterThanOrEqual[int]("a", above_long_max).bind(long_schema) is AlwaysFalse() - assert GreaterThanOrEqual[int]("a", below_long_min).bind(long_schema) 
is AlwaysTrue() + assert GreaterThanOrEqual("a", above_long_max).bind(long_schema) is AlwaysFalse() + assert GreaterThanOrEqual("a", below_long_min).bind(long_schema) is AlwaysTrue() -def test_eq_bound_expression(bound_reference_str: BoundReference[str]) -> None: +def test_eq_bound_expression(bound_reference_str: BoundReference) -> None: assert BoundEqualTo(term=bound_reference_str, literal=literal("a")) != BoundGreaterThanOrEqual( term=bound_reference_str, literal=literal("a") ) @@ -1275,14 +1299,14 @@ def test_bind_ambiguous_name() -> None: # |__/ |__/ -def _assert_literal_predicate_type(expr: LiteralPredicate[L]) -> None: - assert_type(expr, LiteralPredicate[L]) +def _assert_literal_predicate_type(expr: LiteralPredicate) -> None: + assert_type(expr, LiteralPredicate) _assert_literal_predicate_type(EqualTo("a", "b")) _assert_literal_predicate_type(In("a", ("a", "b", "c"))) _assert_literal_predicate_type(In("a", (1, 2, 3))) _assert_literal_predicate_type(NotIn("a", ("a", "b", "c"))) -assert_type(In("a", ("a", "b", "c")), In[str]) -assert_type(In("a", (1, 2, 3)), In[int]) -assert_type(NotIn("a", ("a", "b", "c")), NotIn[str]) +assert_type(In("a", ("a", "b", "c")), In) +assert_type(In("a", (1, 2, 3)), In) +assert_type(NotIn("a", ("a", "b", "c")), NotIn) diff --git a/tests/expressions/test_literals.py b/tests/expressions/test_literals.py index 2137681e79..c3ace5d368 100644 --- a/tests/expressions/test_literals.py +++ b/tests/expressions/test_literals.py @@ -21,9 +21,6 @@ from decimal import Decimal from typing import ( Any, - List, - Set, - Type, ) import pytest @@ -95,14 +92,14 @@ def test_literal_from_nan_error() -> None: BinaryLiteral, ], ) -def test_literal_classes_with_none_type_error(literal_class: Type[PrimitiveType]) -> None: +def test_literal_classes_with_none_type_error(literal_class: type[PrimitiveType]) -> None: with pytest.raises(TypeError) as e: literal_class(None) assert "Invalid literal value: None" in str(e.value) 
@pytest.mark.parametrize("literal_class", [FloatLiteral, DoubleLiteral]) -def test_literal_classes_with_nan_value_error(literal_class: Type[PrimitiveType]) -> None: +def test_literal_classes_with_nan_value_error(literal_class: type[PrimitiveType]) -> None: with pytest.raises(ValueError) as e: literal_class(float("nan")) assert "Cannot create expression literal from NaN." in str(e.value) @@ -824,7 +821,7 @@ def test_invalid_binary_conversions() -> None: ) -def assert_invalid_conversions(lit: Literal[Any], types: List[PrimitiveType]) -> None: +def assert_invalid_conversions(lit: Literal[Any], types: list[PrimitiveType]) -> None: for type_var in types: with pytest.raises(TypeError): _ = lit.to(type_var) @@ -958,4 +955,4 @@ def test_to_json() -> None: assert_type(literal(123.4), Literal[float]) assert_type(literal(bytes([0x01, 0x02, 0x03])), Literal[bytes]) assert_type(literal(Decimal("19.25")), Literal[Decimal]) -assert_type({literal(1), literal(2), literal(3)}, Set[Literal[int]]) +assert_type({literal(1), literal(2), literal(3)}, set[Literal[int]]) diff --git a/tests/expressions/test_visitors.py b/tests/expressions/test_visitors.py index d0b6ab5ab4..798e9f641e 100644 --- a/tests/expressions/test_visitors.py +++ b/tests/expressions/test_visitors.py @@ -16,7 +16,7 @@ # under the License. # pylint:disable=redefined-outer-name -from typing import Any, List, Set +from typing import Any import pytest @@ -91,7 +91,7 @@ ) -class ExampleVisitor(BooleanExpressionVisitor[List[str]]): +class ExampleVisitor(BooleanExpressionVisitor[list[str]]): """A test implementation of a BooleanExpressionVisitor As this visitor visits each node, it appends an element to a `visit_history` list. 
This enables testing that a given expression is @@ -99,119 +99,119 @@ class ExampleVisitor(BooleanExpressionVisitor[List[str]]): """ def __init__(self) -> None: - self.visit_history: List[str] = [] + self.visit_history: list[str] = [] - def visit_true(self) -> List[str]: + def visit_true(self) -> list[str]: self.visit_history.append("TRUE") return self.visit_history - def visit_false(self) -> List[str]: + def visit_false(self) -> list[str]: self.visit_history.append("FALSE") return self.visit_history - def visit_not(self, child_result: List[str]) -> List[str]: + def visit_not(self, child_result: list[str]) -> list[str]: self.visit_history.append("NOT") return self.visit_history - def visit_and(self, left_result: List[str], right_result: List[str]) -> List[str]: + def visit_and(self, left_result: list[str], right_result: list[str]) -> list[str]: self.visit_history.append("AND") return self.visit_history - def visit_or(self, left_result: List[str], right_result: List[str]) -> List[str]: + def visit_or(self, left_result: list[str], right_result: list[str]) -> list[str]: self.visit_history.append("OR") return self.visit_history - def visit_unbound_predicate(self, predicate: UnboundPredicate[Any]) -> List[str]: + def visit_unbound_predicate(self, predicate: UnboundPredicate) -> list[str]: self.visit_history.append(str(predicate.__class__.__name__).upper()) return self.visit_history - def visit_bound_predicate(self, predicate: BoundPredicate[Any]) -> List[str]: + def visit_bound_predicate(self, predicate: BoundPredicate) -> list[str]: self.visit_history.append(str(predicate.__class__.__name__).upper()) return self.visit_history -class FooBoundBooleanExpressionVisitor(BoundBooleanExpressionVisitor[List[str]]): +class FooBoundBooleanExpressionVisitor(BoundBooleanExpressionVisitor[list[str]]): """A test implementation of a BoundBooleanExpressionVisitor As this visitor visits each node, it appends an element to a `visit_history` list. 
This enables testing that a given bound expression is visited in an expected order by the `visit` method. """ def __init__(self) -> None: - self.visit_history: List[str] = [] + self.visit_history: list[str] = [] - def visit_in(self, term: BoundTerm[Any], literals: Set[Any]) -> List[str]: + def visit_in(self, term: BoundTerm, literals: set[Any]) -> list[str]: self.visit_history.append("IN") return self.visit_history - def visit_not_in(self, term: BoundTerm[Any], literals: Set[Any]) -> List[str]: + def visit_not_in(self, term: BoundTerm, literals: set[Any]) -> list[str]: self.visit_history.append("NOT_IN") return self.visit_history - def visit_is_nan(self, term: BoundTerm[Any]) -> List[str]: + def visit_is_nan(self, term: BoundTerm) -> list[str]: self.visit_history.append("IS_NAN") return self.visit_history - def visit_not_nan(self, term: BoundTerm[Any]) -> List[str]: + def visit_not_nan(self, term: BoundTerm) -> list[str]: self.visit_history.append("NOT_NAN") return self.visit_history - def visit_is_null(self, term: BoundTerm[Any]) -> List[str]: + def visit_is_null(self, term: BoundTerm) -> list[str]: self.visit_history.append("IS_NULL") return self.visit_history - def visit_not_null(self, term: BoundTerm[Any]) -> List[str]: + def visit_not_null(self, term: BoundTerm) -> list[str]: self.visit_history.append("NOT_NULL") return self.visit_history - def visit_equal(self, term: BoundTerm[Any], literal: Literal[Any]) -> List[str]: # pylint: disable=redefined-outer-name + def visit_equal(self, term: BoundTerm, literal: Literal[Any]) -> list[str]: # pylint: disable=redefined-outer-name self.visit_history.append("EQUAL") return self.visit_history - def visit_not_equal(self, term: BoundTerm[Any], literal: Literal[Any]) -> List[str]: # pylint: disable=redefined-outer-name + def visit_not_equal(self, term: BoundTerm, literal: Literal[Any]) -> list[str]: # pylint: disable=redefined-outer-name self.visit_history.append("NOT_EQUAL") return self.visit_history - def 
visit_greater_than_or_equal(self, term: BoundTerm[Any], literal: Literal[Any]) -> List[str]: # pylint: disable=redefined-outer-name + def visit_greater_than_or_equal(self, term: BoundTerm, literal: Literal[Any]) -> list[str]: # pylint: disable=redefined-outer-name self.visit_history.append("GREATER_THAN_OR_EQUAL") return self.visit_history - def visit_greater_than(self, term: BoundTerm[Any], literal: Literal[Any]) -> List[str]: # pylint: disable=redefined-outer-name + def visit_greater_than(self, term: BoundTerm, literal: Literal[Any]) -> list[str]: # pylint: disable=redefined-outer-name self.visit_history.append("GREATER_THAN") return self.visit_history - def visit_less_than(self, term: BoundTerm[Any], literal: Literal[Any]) -> List[str]: # pylint: disable=redefined-outer-name + def visit_less_than(self, term: BoundTerm, literal: Literal[Any]) -> list[str]: # pylint: disable=redefined-outer-name self.visit_history.append("LESS_THAN") return self.visit_history - def visit_less_than_or_equal(self, term: BoundTerm[Any], literal: Literal[Any]) -> List[str]: # pylint: disable=redefined-outer-name + def visit_less_than_or_equal(self, term: BoundTerm, literal: Literal[Any]) -> list[str]: # pylint: disable=redefined-outer-name self.visit_history.append("LESS_THAN_OR_EQUAL") return self.visit_history - def visit_true(self) -> List[str]: + def visit_true(self) -> list[str]: self.visit_history.append("TRUE") return self.visit_history - def visit_false(self) -> List[str]: + def visit_false(self) -> list[str]: self.visit_history.append("FALSE") return self.visit_history - def visit_not(self, child_result: List[str]) -> List[str]: + def visit_not(self, child_result: list[str]) -> list[str]: self.visit_history.append("NOT") return self.visit_history - def visit_and(self, left_result: List[str], right_result: List[str]) -> List[str]: + def visit_and(self, left_result: list[str], right_result: list[str]) -> list[str]: self.visit_history.append("AND") return self.visit_history - 
def visit_or(self, left_result: List[str], right_result: List[str]) -> List[str]: + def visit_or(self, left_result: list[str], right_result: list[str]) -> list[str]: self.visit_history.append("OR") return self.visit_history - def visit_starts_with(self, term: BoundTerm[Any], literal: Literal[Any]) -> List[str]: + def visit_starts_with(self, term: BoundTerm, literal: Literal[Any]) -> list[str]: self.visit_history.append("STARTS_WITH") return self.visit_history - def visit_not_starts_with(self, term: BoundTerm[Any], literal: Literal[Any]) -> List[str]: + def visit_not_starts_with(self, term: BoundTerm, literal: Literal[Any]) -> list[str]: self.visit_history.append("NOT_STARTS_WITH") return self.visit_history @@ -253,7 +253,7 @@ def test_boolean_expression_visit_raise_not_implemented_error() -> None: def test_bind_visitor_already_bound(table_schema_simple: Schema) -> None: - bound = BoundEqualTo[str]( + bound = BoundEqualTo( term=BoundReference(table_schema_simple.find_field(1), table_schema_simple.accessor_for_field(1)), literal=literal("hello"), ) @@ -315,7 +315,7 @@ def test_always_false_or_always_true_expression_binding(table_schema_simple: Sch ), {literal("foo"), literal("bar")}, ), - BoundIn[int]( + BoundIn( BoundReference( field=NestedField(field_id=2, name="bar", field_type=IntegerType(), required=True), accessor=Accessor(position=1, inner=None), @@ -345,7 +345,7 @@ def test_always_false_or_always_true_expression_binding(table_schema_simple: Sch {literal("bar"), literal("baz")}, ), And( - BoundEqualTo[int]( + BoundEqualTo( BoundReference( field=NestedField(field_id=2, name="bar", field_type=IntegerType(), required=True), accessor=Accessor(position=1, inner=None), @@ -365,7 +365,7 @@ def test_always_false_or_always_true_expression_binding(table_schema_simple: Sch ], ) def test_and_expression_binding( - unbound_and_expression: UnboundPredicate[Any], expected_bound_expression: BoundPredicate[Any], table_schema_simple: Schema + unbound_and_expression: 
UnboundPredicate, expected_bound_expression: BoundPredicate, table_schema_simple: Schema ) -> None: """Test that visiting an unbound AND expression with a bind-visitor returns the expected bound expression""" bound_expression = visit(unbound_and_expression, visitor=BindVisitor(schema=table_schema_simple, case_sensitive=True)) @@ -388,7 +388,7 @@ def test_and_expression_binding( ), {literal("foo"), literal("bar")}, ), - BoundIn[int]( + BoundIn( BoundReference( field=NestedField(field_id=2, name="bar", field_type=IntegerType(), required=True), accessor=Accessor(position=1, inner=None), @@ -459,7 +459,7 @@ def test_and_expression_binding( ], ) def test_or_expression_binding( - unbound_or_expression: UnboundPredicate[Any], expected_bound_expression: BoundPredicate[Any], table_schema_simple: Schema + unbound_or_expression: UnboundPredicate, expected_bound_expression: BoundPredicate, table_schema_simple: Schema ) -> None: """Test that visiting an unbound OR expression with a bind-visitor returns the expected bound expression""" bound_expression = visit(unbound_or_expression, visitor=BindVisitor(schema=table_schema_simple, case_sensitive=True)) @@ -505,7 +505,7 @@ def test_or_expression_binding( ], ) def test_in_expression_binding( - unbound_in_expression: UnboundPredicate[Any], expected_bound_expression: BoundPredicate[Any], table_schema_simple: Schema + unbound_in_expression: UnboundPredicate, expected_bound_expression: BoundPredicate, table_schema_simple: Schema ) -> None: """Test that visiting an unbound IN expression with a bind-visitor returns the expected bound expression""" bound_expression = visit(unbound_in_expression, visitor=BindVisitor(schema=table_schema_simple, case_sensitive=True)) @@ -556,7 +556,7 @@ def test_in_expression_binding( ], ) def test_not_expression_binding( - unbound_not_expression: UnboundPredicate[Any], expected_bound_expression: BoundPredicate[Any], table_schema_simple: Schema + unbound_not_expression: UnboundPredicate, 
expected_bound_expression: BoundPredicate, table_schema_simple: Schema ) -> None: """Test that visiting an unbound NOT expression with a bind-visitor returns the expected bound expression""" bound_expression = visit(unbound_not_expression, visitor=BindVisitor(schema=table_schema_simple, case_sensitive=True)) @@ -1041,7 +1041,7 @@ def test_not_nan(schema: Schema, manifest: ManifestFile) -> None: def test_missing_stats(schema: Schema, manifest_no_stats: ManifestFile) -> None: - expressions: List[BooleanExpression] = [ + expressions: list[BooleanExpression] = [ LessThan(Reference("id"), 5), LessThanOrEqual(Reference("id"), 30), EqualTo(Reference("id"), 70), @@ -1590,16 +1590,16 @@ def test_to_dnf_not_and() -> None: def test_dnf_to_dask(table_schema_simple: Schema) -> None: expr = ( - BoundGreaterThan[str]( + BoundGreaterThan( term=BoundReference(table_schema_simple.find_field(1), table_schema_simple.accessor_for_field(1)), literal=literal("hello"), ), And( - BoundIn[int]( + BoundIn( term=BoundReference(table_schema_simple.find_field(2), table_schema_simple.accessor_for_field(2)), literals={literal(1), literal(2), literal(3)}, ), - BoundEqualTo[bool]( + BoundEqualTo( term=BoundReference(table_schema_simple.find_field(3), table_schema_simple.accessor_for_field(3)), literal=literal(True), ), diff --git a/tests/integration/test_add_files.py b/tests/integration/test_add_files.py index 653549ebb6..e78d4dfbe9 100644 --- a/tests/integration/test_add_files.py +++ b/tests/integration/test_add_files.py @@ -20,8 +20,8 @@ import os import re import threading +from collections.abc import Iterator from datetime import date -from typing import Iterator from unittest import mock import pyarrow as pa diff --git a/tests/integration/test_catalog.py b/tests/integration/test_catalog.py index 3590d0837e..0c77666568 100644 --- a/tests/integration/test_catalog.py +++ b/tests/integration/test_catalog.py @@ -16,8 +16,8 @@ # under the License. 
import os +from collections.abc import Generator from pathlib import Path, PosixPath -from typing import Generator, List import pytest @@ -171,7 +171,7 @@ def test_load_table(test_catalog: Catalog, table_schema_nested: Schema, database @pytest.mark.integration @pytest.mark.parametrize("test_catalog", CATALOGS) -def test_list_tables(test_catalog: Catalog, table_schema_nested: Schema, database_name: str, table_list: List[str]) -> None: +def test_list_tables(test_catalog: Catalog, table_schema_nested: Schema, database_name: str, table_list: list[str]) -> None: test_catalog.create_namespace(database_name) for table_name in table_list: test_catalog.create_table((database_name, table_name), table_schema_nested) @@ -443,7 +443,7 @@ def test_create_namespace_with_comment(test_catalog: Catalog, database_name: str @pytest.mark.integration @pytest.mark.parametrize("test_catalog", CATALOGS) -def test_list_namespaces(test_catalog: Catalog, database_list: List[str]) -> None: +def test_list_namespaces(test_catalog: Catalog, database_list: list[str]) -> None: for database_name in database_list: test_catalog.create_namespace(database_name) db_list = test_catalog.list_namespaces() diff --git a/tests/integration/test_delete_count.py b/tests/integration/test_delete_count.py index 0ba9d2d6da..bc787ff3c5 100644 --- a/tests/integration/test_delete_count.py +++ b/tests/integration/test_delete_count.py @@ -16,8 +16,8 @@ # under the License. 
# pylint:disable=redefined-outer-name import random +from collections.abc import Generator from datetime import datetime, timedelta -from typing import Generator, List import pyarrow as pa import pytest @@ -34,7 +34,7 @@ from pyiceberg.types import LongType, NestedField, StringType -def run_spark_commands(spark: SparkSession, sqls: List[str]) -> None: +def run_spark_commands(spark: SparkSession, sqls: list[str]) -> None: for sql in sqls: spark.sql(sql) diff --git a/tests/integration/test_deletes.py b/tests/integration/test_deletes.py index 21c3d12999..e3b487e465 100644 --- a/tests/integration/test_deletes.py +++ b/tests/integration/test_deletes.py @@ -15,8 +15,8 @@ # specific language governing permissions and limitations # under the License. # pylint:disable=redefined-outer-name +from collections.abc import Generator from datetime import datetime -from typing import Generator, List import pyarrow as pa import pytest @@ -34,7 +34,7 @@ from pyiceberg.types import FloatType, IntegerType, LongType, NestedField, StringType, TimestampType -def run_spark_commands(spark: SparkSession, sqls: List[str]) -> None: +def run_spark_commands(spark: SparkSession, sqls: list[str]) -> None: for sql in sqls: spark.sql(sql) diff --git a/tests/integration/test_partitioning_key.py b/tests/integration/test_partitioning_key.py index fcc5dc0e35..0419fcf23a 100644 --- a/tests/integration/test_partitioning_key.py +++ b/tests/integration/test_partitioning_key.py @@ -17,7 +17,7 @@ # pylint:disable=redefined-outer-name from datetime import date, datetime, timedelta, timezone from decimal import Decimal -from typing import Any, List +from typing import Any import pytest from pyspark.sql import SparkSession @@ -728,8 +728,8 @@ def test_partition_key( session_catalog: Catalog, spark: SparkSession, - partition_fields: List[PartitionField], - partition_values: List[Any], + partition_fields: list[PartitionField], + partition_values: list[Any], expected_partition_record: Record, 
expected_hive_partition_path_slice: str, spark_create_table_sql_for_justification: str, diff --git a/tests/integration/test_reads.py b/tests/integration/test_reads.py index 99116ad16f..785037aef3 100644 --- a/tests/integration/test_reads.py +++ b/tests/integration/test_reads.py @@ -1083,3 +1083,19 @@ def test_filter_after_arrow_scan(catalog: Catalog) -> None: scan = scan.filter("ts >= '2023-03-05T00:00:00+00:00'") assert len(scan.to_arrow()) > 0 + + +@pytest.mark.integration +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog")]) +def test_scan_source_field_missing_in_spec(catalog: Catalog, spark: SparkSession) -> None: + identifier = "default.test_dropped_field" + spark.sql(f"DROP TABLE IF EXISTS {identifier}") + spark.sql(f"CREATE TABLE {identifier} (foo int, bar int, jaz string) USING ICEBERG PARTITIONED BY (foo, bar)") + spark.sql( + f"INSERT INTO {identifier} (foo, bar, jaz) VALUES (1, 1, 'dummy data'), (1, 2, 'dummy data again'), (2, 1, 'another partition')" + ) + spark.sql(f"ALTER TABLE {identifier} DROP PARTITION FIELD foo") + spark.sql(f"ALTER TABLE {identifier} DROP COLUMN foo") + + table = catalog.load_table(identifier) + assert len(list(table.scan().plan_files())) == 3 diff --git a/tests/integration/test_rest_manifest.py b/tests/integration/test_rest_manifest.py index 5d7a3d9441..6c2bf7baed 100644 --- a/tests/integration/test_rest_manifest.py +++ b/tests/integration/test_rest_manifest.py @@ -20,7 +20,7 @@ from copy import copy from enum import Enum from tempfile import TemporaryDirectory -from typing import Any, List +from typing import Any import pytest from fastavro import reader @@ -36,7 +36,7 @@ # helper function to serialize our objects to dicts to enable # direct comparison with the dicts returned by fastavro -def todict(obj: Any, spec_keys: List[str]) -> Any: +def todict(obj: Any, spec_keys: list[str]) -> Any: if type(obj) is Record: return {key: obj[pos] for key, pos in zip(spec_keys, range(len(obj)), strict=True)} if 
isinstance(obj, dict) or isinstance(obj, LazyDict): diff --git a/tests/integration/test_writes/test_partitioned_writes.py b/tests/integration/test_writes/test_partitioned_writes.py index 1913f7beb7..d194669bd3 100644 --- a/tests/integration/test_writes/test_partitioned_writes.py +++ b/tests/integration/test_writes/test_partitioned_writes.py @@ -18,7 +18,7 @@ from datetime import date -from typing import Any, Set +from typing import Any import pyarrow as pa import pytest @@ -1038,7 +1038,7 @@ def test_append_transform_partition_verify_partitions_count( arrow_table_date_timestamps: pa.Table, table_date_timestamps_schema: Schema, transform: Transform[Any, Any], - expected_partitions: Set[Any], + expected_partitions: set[Any], format_version: int, ) -> None: # Given diff --git a/tests/integration/test_writes/test_writes.py b/tests/integration/test_writes/test_writes.py index e7bac5e3b8..835eda087c 100644 --- a/tests/integration/test_writes/test_writes.py +++ b/tests/integration/test_writes/test_writes.py @@ -24,7 +24,7 @@ from datetime import date, datetime, timedelta from decimal import Decimal from pathlib import Path -from typing import Any, Dict +from typing import Any from urllib.parse import urlparse import fastavro @@ -639,7 +639,7 @@ def test_write_parquet_compression_properties( session_catalog: Catalog, arrow_table_with_null: pa.Table, format_version: int, - properties: Dict[str, Any], + properties: dict[str, Any], expected_compression_name: str, ) -> None: identifier = "default.write_parquet_compression_properties" @@ -674,8 +674,8 @@ def test_write_parquet_other_properties( spark: SparkSession, session_catalog: Catalog, arrow_table_with_null: pa.Table, - properties: Dict[str, Any], - expected_kwargs: Dict[str, Any], + properties: dict[str, Any], + expected_kwargs: dict[str, Any], ) -> None: identifier = "default.test_write_parquet_other_properties" @@ -701,7 +701,7 @@ def test_write_parquet_unsupported_properties( spark: SparkSession, session_catalog: 
Catalog, arrow_table_with_null: pa.Table, - properties: Dict[str, str], + properties: dict[str, str], ) -> None: identifier = "default.write_parquet_unsupported_properties" diff --git a/tests/integration/test_writes/utils.py b/tests/integration/test_writes/utils.py index ce30c19477..4ab54d97e7 100644 --- a/tests/integration/test_writes/utils.py +++ b/tests/integration/test_writes/utils.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. # pylint:disable=redefined-outer-name -from typing import List, Union +from typing import Union import pyarrow as pa @@ -63,7 +63,7 @@ def _create_table( session_catalog: Catalog, identifier: str, properties: Properties = EMPTY_DICT, - data: List[pa.Table] | None = None, + data: list[pa.Table] | None = None, partition_spec: PartitionSpec = UNPARTITIONED_PARTITION_SPEC, schema: Union[Schema, "pa.Schema"] = TABLE_SCHEMA, ) -> Table: diff --git a/tests/io/test_fsspec.py b/tests/io/test_fsspec.py index 7111aaa87c..c28eb0714f 100644 --- a/tests/io/test_fsspec.py +++ b/tests/io/test_fsspec.py @@ -20,7 +20,6 @@ import tempfile import threading import uuid -from typing import List from unittest import mock import pytest @@ -59,8 +58,8 @@ def test_fsspec_local_fs_can_create_path_without_parent_dir(fsspec_fileio: Fsspe def test_fsspec_get_fs_instance_per_thread_caching(fsspec_fileio: FsspecFileIO) -> None: """Test that filesystem instances are cached per-thread by `FsspecFileIO.get_fs`""" - fs_instances: List[AbstractFileSystem] = [] - start_work_events: List[threading.Event] = [threading.Event() for _ in range(2)] + fs_instances: list[AbstractFileSystem] = [] + start_work_events: list[threading.Event] = [threading.Event() for _ in range(2)] def get_fs(start_work_event: threading.Event) -> None: # Wait to be told to actually start getting the filesystem instances diff --git a/tests/io/test_pyarrow.py b/tests/io/test_pyarrow.py index 3765ea6de6..869e60f4aa 100644 --- a/tests/io/test_pyarrow.py 
+++ b/tests/io/test_pyarrow.py @@ -22,7 +22,7 @@ import warnings from datetime import date, datetime, timezone from pathlib import Path -from typing import Any, List +from typing import Any from unittest.mock import MagicMock, patch from uuid import uuid4 @@ -633,12 +633,12 @@ def test_list_type_to_pyarrow() -> None: @pytest.fixture -def bound_reference(table_schema_simple: Schema) -> BoundReference[str]: +def bound_reference(table_schema_simple: Schema) -> BoundReference: return BoundReference(table_schema_simple.find_field(1), table_schema_simple.accessor_for_field(1)) @pytest.fixture -def bound_double_reference() -> BoundReference[float]: +def bound_double_reference() -> BoundReference: schema = Schema( NestedField(field_id=1, name="foo", field_type=DoubleType(), required=False), schema_id=1, @@ -647,68 +647,68 @@ def bound_double_reference() -> BoundReference[float]: return BoundReference(schema.find_field(1), schema.accessor_for_field(1)) -def test_expr_is_null_to_pyarrow(bound_reference: BoundReference[str]) -> None: +def test_expr_is_null_to_pyarrow(bound_reference: BoundReference) -> None: assert ( repr(expression_to_pyarrow(BoundIsNull(bound_reference))) == "" ) -def test_expr_not_null_to_pyarrow(bound_reference: BoundReference[str]) -> None: +def test_expr_not_null_to_pyarrow(bound_reference: BoundReference) -> None: assert repr(expression_to_pyarrow(BoundNotNull(bound_reference))) == "" -def test_expr_is_nan_to_pyarrow(bound_double_reference: BoundReference[str]) -> None: +def test_expr_is_nan_to_pyarrow(bound_double_reference: BoundReference) -> None: assert repr(expression_to_pyarrow(BoundIsNaN(bound_double_reference))) == "" -def test_expr_not_nan_to_pyarrow(bound_double_reference: BoundReference[str]) -> None: +def test_expr_not_nan_to_pyarrow(bound_double_reference: BoundReference) -> None: assert repr(expression_to_pyarrow(BoundNotNaN(bound_double_reference))) == "" -def test_expr_equal_to_pyarrow(bound_reference: BoundReference[str]) -> None: +def 
test_expr_equal_to_pyarrow(bound_reference: BoundReference) -> None: assert ( repr(expression_to_pyarrow(BoundEqualTo(bound_reference, literal("hello")))) == '' ) -def test_expr_not_equal_to_pyarrow(bound_reference: BoundReference[str]) -> None: +def test_expr_not_equal_to_pyarrow(bound_reference: BoundReference) -> None: assert ( repr(expression_to_pyarrow(BoundNotEqualTo(bound_reference, literal("hello")))) == '' ) -def test_expr_greater_than_or_equal_equal_to_pyarrow(bound_reference: BoundReference[str]) -> None: +def test_expr_greater_than_or_equal_equal_to_pyarrow(bound_reference: BoundReference) -> None: assert ( repr(expression_to_pyarrow(BoundGreaterThanOrEqual(bound_reference, literal("hello")))) == '= "hello")>' ) -def test_expr_greater_than_to_pyarrow(bound_reference: BoundReference[str]) -> None: +def test_expr_greater_than_to_pyarrow(bound_reference: BoundReference) -> None: assert ( repr(expression_to_pyarrow(BoundGreaterThan(bound_reference, literal("hello")))) == ' "hello")>' ) -def test_expr_less_than_to_pyarrow(bound_reference: BoundReference[str]) -> None: +def test_expr_less_than_to_pyarrow(bound_reference: BoundReference) -> None: assert ( repr(expression_to_pyarrow(BoundLessThan(bound_reference, literal("hello")))) == '' ) -def test_expr_less_than_or_equal_to_pyarrow(bound_reference: BoundReference[str]) -> None: +def test_expr_less_than_or_equal_to_pyarrow(bound_reference: BoundReference) -> None: assert ( repr(expression_to_pyarrow(BoundLessThanOrEqual(bound_reference, literal("hello")))) == '' ) -def test_expr_in_to_pyarrow(bound_reference: BoundReference[str]) -> None: +def test_expr_in_to_pyarrow(bound_reference: BoundReference) -> None: assert repr(expression_to_pyarrow(BoundIn(bound_reference, {literal("hello"), literal("world")}))) in ( """ None: ) -def test_expr_not_in_to_pyarrow(bound_reference: BoundReference[str]) -> None: +def test_expr_not_in_to_pyarrow(bound_reference: BoundReference) -> None: assert 
repr(expression_to_pyarrow(BoundNotIn(bound_reference, {literal("hello"), literal("world")}))) in ( """ None: ) -def test_expr_starts_with_to_pyarrow(bound_reference: BoundReference[str]) -> None: +def test_expr_starts_with_to_pyarrow(bound_reference: BoundReference) -> None: assert ( repr(expression_to_pyarrow(BoundStartsWith(bound_reference, literal("he")))) == '' ) -def test_expr_not_starts_with_to_pyarrow(bound_reference: BoundReference[str]) -> None: +def test_expr_not_starts_with_to_pyarrow(bound_reference: BoundReference) -> None: assert ( repr(expression_to_pyarrow(BoundNotStartsWith(bound_reference, literal("he")))) == '' ) -def test_and_to_pyarrow(bound_reference: BoundReference[str]) -> None: +def test_and_to_pyarrow(bound_reference: BoundReference) -> None: assert ( repr(expression_to_pyarrow(And(BoundEqualTo(bound_reference, literal("hello")), BoundIsNull(bound_reference)))) == '' ) -def test_or_to_pyarrow(bound_reference: BoundReference[str]) -> None: +def test_or_to_pyarrow(bound_reference: BoundReference) -> None: assert ( repr(expression_to_pyarrow(Or(BoundEqualTo(bound_reference, literal("hello")), BoundIsNull(bound_reference)))) == '' ) -def test_not_to_pyarrow(bound_reference: BoundReference[str]) -> None: +def test_not_to_pyarrow(bound_reference: BoundReference) -> None: assert ( repr(expression_to_pyarrow(Not(BoundEqualTo(bound_reference, literal("hello"))))) == '' ) -def test_always_true_to_pyarrow(bound_reference: BoundReference[str]) -> None: +def test_always_true_to_pyarrow(bound_reference: BoundReference) -> None: assert repr(expression_to_pyarrow(AlwaysTrue())) == "" -def test_always_false_to_pyarrow(bound_reference: BoundReference[str]) -> None: +def test_always_false_to_pyarrow(bound_reference: BoundReference) -> None: assert repr(expression_to_pyarrow(AlwaysFalse())) == "" @@ -1014,7 +1014,7 @@ def file_map(schema_map: Schema, tmpdir: str) -> str: def project( - schema: Schema, files: List[str], expr: BooleanExpression | None = None, 
table_schema: Schema | None = None + schema: Schema, files: list[str], expr: BooleanExpression | None = None, table_schema: Schema | None = None ) -> pa.Table: def _set_spec_id(datafile: DataFile) -> DataFile: datafile.spec_id = 0 @@ -2160,7 +2160,7 @@ def test_make_compatible_name() -> None: ([None, None, None], DateType(), None), ], ) -def test_stats_aggregator_update_min(vals: List[Any], primitive_type: PrimitiveType, expected_result: Any) -> None: +def test_stats_aggregator_update_min(vals: list[Any], primitive_type: PrimitiveType, expected_result: Any) -> None: stats = StatsAggregator(primitive_type, _primitive_to_physical(primitive_type)) for val in vals: @@ -2180,7 +2180,7 @@ def test_stats_aggregator_update_min(vals: List[Any], primitive_type: PrimitiveT ([None, None, None], DateType(), None), ], ) -def test_stats_aggregator_update_max(vals: List[Any], primitive_type: PrimitiveType, expected_result: Any) -> None: +def test_stats_aggregator_update_max(vals: list[Any], primitive_type: PrimitiveType, expected_result: Any) -> None: stats = StatsAggregator(primitive_type, _primitive_to_physical(primitive_type)) for val in vals: @@ -2804,7 +2804,7 @@ def test_pyarrow_io_multi_fs() -> None: class SomeRetryStrategy(AwsDefaultS3RetryStrategy): def __init__(self) -> None: super().__init__() - warnings.warn("Initialized SomeRetryStrategy 👍") + warnings.warn("Initialized SomeRetryStrategy 👍", stacklevel=2) def test_retry_strategy() -> None: @@ -3840,8 +3840,46 @@ def test_orc_schema_conversion_with_field_ids() -> None: id_field_no_ids = arrow_schema_no_ids.field(0) name_field_no_ids = arrow_schema_no_ids.field(1) - assert not id_field_no_ids.metadata - assert not name_field_no_ids.metadata + assert ORC_FIELD_ID_KEY not in id_field_no_ids.metadata + assert ORC_FIELD_ID_KEY not in name_field_no_ids.metadata + assert PYARROW_PARQUET_FIELD_ID_KEY not in id_field_no_ids.metadata + assert PYARROW_PARQUET_FIELD_ID_KEY not in name_field_no_ids.metadata + + +def 
test_orc_schema_conversion_with_required_attribute() -> None: + """ + Test that schema_to_pyarrow correctly adds ORC iceberg.required attribute. + To run just this test: + pytest tests/io/test_pyarrow.py -k test_orc_schema_conversion_with_required_attribute + """ + from pyiceberg.io.pyarrow import ORC_FIELD_REQUIRED_KEY, schema_to_pyarrow + from pyiceberg.manifest import FileFormat + from pyiceberg.schema import Schema + from pyiceberg.types import IntegerType, StringType + + # Define schema + schema = Schema( + NestedField(1, "id", IntegerType(), required=True), + NestedField(2, "name", StringType(), required=False), + ) + + # Test 1: Specify Parquet format + arrow_schema_default = schema_to_pyarrow(schema, file_format=FileFormat.PARQUET) + + id_field = arrow_schema_default.field(0) + name_field = arrow_schema_default.field(1) + + assert ORC_FIELD_REQUIRED_KEY not in id_field.metadata + assert ORC_FIELD_REQUIRED_KEY not in name_field.metadata + + # Test 2: Specify ORC format + arrow_schema_orc = schema_to_pyarrow(schema, file_format=FileFormat.ORC) + + id_field_orc = arrow_schema_orc.field(0) + name_field_orc = arrow_schema_orc.field(1) + + assert id_field_orc.metadata[ORC_FIELD_REQUIRED_KEY] == b"true" + assert name_field_orc.metadata[ORC_FIELD_REQUIRED_KEY] == b"false" def test_orc_batching_behavior_documentation(tmp_path: Path) -> None: diff --git a/tests/io/test_pyarrow_stats.py b/tests/io/test_pyarrow_stats.py index fd175cae60..0e628829eb 100644 --- a/tests/io/test_pyarrow_stats.py +++ b/tests/io/test_pyarrow_stats.py @@ -30,9 +30,6 @@ from decimal import Decimal from typing import ( Any, - Dict, - List, - Tuple, ) import pyarrow as pa @@ -81,8 +78,8 @@ class TestStruct: def construct_test_table( - write_statistics: bool | List[str] = True, -) -> Tuple[pq.FileMetaData, TableMetadataV1 | TableMetadataV2]: + write_statistics: bool | list[str] = True, +) -> tuple[pq.FileMetaData, TableMetadataV1 | TableMetadataV2]: table_metadata = { "format-version": 2, 
"location": "s3://bucket/test/location", @@ -143,7 +140,7 @@ def construct_test_table( _list = [[1, 2, 3], [4, 5, 6], None, [7, 8, 9]] - _maps: List[Dict[int, int] | None] = [ + _maps: list[dict[int, int] | None] = [ {1: 2, 3: 4}, None, {5: 6}, @@ -167,7 +164,7 @@ def construct_test_table( }, schema=arrow_schema, ) - metadata_collector: List[Any] = [] + metadata_collector: list[Any] = [] with pa.BufferOutputStream() as f: with pq.ParquetWriter( @@ -422,7 +419,7 @@ def test_column_metrics_mode() -> None: assert 1 not in datafile.upper_bounds -def construct_test_table_primitive_types() -> Tuple[pq.FileMetaData, TableMetadataV1 | TableMetadataV2]: +def construct_test_table_primitive_types() -> tuple[pq.FileMetaData, TableMetadataV1 | TableMetadataV2]: table_metadata = { "format-version": 2, "location": "s3://bucket/test/location", @@ -506,7 +503,7 @@ def construct_test_table_primitive_types() -> Tuple[pq.FileMetaData, TableMetada schema=arrow_schema, ) - metadata_collector: List[Any] = [] + metadata_collector: list[Any] = [] with pa.BufferOutputStream() as f: with pq.ParquetWriter(f, table.schema, metadata_collector=metadata_collector, store_decimal_as_integer=True) as writer: @@ -576,7 +573,7 @@ def test_metrics_primitive_types() -> None: assert not any(key in datafile.upper_bounds.keys() for key in [16, 17, 18]) -def construct_test_table_invalid_upper_bound() -> Tuple[pq.FileMetaData, TableMetadataV1 | TableMetadataV2]: +def construct_test_table_invalid_upper_bound() -> tuple[pq.FileMetaData, TableMetadataV1 | TableMetadataV2]: table_metadata = { "format-version": 2, "location": "s3://bucket/test/location", @@ -618,7 +615,7 @@ def construct_test_table_invalid_upper_bound() -> Tuple[pq.FileMetaData, TableMe schema=arrow_schema, ) - metadata_collector: List[Any] = [] + metadata_collector: list[Any] = [] with pa.BufferOutputStream() as f: with pq.ParquetWriter(f, table.schema, metadata_collector=metadata_collector) as writer: diff --git 
a/tests/io/test_pyarrow_visitor.py b/tests/io/test_pyarrow_visitor.py index 9d5772d01c..59a4857699 100644 --- a/tests/io/test_pyarrow_visitor.py +++ b/tests/io/test_pyarrow_visitor.py @@ -16,7 +16,6 @@ # under the License. # pylint: disable=protected-access,unused-argument,redefined-outer-name import re -from typing import Any import pyarrow as pa import pytest @@ -717,21 +716,21 @@ def test_pyarrow_schema_round_trip_ensure_large_types_and_then_small_types(pyarr @pytest.fixture -def bound_reference_str() -> BoundReference[Any]: +def bound_reference_str() -> BoundReference: return BoundReference( field=NestedField(1, "string_field", StringType(), required=False), accessor=Accessor(position=0, inner=None) ) @pytest.fixture -def bound_reference_float() -> BoundReference[Any]: +def bound_reference_float() -> BoundReference: return BoundReference( field=NestedField(2, "float_field", FloatType(), required=False), accessor=Accessor(position=1, inner=None) ) @pytest.fixture -def bound_reference_double() -> BoundReference[Any]: +def bound_reference_double() -> BoundReference: return BoundReference( field=NestedField(3, "double_field", DoubleType(), required=False), accessor=Accessor(position=2, inner=None), @@ -739,32 +738,32 @@ def bound_reference_double() -> BoundReference[Any]: @pytest.fixture -def bound_eq_str_field(bound_reference_str: BoundReference[Any]) -> BoundEqualTo[Any]: +def bound_eq_str_field(bound_reference_str: BoundReference) -> BoundEqualTo: return BoundEqualTo(term=bound_reference_str, literal=literal("hello")) @pytest.fixture -def bound_greater_than_float_field(bound_reference_float: BoundReference[Any]) -> BoundGreaterThan[Any]: +def bound_greater_than_float_field(bound_reference_float: BoundReference) -> BoundGreaterThan: return BoundGreaterThan(term=bound_reference_float, literal=literal(100)) @pytest.fixture -def bound_is_nan_float_field(bound_reference_float: BoundReference[Any]) -> BoundIsNaN[Any]: +def 
bound_is_nan_float_field(bound_reference_float: BoundReference) -> BoundIsNaN: return BoundIsNaN(bound_reference_float) @pytest.fixture -def bound_eq_double_field(bound_reference_double: BoundReference[Any]) -> BoundEqualTo[Any]: +def bound_eq_double_field(bound_reference_double: BoundReference) -> BoundEqualTo: return BoundEqualTo(term=bound_reference_double, literal=literal(False)) @pytest.fixture -def bound_is_null_double_field(bound_reference_double: BoundReference[Any]) -> BoundIsNull[Any]: +def bound_is_null_double_field(bound_reference_double: BoundReference) -> BoundIsNull: return BoundIsNull(bound_reference_double) def test_collect_null_nan_unmentioned_terms( - bound_eq_str_field: BoundEqualTo[Any], bound_is_nan_float_field: BoundIsNaN[Any], bound_is_null_double_field: BoundIsNull[Any] + bound_eq_str_field: BoundEqualTo, bound_is_nan_float_field: BoundIsNaN, bound_is_null_double_field: BoundIsNull ) -> None: bound_expr = And( Or(And(bound_eq_str_field, bound_is_nan_float_field), bound_is_null_double_field), Not(bound_is_nan_float_field) @@ -786,11 +785,11 @@ def test_collect_null_nan_unmentioned_terms( def test_collect_null_nan_unmentioned_terms_with_multiple_predicates_on_the_same_term( - bound_eq_str_field: BoundEqualTo[Any], - bound_greater_than_float_field: BoundGreaterThan[Any], - bound_is_nan_float_field: BoundIsNaN[Any], - bound_eq_double_field: BoundEqualTo[Any], - bound_is_null_double_field: BoundIsNull[Any], + bound_eq_str_field: BoundEqualTo, + bound_greater_than_float_field: BoundGreaterThan, + bound_is_nan_float_field: BoundIsNaN, + bound_eq_double_field: BoundEqualTo, + bound_is_null_double_field: BoundIsNull, ) -> None: """Test a single term appears multiple places in the expression tree""" bound_expr = And( @@ -818,11 +817,11 @@ def test_collect_null_nan_unmentioned_terms_with_multiple_predicates_on_the_same def test_expression_to_complementary_pyarrow( - bound_eq_str_field: BoundEqualTo[Any], - bound_greater_than_float_field: 
BoundGreaterThan[Any], - bound_is_nan_float_field: BoundIsNaN[Any], - bound_eq_double_field: BoundEqualTo[Any], - bound_is_null_double_field: BoundIsNull[Any], + bound_eq_str_field: BoundEqualTo, + bound_greater_than_float_field: BoundGreaterThan, + bound_is_nan_float_field: BoundIsNaN, + bound_eq_double_field: BoundEqualTo, + bound_is_null_double_field: BoundIsNull, ) -> None: bound_expr = And( Or( diff --git a/tests/table/test_expire_snapshots.py b/tests/table/test_expire_snapshots.py index d11851f246..106e5b786c 100644 --- a/tests/table/test_expire_snapshots.py +++ b/tests/table/test_expire_snapshots.py @@ -16,7 +16,6 @@ # under the License. import threading from datetime import datetime, timedelta -from typing import Dict from unittest.mock import MagicMock, Mock from uuid import uuid4 @@ -253,7 +252,7 @@ def test_thread_safety_fix() -> None: def test_concurrent_operations() -> None: """Test concurrent operations with separate ExpireSnapshots instances.""" - results: Dict[str, set[int]] = {"expire1_snapshots": set(), "expire2_snapshots": set()} + results: dict[str, set[int]] = {"expire1_snapshots": set(), "expire2_snapshots": set()} def worker1() -> None: expire1 = ExpireSnapshots(Mock()) diff --git a/tests/table/test_init.py b/tests/table/test_init.py index 5cc68b62a4..37d7f46e38 100644 --- a/tests/table/test_init.py +++ b/tests/table/test_init.py @@ -18,7 +18,7 @@ import json import uuid from copy import copy -from typing import Any, Dict +from typing import Any import pytest from pydantic import ValidationError @@ -545,7 +545,7 @@ def test_update_column(table_v1: Table, table_v2: Table) -> None: def test_add_primitive_type_column(table_v2: Table) -> None: - primitive_type: Dict[str, PrimitiveType] = { + primitive_type: dict[str, PrimitiveType] = { "boolean": BooleanType(), "int": IntegerType(), "long": LongType(), @@ -1221,7 +1221,7 @@ def test_correct_schema() -> None: assert "Snapshot not found: -1" in str(exc_info.value) -def 
test_table_properties(example_table_metadata_v2: Dict[str, Any]) -> None: +def test_table_properties(example_table_metadata_v2: dict[str, Any]) -> None: # metadata properties are all strings for k, v in example_table_metadata_v2["properties"].items(): assert isinstance(k, str) @@ -1239,7 +1239,7 @@ def test_table_properties(example_table_metadata_v2: Dict[str, Any]) -> None: assert isinstance(new_metadata.properties["property_name"], str) -def test_table_properties_raise_for_none_value(example_table_metadata_v2: Dict[str, Any]) -> None: +def test_table_properties_raise_for_none_value(example_table_metadata_v2: dict[str, Any]) -> None: property_with_none = {"property_name": None} example_table_metadata_v2 = {**example_table_metadata_v2, "properties": property_with_none} with pytest.raises(ValidationError) as exc_info: diff --git a/tests/table/test_metadata.py b/tests/table/test_metadata.py index 9141189ec5..c163c90626 100644 --- a/tests/table/test_metadata.py +++ b/tests/table/test_metadata.py @@ -19,7 +19,7 @@ import io import json from copy import copy -from typing import Any, Dict +from typing import Any from unittest.mock import MagicMock, patch from uuid import UUID @@ -57,34 +57,34 @@ ) -def test_from_dict_v1(example_table_metadata_v1: Dict[str, Any]) -> None: +def test_from_dict_v1(example_table_metadata_v1: dict[str, Any]) -> None: """Test initialization of a TableMetadata instance from a dictionary""" TableMetadataUtil.parse_obj(example_table_metadata_v1) -def test_from_dict_v1_parse_raw(example_table_metadata_v1: Dict[str, Any]) -> None: +def test_from_dict_v1_parse_raw(example_table_metadata_v1: dict[str, Any]) -> None: """Test initialization of a TableMetadata instance from a str""" TableMetadataUtil.parse_raw(json.dumps(example_table_metadata_v1)) -def test_from_dict_v2(example_table_metadata_v2: Dict[str, Any]) -> None: +def test_from_dict_v2(example_table_metadata_v2: dict[str, Any]) -> None: """Test initialization of a TableMetadata instance from a 
dictionary""" TableMetadataUtil.parse_obj(example_table_metadata_v2) -def test_from_dict_v2_parse_raw(example_table_metadata_v2: Dict[str, Any]) -> None: +def test_from_dict_v2_parse_raw(example_table_metadata_v2: dict[str, Any]) -> None: """Test initialization of a TableMetadata instance from a str""" TableMetadataUtil.parse_raw(json.dumps(example_table_metadata_v2)) -def test_from_byte_stream(example_table_metadata_v2: Dict[str, Any]) -> None: +def test_from_byte_stream(example_table_metadata_v2: dict[str, Any]) -> None: """Test generating a TableMetadata instance from a file-like byte stream""" data = bytes(json.dumps(example_table_metadata_v2), encoding=UTF8) byte_stream = io.BytesIO(data) FromByteStream.table_metadata(byte_stream=byte_stream) -def test_v2_metadata_parsing(example_table_metadata_v2: Dict[str, Any]) -> None: +def test_v2_metadata_parsing(example_table_metadata_v2: dict[str, Any]) -> None: """Test retrieving values from a TableMetadata instance of version 2""" table_metadata = TableMetadataUtil.parse_obj(example_table_metadata_v2) @@ -107,7 +107,7 @@ def test_v2_metadata_parsing(example_table_metadata_v2: Dict[str, Any]) -> None: assert table_metadata.default_sort_order_id == 3 -def test_v1_metadata_parsing_directly(example_table_metadata_v1: Dict[str, Any]) -> None: +def test_v1_metadata_parsing_directly(example_table_metadata_v1: dict[str, Any]) -> None: """Test retrieving values from a TableMetadata instance of version 1""" table_metadata = TableMetadataV1(**example_table_metadata_v1) @@ -138,14 +138,14 @@ def test_v1_metadata_parsing_directly(example_table_metadata_v1: Dict[str, Any]) assert table_metadata.default_sort_order_id == 0 -def test_parsing_correct_types(example_table_metadata_v2: Dict[str, Any]) -> None: +def test_parsing_correct_types(example_table_metadata_v2: dict[str, Any]) -> None: table_metadata = TableMetadataV2(**example_table_metadata_v2) assert isinstance(table_metadata.schemas[0], Schema) assert 
isinstance(table_metadata.schemas[0].fields[0], NestedField) assert isinstance(table_metadata.schemas[0].fields[0].field_type, LongType) -def test_updating_metadata(example_table_metadata_v2: Dict[str, Any]) -> None: +def test_updating_metadata(example_table_metadata_v2: dict[str, Any]) -> None: """Test creating a new TableMetadata instance that's an updated version of an existing TableMetadata instance""" table_metadata = TableMetadataV2(**example_table_metadata_v2) @@ -170,20 +170,20 @@ def test_updating_metadata(example_table_metadata_v2: Dict[str, Any]) -> None: assert table_metadata.schemas[-1] == Schema(**new_schema) -def test_serialize_v1(example_table_metadata_v1: Dict[str, Any]) -> None: +def test_serialize_v1(example_table_metadata_v1: dict[str, Any]) -> None: table_metadata = TableMetadataV1(**example_table_metadata_v1) table_metadata_json = table_metadata.model_dump_json() expected = """{"location":"s3://bucket/test/location","table-uuid":"d20125c8-7284-442c-9aea-15fee620737c","last-updated-ms":1602638573874,"last-column-id":3,"schemas":[{"type":"struct","fields":[{"id":1,"name":"x","type":"long","required":true},{"id":2,"name":"y","type":"long","required":true,"doc":"comment"},{"id":3,"name":"z","type":"long","required":true}],"schema-id":0,"identifier-field-ids":[]}],"current-schema-id":0,"partition-specs":[{"spec-id":0,"fields":[{"source-id":1,"field-id":1000,"transform":"identity","name":"x"}]}],"default-spec-id":0,"last-partition-id":1000,"properties":{},"snapshots":[{"snapshot-id":1925,"timestamp-ms":1602638573822,"manifest-list":"s3://bucket/test/manifest-list"}],"snapshot-log":[],"metadata-log":[],"sort-orders":[{"order-id":0,"fields":[]}],"default-sort-order-id":0,"refs":{},"statistics":[],"partition-statistics":[],"format-version":1,"schema":{"type":"struct","fields":[{"id":1,"name":"x","type":"long","required":true},{"id":2,"name":"y","type":"long","required":true,"doc":"comment"},{"id":3,"name":"z","type":"long","required":true}],"schema-id":
0,"identifier-field-ids":[]},"partition-spec":[{"name":"x","transform":"identity","source-id":1,"field-id":1000}]}""" assert table_metadata_json == expected -def test_serialize_v2(example_table_metadata_v2: Dict[str, Any]) -> None: +def test_serialize_v2(example_table_metadata_v2: dict[str, Any]) -> None: table_metadata = TableMetadataV2(**example_table_metadata_v2).model_dump_json() expected = """{"location":"s3://bucket/test/location","table-uuid":"9c12d441-03fe-4693-9a96-a0705ddf69c1","last-updated-ms":1602638573590,"last-column-id":3,"schemas":[{"type":"struct","fields":[{"id":1,"name":"x","type":"long","required":true}],"schema-id":0,"identifier-field-ids":[]},{"type":"struct","fields":[{"id":1,"name":"x","type":"long","required":true},{"id":2,"name":"y","type":"long","required":true,"doc":"comment"},{"id":3,"name":"z","type":"long","required":true}],"schema-id":1,"identifier-field-ids":[1,2]}],"current-schema-id":1,"partition-specs":[{"spec-id":0,"fields":[{"source-id":1,"field-id":1000,"transform":"identity","name":"x"}]}],"default-spec-id":0,"last-partition-id":1000,"properties":{"read.split.target.size":"134217728"},"current-snapshot-id":3055729675574597004,"snapshots":[{"snapshot-id":3051729675574597004,"sequence-number":0,"timestamp-ms":1515100955770,"manifest-list":"s3://a/b/1.avro","summary":{"operation":"append"}},{"snapshot-id":3055729675574597004,"parent-snapshot-id":3051729675574597004,"sequence-number":1,"timestamp-ms":1555100955770,"manifest-list":"s3://a/b/2.avro","summary":{"operation":"append"},"schema-id":1}],"snapshot-log":[{"snapshot-id":3051729675574597004,"timestamp-ms":1515100955770},{"snapshot-id":3055729675574597004,"timestamp-ms":1555100955770}],"metadata-log":[{"metadata-file":"s3://bucket/.../v1.json","timestamp-ms":1515100}],"sort-orders":[{"order-id":3,"fields":[{"source-id":2,"transform":"identity","direction":"asc","null-order":"nulls-first"},{"source-id":3,"transform":"bucket[4]","direction":"desc","null-order":"nulls-last"}]}],
"default-sort-order-id":3,"refs":{"test":{"snapshot-id":3051729675574597004,"type":"tag","max-ref-age-ms":10000000},"main":{"snapshot-id":3055729675574597004,"type":"branch"}},"statistics":[],"partition-statistics":[],"format-version":2,"last-sequence-number":34}""" assert table_metadata == expected -def test_serialize_v3(example_table_metadata_v3: Dict[str, Any]) -> None: +def test_serialize_v3(example_table_metadata_v3: dict[str, Any]) -> None: # Writing will be part of https://github.com/apache/iceberg-python/issues/1551 with pytest.raises(NotImplementedError) as exc_info: @@ -192,7 +192,7 @@ def test_serialize_v3(example_table_metadata_v3: Dict[str, Any]) -> None: assert "Writing V3 is not yet supported, see: https://github.com/apache/iceberg-python/issues/1551" in str(exc_info.value) -def test_migrate_v1_schemas(example_table_metadata_v1: Dict[str, Any]) -> None: +def test_migrate_v1_schemas(example_table_metadata_v1: dict[str, Any]) -> None: table_metadata = TableMetadataV1(**example_table_metadata_v1) assert isinstance(table_metadata, TableMetadataV1) @@ -200,7 +200,7 @@ def test_migrate_v1_schemas(example_table_metadata_v1: Dict[str, Any]) -> None: assert table_metadata.schemas[0] == table_metadata.schema_ -def test_migrate_v1_partition_specs(example_table_metadata_v1: Dict[str, Any]) -> None: +def test_migrate_v1_partition_specs(example_table_metadata_v1: dict[str, Any]) -> None: # Copy the example, and add a spec table_metadata = TableMetadataV1(**example_table_metadata_v1) assert isinstance(table_metadata, TableMetadataV1) @@ -281,7 +281,7 @@ def test_new_table_metadata_with_explicit_v1_format() -> None: assert actual.sort_orders == [expected_sort_order] -def test_invalid_format_version(example_table_metadata_v1: Dict[str, Any]) -> None: +def test_invalid_format_version(example_table_metadata_v1: dict[str, Any]) -> None: """Test the exception when trying to load an unknown version""" example_table_metadata_v22 = copy(example_table_metadata_v1) @@ -449,7 
+449,7 @@ def test_invalid_partition_spec() -> None: assert "default-spec-id 1 can't be found" in str(exc_info.value) -def test_v1_writing_metadata(example_table_metadata_v1: Dict[str, Any]) -> None: +def test_v1_writing_metadata(example_table_metadata_v1: dict[str, Any]) -> None: """ https://iceberg.apache.org/spec/#version-2 @@ -464,7 +464,7 @@ def test_v1_writing_metadata(example_table_metadata_v1: Dict[str, Any]) -> None: assert "last-sequence-number" not in metadata_v1 -def test_v1_metadata_for_v2(example_table_metadata_v1: Dict[str, Any]) -> None: +def test_v1_metadata_for_v2(example_table_metadata_v1: dict[str, Any]) -> None: """ https://iceberg.apache.org/spec/#version-2 @@ -548,7 +548,7 @@ def test_v1_write_metadata_for_v2() -> None: assert "partition-spec" not in metadata_v2 -def test_v2_ref_creation(example_table_metadata_v2: Dict[str, Any]) -> None: +def test_v2_ref_creation(example_table_metadata_v2: dict[str, Any]) -> None: table_metadata = TableMetadataV2(**example_table_metadata_v2) assert table_metadata.refs == { "main": SnapshotRef( diff --git a/tests/table/test_partitioning.py b/tests/table/test_partitioning.py index 0fe22391c0..576297c6f2 100644 --- a/tests/table/test_partitioning.py +++ b/tests/table/test_partitioning.py @@ -47,6 +47,7 @@ TimestampType, TimestamptzType, TimeType, + UnknownType, UUIDType, ) @@ -165,6 +166,28 @@ def test_partition_spec_to_path() -> None: assert spec.partition_to_path(record, schema) == "my%23str%25bucket=my%2Bstr/other+str%2Bbucket=%28+%29/my%21int%3Abucket=10" +def test_partition_spec_to_path_dropped_source_id() -> None: + schema = Schema( + NestedField(field_id=1, name="str", field_type=StringType(), required=False), + NestedField(field_id=2, name="other_str", field_type=StringType(), required=False), + NestedField(field_id=3, name="int", field_type=IntegerType(), required=True), + ) + + spec = PartitionSpec( + PartitionField(source_id=1, field_id=1000, transform=TruncateTransform(width=19), 
name="my#str%bucket"), + PartitionField(source_id=2, field_id=1001, transform=IdentityTransform(), name="other str+bucket"), + # Point partition field to missing source id + PartitionField(source_id=4, field_id=1002, transform=BucketTransform(num_buckets=25), name="my!int:bucket"), + spec_id=3, + ) + + record = Record("my+str", "( )", 10) + + # Both partition field names and values should be URL encoded, with spaces mapping to plus signs, to match the Java + # behaviour: https://github.com/apache/iceberg/blob/ca3db931b0f024f0412084751ac85dd4ef2da7e7/api/src/main/java/org/apache/iceberg/PartitionSpec.java#L198-L204 + assert spec.partition_to_path(record, schema) == "my%23str%25bucket=my%2Bstr/other+str%2Bbucket=%28+%29/my%21int%3Abucket=10" + + def test_partition_type(table_schema_simple: Schema) -> None: spec = PartitionSpec( PartitionField(source_id=1, field_id=1000, transform=TruncateTransform(width=19), name="str_truncate"), @@ -178,6 +201,19 @@ def test_partition_type(table_schema_simple: Schema) -> None: ) +def test_partition_type_missing_source_field(table_schema_simple: Schema) -> None: + spec = PartitionSpec( + PartitionField(source_id=1, field_id=1000, transform=TruncateTransform(width=19), name="str_truncate"), + PartitionField(source_id=10, field_id=1001, transform=BucketTransform(num_buckets=25), name="int_bucket"), + spec_id=3, + ) + + assert spec.partition_type(table_schema_simple) == StructType( + NestedField(field_id=1000, name="str_truncate", field_type=StringType(), required=False), + NestedField(field_id=1001, name="int_bucket", field_type=UnknownType(), required=False), + ) + + @pytest.mark.parametrize( "source_type, value", [ diff --git a/tests/table/test_puffin.py b/tests/table/test_puffin.py index 2140915389..bf8c82014c 100644 --- a/tests/table/test_puffin.py +++ b/tests/table/test_puffin.py @@ -15,7 +15,6 @@ # specific language governing permissions and limitations # under the License. 
from os import path -from typing import List import pytest from pyroaring import BitMap @@ -32,7 +31,7 @@ def _open_file(file: str) -> bytes: def test_map_empty() -> None: puffin = _open_file("64mapempty.bin") - expected: List[BitMap] = [] + expected: list[BitMap] = [] actual = _deserialize_bitmap(puffin) assert expected == actual diff --git a/tests/table/test_sorting.py b/tests/table/test_sorting.py index 3efda56509..cb7a2c187a 100644 --- a/tests/table/test_sorting.py +++ b/tests/table/test_sorting.py @@ -16,7 +16,7 @@ # under the License. # pylint:disable=redefined-outer-name,eval-used import json -from typing import Any, Dict +from typing import Any import pytest @@ -63,7 +63,7 @@ def test_deserialize_sort_order(sort_order: SortOrder) -> None: assert SortOrder.model_validate_json(payload) == sort_order -def test_sorting_schema(example_table_metadata_v2: Dict[str, Any]) -> None: +def test_sorting_schema(example_table_metadata_v2: dict[str, Any]) -> None: table_metadata = TableMetadataUtil.parse_raw(json.dumps(example_table_metadata_v2)) assert table_metadata.sort_orders == [ diff --git a/tests/test_avro_sanitization.py b/tests/test_avro_sanitization.py index 0ca23e3165..a053bf90ad 100644 --- a/tests/test_avro_sanitization.py +++ b/tests/test_avro_sanitization.py @@ -18,7 +18,7 @@ import tempfile -from typing import Any, Dict +from typing import Any from fastavro import reader @@ -72,7 +72,7 @@ def test_comprehensive_field_name_sanitization() -> None: schema = Schema(NestedField(field_id=1, name=original_name, field_type=StringType(), required=True)) avro_schema: AvroType = AvroSchemaConversion().iceberg_to_avro(schema) - avro_dict: Dict[str, Any] = avro_schema + avro_dict: dict[str, Any] = avro_schema assert avro_dict["fields"][0]["name"] == expected_sanitized @@ -126,7 +126,7 @@ def test_comprehensive_avro_compatibility() -> None: avro_reader = reader(fo) avro_schema: AvroType = avro_reader.writer_schema - avro_dict: Dict[str, Any] = avro_schema + avro_dict: 
dict[str, Any] = avro_schema field_names = [field["name"] for field in avro_dict["fields"]] # Expected sanitized names (matching Java implementation) @@ -143,7 +143,7 @@ def test_comprehensive_avro_compatibility() -> None: # Verify iceberg-field-name properties for field in avro_dict["fields"]: - field_dict: Dict[str, Any] = field + field_dict: dict[str, Any] = field if field_dict["name"] == "invalid_x2Efield": assert "iceberg-field-name" in field_dict assert field_dict["iceberg-field-name"] == "invalid.field" @@ -201,7 +201,7 @@ def test_emoji_field_name_sanitization() -> None: ) avro_schema: AvroType = AvroSchemaConversion().iceberg_to_avro(schema, schema_name="emoji_test") - avro_dict: Dict[str, Any] = avro_schema + avro_dict: dict[str, Any] = avro_schema field_names = [field["name"] for field in avro_dict["fields"]] expected_field_names = [ @@ -213,7 +213,7 @@ def test_emoji_field_name_sanitization() -> None: assert field_names == expected_field_names for field in avro_dict["fields"]: - field_dict: Dict[str, Any] = field + field_dict: dict[str, Any] = field if field_dict["name"] == "_x1F60E": assert field_dict["iceberg-field-name"] == "😎" elif field_dict["name"] == "_x1F60E_with_text": @@ -240,13 +240,13 @@ def test_emoji_field_name_sanitization() -> None: avro_reader = reader(fo) avro_schema_reader: AvroType = avro_reader.writer_schema - avro_dict_reader: Dict[str, Any] = avro_schema_reader + avro_dict_reader: dict[str, Any] = avro_schema_reader field_names_reader = [field["name"] for field in avro_dict_reader["fields"]] assert field_names_reader == expected_field_names for field in avro_dict_reader["fields"]: - field_dict_reader: Dict[str, Any] = field + field_dict_reader: dict[str, Any] = field if field_dict_reader["name"] == "_x1F60E": assert field_dict_reader["iceberg-field-name"] == "😎" elif field_dict_reader["name"] == "_x1F60E_with_text": diff --git a/tests/test_schema.py b/tests/test_schema.py index e0dba59eaa..589a45c3b4 100644 --- 
a/tests/test_schema.py +++ b/tests/test_schema.py @@ -16,7 +16,7 @@ # under the License. from textwrap import dedent -from typing import Any, Dict, List +from typing import Any import pyarrow as pa import pytest @@ -409,8 +409,8 @@ def test_build_position_accessors(table_schema_nested: Schema) -> None: def test_build_position_accessors_with_struct(table_schema_nested: Schema) -> None: class TestStruct(StructProtocol): - def __init__(self, pos: Dict[int, Any] = EMPTY_DICT): - self._pos: Dict[int, Any] = pos + def __init__(self, pos: dict[int, Any] = EMPTY_DICT): + self._pos: dict[int, Any] = pos def __setitem__(self, pos: int, value: Any) -> None: pass @@ -952,14 +952,14 @@ def test_unknown_type_promotion_to_non_primitive_raises_resolve_error() -> None: @pytest.fixture() -def primitive_fields() -> List[NestedField]: +def primitive_fields() -> list[NestedField]: return [ NestedField(field_id=1, name=str(primitive_type), field_type=primitive_type, required=False) for primitive_type in TEST_PRIMITIVE_TYPES ] -def test_add_top_level_primitives(primitive_fields: List[NestedField], table_v2: Table) -> None: +def test_add_top_level_primitives(primitive_fields: list[NestedField], table_v2: Table) -> None: for primitive_field in primitive_fields: new_schema = Schema(primitive_field) applied = UpdateSchema(transaction=Transaction(table_v2), schema=Schema()).union_by_name(new_schema)._apply() @@ -1025,7 +1025,7 @@ def test_add_nested_primitive(primitive_fields: NestedField, table_v2: Table) -> assert applied.as_struct() == new_schema.as_struct() -def _primitive_fields(types: List[PrimitiveType], start_id: int = 0) -> List[NestedField]: +def _primitive_fields(types: list[PrimitiveType], start_id: int = 0) -> list[NestedField]: fields = [] for iceberg_type in types: fields.append(NestedField(field_id=start_id, name=str(iceberg_type), field_type=iceberg_type, required=False)) diff --git a/tests/test_serializers.py b/tests/test_serializers.py index 3f2bd73e48..53ce6fcd42 100644 
--- a/tests/test_serializers.py +++ b/tests/test_serializers.py @@ -18,7 +18,7 @@ import json import os import uuid -from typing import Any, Dict, Tuple +from typing import Any import pytest from pytest_mock import MockFixture @@ -31,7 +31,7 @@ def test_legacy_current_snapshot_id( - mocker: MockFixture, tmp_path_factory: pytest.TempPathFactory, example_table_metadata_no_snapshot_v1: Dict[str, Any] + mocker: MockFixture, tmp_path_factory: pytest.TempPathFactory, example_table_metadata_no_snapshot_v1: dict[str, Any] ) -> None: from pyiceberg.io.pyarrow import PyArrowFileIO @@ -54,7 +54,7 @@ def test_legacy_current_snapshot_id( def test_null_serializer_field() -> None: class ExampleRequest(IcebergBaseModel): - requirements: Tuple[TableRequirement, ...] + requirements: tuple[TableRequirement, ...] request = ExampleRequest(requirements=(AssertRefSnapshotId(ref="main", snapshot_id=None),)) dumped_json = request.model_dump_json() diff --git a/tests/test_transforms.py b/tests/test_transforms.py index 3d9bfcb555..96500907cf 100644 --- a/tests/test_transforms.py +++ b/tests/test_transforms.py @@ -16,9 +16,10 @@ # specific language governing permissions and limitations # under the License. 
# pylint: disable=eval-used,protected-access,redefined-outer-name +from collections.abc import Callable from datetime import date from decimal import Decimal -from typing import Annotated, Any, Callable +from typing import Annotated, Any from uuid import UUID import mmh3 as mmh3 @@ -91,7 +92,7 @@ YearTransform, parse_transform, ) -from pyiceberg.typedef import UTF8, L +from pyiceberg.typedef import UTF8 from pyiceberg.types import ( BinaryType, BooleanType, @@ -650,146 +651,146 @@ def test_datetime_transform_repr(transform: TimeTransform[Any], transform_repr: @pytest.fixture -def bound_reference_date() -> BoundReference[int]: +def bound_reference_date() -> BoundReference: return BoundReference(field=NestedField(1, "field", DateType(), required=False), accessor=Accessor(position=0, inner=None)) @pytest.fixture -def bound_reference_timestamp() -> BoundReference[int]: +def bound_reference_timestamp() -> BoundReference: return BoundReference( field=NestedField(1, "field", TimestampType(), required=False), accessor=Accessor(position=0, inner=None) ) @pytest.fixture -def bound_reference_decimal() -> BoundReference[Decimal]: +def bound_reference_decimal() -> BoundReference: return BoundReference( field=NestedField(1, "field", DecimalType(8, 2), required=False), accessor=Accessor(position=0, inner=None) ) @pytest.fixture -def bound_reference_int() -> BoundReference[int]: +def bound_reference_int() -> BoundReference: return BoundReference(field=NestedField(1, "field", IntegerType(), required=False), accessor=Accessor(position=0, inner=None)) @pytest.fixture -def bound_reference_long() -> BoundReference[int]: +def bound_reference_long() -> BoundReference: return BoundReference(field=NestedField(1, "field", LongType(), required=False), accessor=Accessor(position=0, inner=None)) -def test_projection_bucket_unary(bound_reference_str: BoundReference[str]) -> None: +def test_projection_bucket_unary(bound_reference_str: BoundReference) -> None: assert 
BucketTransform(2).project("name", BoundNotNull(term=bound_reference_str)) == NotNull(term=Reference(name="name")) -def test_projection_bucket_literal(bound_reference_str: BoundReference[str]) -> None: +def test_projection_bucket_literal(bound_reference_str: BoundReference) -> None: assert BucketTransform(2).project("name", BoundEqualTo(term=bound_reference_str, literal=literal("data"))) == EqualTo( term="name", literal=1 ) -def test_projection_bucket_set_same_bucket(bound_reference_str: BoundReference[str]) -> None: +def test_projection_bucket_set_same_bucket(bound_reference_str: BoundReference) -> None: assert BucketTransform(2).project( "name", BoundIn(term=bound_reference_str, literals={literal("hello"), literal("world")}) ) == EqualTo(term="name", literal=1) -def test_projection_bucket_set_in(bound_reference_str: BoundReference[str]) -> None: +def test_projection_bucket_set_in(bound_reference_str: BoundReference) -> None: assert BucketTransform(3).project( "name", BoundIn(term=bound_reference_str, literals={literal("hello"), literal("world")}) ) == In(term="name", literals={1, 2}) -def test_projection_bucket_set_not_in(bound_reference_str: BoundReference[str]) -> None: +def test_projection_bucket_set_not_in(bound_reference_str: BoundReference) -> None: assert ( BucketTransform(3).project("name", BoundNotIn(term=bound_reference_str, literals={literal("hello"), literal("world")})) is None ) -def test_projection_year_unary(bound_reference_date: BoundReference[int]) -> None: +def test_projection_year_unary(bound_reference_date: BoundReference) -> None: assert YearTransform().project("name", BoundNotNull(term=bound_reference_date)) == NotNull(term="name") -def test_projection_year_literal(bound_reference_date: BoundReference[int]) -> None: +def test_projection_year_literal(bound_reference_date: BoundReference) -> None: assert YearTransform().project("name", BoundEqualTo(term=bound_reference_date, literal=DateLiteral(1925))) == EqualTo( term="name", literal=5 ) -def 
test_projection_year_set_same_year(bound_reference_date: BoundReference[int]) -> None: +def test_projection_year_set_same_year(bound_reference_date: BoundReference) -> None: assert YearTransform().project( "name", BoundIn(term=bound_reference_date, literals={DateLiteral(1925), DateLiteral(1926)}) ) == EqualTo(term="name", literal=5) -def test_projection_year_set_in(bound_reference_date: BoundReference[int]) -> None: +def test_projection_year_set_in(bound_reference_date: BoundReference) -> None: assert YearTransform().project( "name", BoundIn(term=bound_reference_date, literals={DateLiteral(1925), DateLiteral(2925)}) ) == In(term="name", literals={8, 5}) -def test_projection_year_set_not_in(bound_reference_date: BoundReference[int]) -> None: +def test_projection_year_set_not_in(bound_reference_date: BoundReference) -> None: assert ( YearTransform().project("name", BoundNotIn(term=bound_reference_date, literals={DateLiteral(1925), DateLiteral(2925)})) is None ) -def test_projection_month_unary(bound_reference_date: BoundReference[int]) -> None: +def test_projection_month_unary(bound_reference_date: BoundReference) -> None: assert MonthTransform().project("name", BoundNotNull(term=bound_reference_date)) == NotNull(term="name") -def test_projection_month_literal(bound_reference_date: BoundReference[int]) -> None: +def test_projection_month_literal(bound_reference_date: BoundReference) -> None: assert MonthTransform().project("name", BoundEqualTo(term=bound_reference_date, literal=DateLiteral(1925))) == EqualTo( term="name", literal=63 ) -def test_projection_month_set_same_month(bound_reference_date: BoundReference[int]) -> None: +def test_projection_month_set_same_month(bound_reference_date: BoundReference) -> None: assert MonthTransform().project( "name", BoundIn(term=bound_reference_date, literals={DateLiteral(1925), DateLiteral(1926)}) ) == EqualTo(term="name", literal=63) -def test_projection_month_set_in(bound_reference_date: BoundReference[int]) -> None: +def 
test_projection_month_set_in(bound_reference_date: BoundReference) -> None: assert MonthTransform().project( "name", BoundIn(term=bound_reference_date, literals={DateLiteral(1925), DateLiteral(2925)}) ) == In(term="name", literals={96, 63}) -def test_projection_day_month_not_in(bound_reference_date: BoundReference[int]) -> None: +def test_projection_day_month_not_in(bound_reference_date: BoundReference) -> None: assert ( MonthTransform().project("name", BoundNotIn(term=bound_reference_date, literals={DateLiteral(1925), DateLiteral(2925)})) is None ) -def test_projection_day_unary(bound_reference_timestamp: BoundReference[int]) -> None: +def test_projection_day_unary(bound_reference_timestamp: BoundReference) -> None: assert DayTransform().project("name", BoundNotNull(term=bound_reference_timestamp)) == NotNull(term="name") -def test_projection_day_literal(bound_reference_timestamp: BoundReference[int]) -> None: +def test_projection_day_literal(bound_reference_timestamp: BoundReference) -> None: assert DayTransform().project( "name", BoundEqualTo(term=bound_reference_timestamp, literal=TimestampLiteral(1667696874000)) ) == EqualTo(term="name", literal=19) -def test_projection_day_set_same_day(bound_reference_timestamp: BoundReference[int]) -> None: +def test_projection_day_set_same_day(bound_reference_timestamp: BoundReference) -> None: assert DayTransform().project( "name", BoundIn(term=bound_reference_timestamp, literals={TimestampLiteral(1667696874001), TimestampLiteral(1667696874000)}), ) == EqualTo(term="name", literal=19) -def test_projection_day_set_in(bound_reference_timestamp: BoundReference[int]) -> None: +def test_projection_day_set_in(bound_reference_timestamp: BoundReference) -> None: assert DayTransform().project( "name", BoundIn(term=bound_reference_timestamp, literals={TimestampLiteral(1667696874001), TimestampLiteral(1567696874000)}), ) == In(term="name", literals={18, 19}) -def test_projection_day_set_not_in(bound_reference_timestamp: 
BoundReference[int]) -> None: +def test_projection_day_set_not_in(bound_reference_timestamp: BoundReference) -> None: assert ( DayTransform().project( "name", @@ -799,7 +800,7 @@ def test_projection_day_set_not_in(bound_reference_timestamp: BoundReference[int ) -def test_projection_day_human(bound_reference_date: BoundReference[int]) -> None: +def test_projection_day_human(bound_reference_date: BoundReference) -> None: date_literal = DateLiteral(17532) assert DayTransform().project("dt", BoundEqualTo(term=bound_reference_date, literal=date_literal)) == EqualTo( term="dt", literal=17532 @@ -822,7 +823,7 @@ def test_projection_day_human(bound_reference_date: BoundReference[int]) -> None ) # >= 2018, 1, 2 -def test_projection_hour_unary(bound_reference_timestamp: BoundReference[int]) -> None: +def test_projection_hour_unary(bound_reference_timestamp: BoundReference) -> None: assert HourTransform().project("name", BoundNotNull(term=bound_reference_timestamp)) == NotNull(term="name") @@ -830,13 +831,13 @@ def test_projection_hour_unary(bound_reference_timestamp: BoundReference[int]) - HOUR_IN_MICROSECONDS = 60 * 60 * 1000 * 1000 -def test_projection_hour_literal(bound_reference_timestamp: BoundReference[int]) -> None: +def test_projection_hour_literal(bound_reference_timestamp: BoundReference) -> None: assert HourTransform().project( "name", BoundEqualTo(term=bound_reference_timestamp, literal=TimestampLiteral(TIMESTAMP_EXAMPLE)) ) == EqualTo(term="name", literal=463249) -def test_projection_hour_set_same_hour(bound_reference_timestamp: BoundReference[int]) -> None: +def test_projection_hour_set_same_hour(bound_reference_timestamp: BoundReference) -> None: assert HourTransform().project( "name", BoundIn( @@ -846,7 +847,7 @@ def test_projection_hour_set_same_hour(bound_reference_timestamp: BoundReference ) == EqualTo(term="name", literal=463249) -def test_projection_hour_set_in(bound_reference_timestamp: BoundReference[int]) -> None: +def 
test_projection_hour_set_in(bound_reference_timestamp: BoundReference) -> None: assert HourTransform().project( "name", BoundIn( @@ -856,7 +857,7 @@ def test_projection_hour_set_in(bound_reference_timestamp: BoundReference[int]) ) == In(term="name", literals={463249, 463250}) -def test_projection_hour_set_not_in(bound_reference_timestamp: BoundReference[int]) -> None: +def test_projection_hour_set_not_in(bound_reference_timestamp: BoundReference) -> None: assert ( HourTransform().project( "name", @@ -869,17 +870,17 @@ def test_projection_hour_set_not_in(bound_reference_timestamp: BoundReference[in ) -def test_projection_identity_unary(bound_reference_timestamp: BoundReference[int]) -> None: +def test_projection_identity_unary(bound_reference_timestamp: BoundReference) -> None: assert IdentityTransform().project("name", BoundNotNull(term=bound_reference_timestamp)) == NotNull(term="name") -def test_projection_identity_literal(bound_reference_timestamp: BoundReference[int]) -> None: +def test_projection_identity_literal(bound_reference_timestamp: BoundReference) -> None: assert IdentityTransform().project( "name", BoundEqualTo(term=bound_reference_timestamp, literal=TimestampLiteral(TIMESTAMP_EXAMPLE)) ) == EqualTo(term="name", literal=TimestampLiteral(TIMESTAMP_EXAMPLE)) -def test_projection_identity_set_in(bound_reference_timestamp: BoundReference[int]) -> None: +def test_projection_identity_set_in(bound_reference_timestamp: BoundReference) -> None: assert IdentityTransform().project( "name", BoundIn( @@ -892,7 +893,7 @@ def test_projection_identity_set_in(bound_reference_timestamp: BoundReference[in ) -def test_projection_identity_set_not_in(bound_reference_timestamp: BoundReference[int]) -> None: +def test_projection_identity_set_not_in(bound_reference_timestamp: BoundReference) -> None: assert IdentityTransform().project( "name", BoundNotIn( @@ -905,78 +906,78 @@ def test_projection_identity_set_not_in(bound_reference_timestamp: BoundReferenc ) -def 
test_projection_truncate_string_unary(bound_reference_str: BoundReference[str]) -> None: +def test_projection_truncate_string_unary(bound_reference_str: BoundReference) -> None: assert TruncateTransform(2).project("name", BoundNotNull(term=bound_reference_str)) == NotNull(term="name") -def test_projection_truncate_string_literal_eq(bound_reference_str: BoundReference[str]) -> None: +def test_projection_truncate_string_literal_eq(bound_reference_str: BoundReference) -> None: assert TruncateTransform(2).project("name", BoundEqualTo(term=bound_reference_str, literal=literal("data"))) == EqualTo( term="name", literal=literal("da") ) -def test_projection_truncate_string_literal_gt(bound_reference_str: BoundReference[str]) -> None: +def test_projection_truncate_string_literal_gt(bound_reference_str: BoundReference) -> None: assert TruncateTransform(2).project( "name", BoundGreaterThan(term=bound_reference_str, literal=literal("data")) ) == GreaterThanOrEqual(term="name", literal=literal("da")) -def test_projection_truncate_string_literal_gte(bound_reference_str: BoundReference[str]) -> None: +def test_projection_truncate_string_literal_gte(bound_reference_str: BoundReference) -> None: assert TruncateTransform(2).project( "name", BoundGreaterThanOrEqual(term=bound_reference_str, literal=literal("data")) ) == GreaterThanOrEqual(term="name", literal=literal("da")) -def test_projection_truncate_string_literal_lt(bound_reference_str: BoundReference[str]) -> None: +def test_projection_truncate_string_literal_lt(bound_reference_str: BoundReference) -> None: assert TruncateTransform(2).project( "name", BoundLessThan(term=bound_reference_str, literal=literal("data")) ) == LessThanOrEqual(term="name", literal=literal("da")) -def test_projection_truncate_string_literal_lte(bound_reference_str: BoundReference[str]) -> None: +def test_projection_truncate_string_literal_lte(bound_reference_str: BoundReference) -> None: assert TruncateTransform(2).project( "name", 
BoundLessThanOrEqual(term=bound_reference_str, literal=literal("data")) ) == LessThanOrEqual(term="name", literal=literal("da")) -def test_projection_truncate_string_set_same_result(bound_reference_str: BoundReference[str]) -> None: +def test_projection_truncate_string_set_same_result(bound_reference_str: BoundReference) -> None: assert TruncateTransform(2).project( "name", BoundIn(term=bound_reference_str, literals={literal("hello"), literal("helloworld")}) ) == EqualTo(term="name", literal=literal("he")) -def test_projection_truncate_string_set_in(bound_reference_str: BoundReference[str]) -> None: +def test_projection_truncate_string_set_in(bound_reference_str: BoundReference) -> None: assert TruncateTransform(3).project( "name", BoundIn(term=bound_reference_str, literals={literal("hello"), literal("world")}) ) == In(term="name", literals={literal("hel"), literal("wor")}) # codespell:ignore hel -def test_projection_truncate_string_set_not_in(bound_reference_str: BoundReference[str]) -> None: +def test_projection_truncate_string_set_not_in(bound_reference_str: BoundReference) -> None: assert ( TruncateTransform(3).project("name", BoundNotIn(term=bound_reference_str, literals={literal("hello"), literal("world")})) is None ) -def test_projection_truncate_decimal_literal_eq(bound_reference_decimal: BoundReference[Decimal]) -> None: +def test_projection_truncate_decimal_literal_eq(bound_reference_decimal: BoundReference) -> None: assert TruncateTransform(2).project( "name", BoundEqualTo(term=bound_reference_decimal, literal=DecimalLiteral(Decimal(19.25))) ) == EqualTo(term="name", literal=Decimal("19.24")) -def test_projection_truncate_decimal_literal_gt(bound_reference_decimal: BoundReference[Decimal]) -> None: +def test_projection_truncate_decimal_literal_gt(bound_reference_decimal: BoundReference) -> None: assert TruncateTransform(2).project( "name", BoundGreaterThan(term=bound_reference_decimal, literal=DecimalLiteral(Decimal(19.25))) ) == 
GreaterThanOrEqual(term="name", literal=Decimal("19.26")) -def test_projection_truncate_decimal_literal_gte(bound_reference_decimal: BoundReference[Decimal]) -> None: +def test_projection_truncate_decimal_literal_gte(bound_reference_decimal: BoundReference) -> None: assert TruncateTransform(2).project( "name", BoundGreaterThanOrEqual(term=bound_reference_decimal, literal=DecimalLiteral(Decimal(19.25))) ) == GreaterThanOrEqual(term="name", literal=Decimal("19.24")) -def test_projection_truncate_decimal_in(bound_reference_decimal: BoundReference[Decimal]) -> None: +def test_projection_truncate_decimal_in(bound_reference_decimal: BoundReference) -> None: assert TruncateTransform(2).project( "name", BoundIn(term=bound_reference_decimal, literals={literal(Decimal(19.25)), literal(Decimal(18.15))}) ) == In( @@ -988,25 +989,25 @@ def test_projection_truncate_decimal_in(bound_reference_decimal: BoundReference[ ) -def test_projection_truncate_long_literal_eq(bound_reference_decimal: BoundReference[Decimal]) -> None: +def test_projection_truncate_long_literal_eq(bound_reference_decimal: BoundReference) -> None: assert TruncateTransform(2).project( "name", BoundEqualTo(term=bound_reference_decimal, literal=DecimalLiteral(Decimal(19.25))) ) == EqualTo(term="name", literal=Decimal("19.24")) -def test_projection_truncate_long_literal_gt(bound_reference_decimal: BoundReference[Decimal]) -> None: +def test_projection_truncate_long_literal_gt(bound_reference_decimal: BoundReference) -> None: assert TruncateTransform(2).project( "name", BoundGreaterThan(term=bound_reference_decimal, literal=DecimalLiteral(Decimal(19.25))) ) == GreaterThanOrEqual(term="name", literal=Decimal("19.26")) -def test_projection_truncate_long_literal_gte(bound_reference_decimal: BoundReference[Decimal]) -> None: +def test_projection_truncate_long_literal_gte(bound_reference_decimal: BoundReference) -> None: assert TruncateTransform(2).project( "name", BoundGreaterThanOrEqual(term=bound_reference_decimal, 
literal=DecimalLiteral(Decimal(19.25))) ) == GreaterThanOrEqual(term="name", literal=Decimal("19.24")) -def test_projection_truncate_long_in(bound_reference_decimal: BoundReference[Decimal]) -> None: +def test_projection_truncate_long_in(bound_reference_decimal: BoundReference) -> None: assert TruncateTransform(2).project( "name", BoundIn(term=bound_reference_decimal, literals={DecimalLiteral(Decimal(19.25)), DecimalLiteral(Decimal(18.15))}) ) == In( @@ -1018,19 +1019,19 @@ def test_projection_truncate_long_in(bound_reference_decimal: BoundReference[Dec ) -def test_projection_truncate_string_starts_with(bound_reference_str: BoundReference[str]) -> None: +def test_projection_truncate_string_starts_with(bound_reference_str: BoundReference) -> None: assert TruncateTransform(2).project( "name", BoundStartsWith(term=bound_reference_str, literal=literal("hello")) ) == StartsWith(term="name", literal=literal("he")) -def test_projection_truncate_string_not_starts_with(bound_reference_str: BoundReference[str]) -> None: +def test_projection_truncate_string_not_starts_with(bound_reference_str: BoundReference) -> None: assert TruncateTransform(2).project( "name", BoundNotStartsWith(term=bound_reference_str, literal=literal("hello")) ) == NotStartsWith(term="name", literal=literal("he")) -def _test_projection(lhs: UnboundPredicate[L] | None, rhs: UnboundPredicate[L] | None) -> None: +def _test_projection(lhs: UnboundPredicate | None, rhs: UnboundPredicate | None) -> None: assert type(lhs) is type(lhs), f"Different classes: {type(lhs)} != {type(rhs)}" if lhs is None and rhs is None: # Both null @@ -1068,7 +1069,7 @@ def _assert_projection_strict( assert actual_human_str == expected_human_str -def test_month_projection_strict_epoch(bound_reference_date: BoundReference[int]) -> None: +def test_month_projection_strict_epoch(bound_reference_date: BoundReference) -> None: date = literal("1970-01-01").to(DateType()) transform = MonthTransform() 
_assert_projection_strict(BoundLessThan(term=bound_reference_date, literal=date), transform, LessThan, "1970-01") @@ -1091,7 +1092,7 @@ def test_month_projection_strict_epoch(bound_reference_date: BoundReference[int] ) -def test_month_projection_strict_lower_bound(bound_reference_date: BoundReference[int]) -> None: +def test_month_projection_strict_lower_bound(bound_reference_date: BoundReference) -> None: date = literal("2017-01-01").to(DateType()) # == 564 months since epoch transform = MonthTransform() @@ -1115,7 +1116,7 @@ def test_month_projection_strict_lower_bound(bound_reference_date: BoundReferenc ) -def test_negative_month_projection_strict_lower_bound(bound_reference_date: BoundReference[int]) -> None: +def test_negative_month_projection_strict_lower_bound(bound_reference_date: BoundReference) -> None: date = literal("1969-01-01").to(DateType()) # == 564 months since epoch transform = MonthTransform() @@ -1140,7 +1141,7 @@ def test_negative_month_projection_strict_lower_bound(bound_reference_date: Boun ) -def test_month_projection_strict_upper_bound(bound_reference_date: BoundReference[int]) -> None: +def test_month_projection_strict_upper_bound(bound_reference_date: BoundReference) -> None: date = literal("2017-12-31").to(DateType()) # == 564 months since epoch transform = MonthTransform() @@ -1164,7 +1165,7 @@ def test_month_projection_strict_upper_bound(bound_reference_date: BoundReferenc ) -def test_negative_month_projection_strict_upper_bound(bound_reference_date: BoundReference[int]) -> None: +def test_negative_month_projection_strict_upper_bound(bound_reference_date: BoundReference) -> None: date = literal("1969-12-31").to(DateType()) # == 564 months since epoch transform = MonthTransform() @@ -1188,7 +1189,7 @@ def test_negative_month_projection_strict_upper_bound(bound_reference_date: Boun ) -def test_day_strict(bound_reference_date: BoundReference[int]) -> None: +def test_day_strict(bound_reference_date: BoundReference) -> None: date = 
literal("2017-01-01").to(DateType()) transform = DayTransform() @@ -1216,7 +1217,7 @@ def test_day_strict(bound_reference_date: BoundReference[int]) -> None: ) -def test_day_negative_strict(bound_reference_date: BoundReference[int]) -> None: +def test_day_negative_strict(bound_reference_date: BoundReference) -> None: date = literal("1969-12-30").to(DateType()) transform = DayTransform() @@ -1244,7 +1245,7 @@ def test_day_negative_strict(bound_reference_date: BoundReference[int]) -> None: ) -def test_year_strict_lower_bound(bound_reference_date: BoundReference[int]) -> None: +def test_year_strict_lower_bound(bound_reference_date: BoundReference) -> None: date = literal("2017-01-01").to(DateType()) transform = YearTransform() @@ -1265,7 +1266,7 @@ def test_year_strict_lower_bound(bound_reference_date: BoundReference[int]) -> N ) -def test_negative_year_strict_lower_bound(bound_reference_date: BoundReference[int]) -> None: +def test_negative_year_strict_lower_bound(bound_reference_date: BoundReference) -> None: date = literal("1970-01-01").to(DateType()) transform = YearTransform() @@ -1289,7 +1290,7 @@ def test_negative_year_strict_lower_bound(bound_reference_date: BoundReference[i ) -def test_year_strict_upper_bound(bound_reference_date: BoundReference[int]) -> None: +def test_year_strict_upper_bound(bound_reference_date: BoundReference) -> None: date = literal("2017-12-31").to(DateType()) transform = YearTransform() @@ -1313,7 +1314,7 @@ def test_year_strict_upper_bound(bound_reference_date: BoundReference[int]) -> N ) -def test_negative_year_strict_upper_bound(bound_reference_date: BoundReference[int]) -> None: +def test_negative_year_strict_upper_bound(bound_reference_date: BoundReference) -> None: date = literal("2017-12-31").to(DateType()) transform = YearTransform() @@ -1330,7 +1331,7 @@ def test_negative_year_strict_upper_bound(bound_reference_date: BoundReference[i _assert_projection_strict(BoundIn(term=bound_reference_date, literals={date, another_date}), 
transform, NotIn) -def test_strict_bucket_integer(bound_reference_int: BoundReference[int]) -> None: +def test_strict_bucket_integer(bound_reference_int: BoundReference) -> None: value = literal(100).to(IntegerType()) transform = BucketTransform(num_buckets=10) @@ -1346,7 +1347,7 @@ def test_strict_bucket_integer(bound_reference_int: BoundReference[int]) -> None _assert_projection_strict(BoundIn(term=bound_reference_int, literals=literals), transform, AlwaysFalse) -def test_strict_bucket_long(bound_reference_long: BoundReference[int]) -> None: +def test_strict_bucket_long(bound_reference_long: BoundReference) -> None: value = literal(100).to(LongType()) transform = BucketTransform(num_buckets=10) @@ -1362,7 +1363,7 @@ def test_strict_bucket_long(bound_reference_long: BoundReference[int]) -> None: _assert_projection_strict(BoundIn(term=bound_reference_long, literals=literals), transform, AlwaysFalse) -def test_strict_bucket_decimal(bound_reference_decimal: BoundReference[int]) -> None: +def test_strict_bucket_decimal(bound_reference_decimal: BoundReference) -> None: dec = DecimalType(9, 2) value = literal("100.00").to(dec) transform = BucketTransform(num_buckets=10) @@ -1379,7 +1380,7 @@ def test_strict_bucket_decimal(bound_reference_decimal: BoundReference[int]) -> _assert_projection_strict(BoundIn(term=bound_reference_decimal, literals=literals), transform, AlwaysFalse) -def test_strict_bucket_string(bound_reference_str: BoundReference[int]) -> None: +def test_strict_bucket_string(bound_reference_str: BoundReference) -> None: value = literal("abcdefg").to(StringType()) transform = BucketTransform(num_buckets=10) @@ -1395,7 +1396,7 @@ def test_strict_bucket_string(bound_reference_str: BoundReference[int]) -> None: _assert_projection_strict(BoundIn(term=bound_reference_str, literals={value, other_value}), transform, AlwaysFalse) -def test_strict_bucket_bytes(bound_reference_binary: BoundReference[int]) -> None: +def test_strict_bucket_bytes(bound_reference_binary: 
BoundReference) -> None: value = literal(str.encode("abcdefg")).to(BinaryType()) transform = BucketTransform(num_buckets=10) @@ -1411,7 +1412,7 @@ def test_strict_bucket_bytes(bound_reference_binary: BoundReference[int]) -> Non _assert_projection_strict(BoundIn(term=bound_reference_binary, literals={value, other_value}), transform, AlwaysFalse) -def test_strict_bucket_uuid(bound_reference_uuid: BoundReference[int]) -> None: +def test_strict_bucket_uuid(bound_reference_uuid: BoundReference) -> None: value = literal("00000000-0000-007b-0000-0000000001c8").to(UUIDType()) transform = BucketTransform(num_buckets=10) @@ -1427,7 +1428,7 @@ def test_strict_bucket_uuid(bound_reference_uuid: BoundReference[int]) -> None: _assert_projection_strict(BoundIn(term=bound_reference_uuid, literals={value, other_value}), transform, AlwaysFalse) -def test_strict_identity_projection(bound_reference_long: BoundReference[int]) -> None: +def test_strict_identity_projection(bound_reference_long: BoundReference) -> None: transform: Transform[Any, Any] = IdentityTransform() predicates = [ BoundNotNull(term=bound_reference_long), @@ -1458,7 +1459,7 @@ def test_strict_identity_projection(bound_reference_long: BoundReference[int]) - ) -def test_truncate_strict_integer_lower_bound(bound_reference_int: BoundReference[int]) -> None: +def test_truncate_strict_integer_lower_bound(bound_reference_int: BoundReference) -> None: value = literal(100).to(IntegerType()) transform = TruncateTransform(10) @@ -1476,7 +1477,7 @@ def test_truncate_strict_integer_lower_bound(bound_reference_int: BoundReference _assert_projection_strict(BoundIn(term=bound_reference_int, literals={value_dec, value, value_inc}), transform, NotIn) -def test_truncate_strict_integer_upper_bound(bound_reference_int: BoundReference[int]) -> None: +def test_truncate_strict_integer_upper_bound(bound_reference_int: BoundReference) -> None: value = literal(99).to(IntegerType()) transform = TruncateTransform(10) @@ -1492,7 +1493,7 @@ def 
test_truncate_strict_integer_upper_bound(bound_reference_int: BoundReference _assert_projection_strict(BoundIn(term=bound_reference_int, literals=literals), transform, NotIn) -def test_truncate_strict_long_lower_bound(bound_reference_long: BoundReference[int]) -> None: +def test_truncate_strict_long_lower_bound(bound_reference_long: BoundReference) -> None: value = literal(100).to(IntegerType()) transform = TruncateTransform(10) @@ -1510,7 +1511,7 @@ def test_truncate_strict_long_lower_bound(bound_reference_long: BoundReference[i _assert_projection_strict(BoundIn(term=bound_reference_long, literals={value_dec, value, value_inc}), transform, NotIn) -def test_truncate_strict_long_upper_bound(bound_reference_long: BoundReference[int]) -> None: +def test_truncate_strict_long_upper_bound(bound_reference_long: BoundReference) -> None: value = literal(99).to(IntegerType()) transform = TruncateTransform(10) @@ -1528,7 +1529,7 @@ def test_truncate_strict_long_upper_bound(bound_reference_long: BoundReference[i _assert_projection_strict(BoundIn(term=bound_reference_long, literals={value_dec, value, value_inc}), transform, NotIn) -def test_truncate_strict_decimal_lower_bound(bound_reference_decimal: BoundReference[Decimal]) -> None: +def test_truncate_strict_decimal_lower_bound(bound_reference_decimal: BoundReference) -> None: dec = DecimalType(9, 2) value = literal("100.00").to(dec) transform = TruncateTransform(10) @@ -1549,7 +1550,7 @@ def test_truncate_strict_decimal_lower_bound(bound_reference_decimal: BoundRefer _assert_projection_strict(BoundIn(term=bound_reference_decimal, literals=literals), transform, NotIn) -def test_truncate_strict_decimal_upper_bound(bound_reference_decimal: BoundReference[Decimal]) -> None: +def test_truncate_strict_decimal_upper_bound(bound_reference_decimal: BoundReference) -> None: dec = DecimalType(9, 2) value = literal("99.99").to(dec) transform = TruncateTransform(10) @@ -1570,7 +1571,7 @@ def 
test_truncate_strict_decimal_upper_bound(bound_reference_decimal: BoundRefer _assert_projection_strict(BoundIn(term=bound_reference_decimal, literals=literals), transform, NotIn) -def test_string_strict(bound_reference_str: BoundReference[str]) -> None: +def test_string_strict(bound_reference_str: BoundReference) -> None: value = literal("abcdefg").to(StringType()) transform: Transform[Any, Any] = TruncateTransform(width=5) @@ -1585,7 +1586,7 @@ def test_string_strict(bound_reference_str: BoundReference[str]) -> None: _assert_projection_strict(BoundIn(term=bound_reference_str, literals={value, other_value}), transform, NotIn) -def test_strict_binary(bound_reference_binary: BoundReference[str]) -> None: +def test_strict_binary(bound_reference_binary: BoundReference) -> None: value = literal(b"abcdefg").to(BinaryType()) transform: Transform[Any, Any] = TruncateTransform(width=5) diff --git a/tests/test_types.py b/tests/test_types.py index 6d671e951f..707deb160e 100644 --- a/tests/test_types.py +++ b/tests/test_types.py @@ -16,7 +16,6 @@ # under the License. 
# pylint: disable=W0123,W0613 import pickle -from typing import Type import pydantic_core import pytest @@ -79,7 +78,7 @@ @pytest.mark.parametrize("input_index, input_type", non_parameterized_types) -def test_repr_primitive_types(input_index: int, input_type: Type[PrimitiveType]) -> None: +def test_repr_primitive_types(input_index: int, input_type: type[PrimitiveType]) -> None: assert isinstance(eval(repr(input_type())), input_type) assert input_type == pickle.loads(pickle.dumps(input_type)) @@ -273,7 +272,7 @@ def test_nested_field_primitive_type_as_str() -> None: @pytest.mark.parametrize("input_index,input_type", non_parameterized_types) @pytest.mark.parametrize("check_index,check_type", non_parameterized_types) def test_non_parameterized_type_equality( - input_index: int, input_type: Type[PrimitiveType], check_index: int, check_type: Type[PrimitiveType] + input_index: int, input_type: type[PrimitiveType], check_index: int, check_type: type[PrimitiveType] ) -> None: if input_index == check_index: assert input_type() == check_type() diff --git a/tests/utils/test_bin_packing.py b/tests/utils/test_bin_packing.py index 3bfacdf481..add6e56156 100644 --- a/tests/utils/test_bin_packing.py +++ b/tests/utils/test_bin_packing.py @@ -16,7 +16,6 @@ # under the License. 
import random -from typing import List import pytest @@ -38,11 +37,11 @@ ), # sparse ], ) -def test_bin_packing(splits: List[int], lookback: int, split_size: int, open_cost: int) -> None: +def test_bin_packing(splits: list[int], lookback: int, split_size: int, open_cost: int) -> None: def weight_func(x: int) -> int: return max(x, open_cost) - item_list_sums: List[int] = [sum(item) for item in PackingIterator(splits, split_size, lookback, weight_func)] + item_list_sums: list[int] = [sum(item) for item in PackingIterator(splits, split_size, lookback, weight_func)] assert all(split_size >= item_sum >= 0 for item_sum in item_list_sums) @@ -80,7 +79,7 @@ def weight_func(x: int) -> int: ], ) def test_bin_packing_lookback( - splits: List[int], target_weight: int, lookback: int, largest_bin_first: bool, expected_lists: List[List[int]] + splits: list[int], target_weight: int, lookback: int, largest_bin_first: bool, expected_lists: list[list[int]] ) -> None: def weight_func(x: int) -> int: return x @@ -123,7 +122,7 @@ def weight_func(x: int) -> int: ], ) def test_reverse_bin_packing_lookback( - splits: List[int], target_weight: int, lookback: int, largest_bin_first: bool, expected_lists: List[List[int]] + splits: list[int], target_weight: int, lookback: int, largest_bin_first: bool, expected_lists: list[list[int]] ) -> None: packer: ListPacker[int] = ListPacker(target_weight, lookback, largest_bin_first) result = packer.pack_end(splits, lambda x: x) diff --git a/tests/utils/test_concurrent.py b/tests/utils/test_concurrent.py index 48039e0c24..ca82a9dc36 100644 --- a/tests/utils/test_concurrent.py +++ b/tests/utils/test_concurrent.py @@ -17,15 +17,15 @@ import multiprocessing import os +from collections.abc import Generator from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor -from typing import Dict, Generator from unittest import mock import pytest from pyiceberg.utils.concurrent import ExecutorFactory -EMPTY_ENV: Dict[str, str | None] = {} +EMPTY_ENV: 
dict[str, str | None] = {} VALID_ENV = {"PYICEBERG_MAX_WORKERS": "5"} INVALID_ENV = {"PYICEBERG_MAX_WORKERS": "invalid"} diff --git a/tests/utils/test_config.py b/tests/utils/test_config.py index 8953754103..5cd6a7203a 100644 --- a/tests/utils/test_config.py +++ b/tests/utils/test_config.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. import os -from typing import Any, Dict +from typing import Any from unittest import mock import pytest @@ -148,7 +148,7 @@ def test_from_configuration_files_get_typed_value(tmp_path_factory: pytest.TempP def test_config_lookup_order( monkeypatch: pytest.MonkeyPatch, tmp_path_factory: pytest.TempPathFactory, - config_setup: Dict[str, Any], + config_setup: dict[str, Any], expected_result: str | None, ) -> None: """ diff --git a/tests/utils/test_manifest.py b/tests/utils/test_manifest.py index df2166fdbf..d12019c9e2 100644 --- a/tests/utils/test_manifest.py +++ b/tests/utils/test_manifest.py @@ -16,7 +16,6 @@ # under the License. # pylint: disable=redefined-outer-name,arguments-renamed,fixme from tempfile import TemporaryDirectory -from typing import Dict from unittest.mock import patch import fastavro @@ -51,7 +50,7 @@ def clear_global_manifests_cache() -> None: _manifest_cache.clear() -def _verify_metadata_with_fastavro(avro_file: str, expected_metadata: Dict[str, str]) -> None: +def _verify_metadata_with_fastavro(avro_file: str, expected_metadata: dict[str, str]) -> None: with open(avro_file, "rb") as f: reader = fastavro.reader(f) metadata = reader.metadata diff --git a/tests/utils/test_schema_conversion.py b/tests/utils/test_schema_conversion.py index eb44dcdff3..44322849d8 100644 --- a/tests/utils/test_schema_conversion.py +++ b/tests/utils/test_schema_conversion.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. 
# pylint: disable=W0212 -from typing import Any, Dict +from typing import Any import pytest @@ -40,7 +40,7 @@ from pyiceberg.utils.schema_conversion import AvroSchemaConversion -def test_avro_to_iceberg(avro_schema_manifest_file_v1: Dict[str, Any]) -> None: +def test_avro_to_iceberg(avro_schema_manifest_file_v1: dict[str, Any]) -> None: iceberg_schema = AvroSchemaConversion().avro_to_iceberg(avro_schema_manifest_file_v1) expected_iceberg_schema = Schema( NestedField( @@ -377,14 +377,14 @@ def test_logical_map_with_invalid_fields() -> None: assert "Invalid key-value pair schema:" in str(exc_info.value) -def test_iceberg_to_avro_manifest_list(avro_schema_manifest_file_v1: Dict[str, Any]) -> None: +def test_iceberg_to_avro_manifest_list(avro_schema_manifest_file_v1: dict[str, Any]) -> None: """Round trip the manifest list""" iceberg_schema = AvroSchemaConversion().avro_to_iceberg(avro_schema_manifest_file_v1) avro_result = AvroSchemaConversion().iceberg_to_avro(iceberg_schema, schema_name="manifest_file") assert avro_schema_manifest_file_v1 == avro_result -def test_iceberg_to_avro_manifest(avro_schema_manifest_entry: Dict[str, Any]) -> None: +def test_iceberg_to_avro_manifest(avro_schema_manifest_entry: dict[str, Any]) -> None: """Round trip the manifest itself""" iceberg_schema = AvroSchemaConversion().avro_to_iceberg(avro_schema_manifest_entry) avro_result = AvroSchemaConversion().iceberg_to_avro(iceberg_schema, schema_name="manifest_entry") diff --git a/uv.lock b/uv.lock index 1a6f5c88e2..51f5c3c867 100644 --- a/uv.lock +++ b/uv.lock @@ -467,11 +467,11 @@ virtualenv = [ [[package]] name = "cachetools" -version = "6.2.1" +version = "6.2.2" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/cc/7e/b975b5814bd36faf009faebe22c1072a1fa1168db34d285ef0ba071ad78c/cachetools-6.2.1.tar.gz", hash = "sha256:3f391e4bd8f8bf0931169baf7456cc822705f4e2a31f840d218f445b9a854201", size = 31325, upload-time = 
"2025-10-12T14:55:30.139Z" } +sdist = { url = "https://files.pythonhosted.org/packages/fb/44/ca1675be2a83aeee1886ab745b28cda92093066590233cc501890eb8417a/cachetools-6.2.2.tar.gz", hash = "sha256:8e6d266b25e539df852251cfd6f990b4bc3a141db73b939058d809ebd2590fc6", size = 31571, upload-time = "2025-11-13T17:42:51.465Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/96/c5/1e741d26306c42e2bf6ab740b2202872727e0f606033c9dd713f8b93f5a8/cachetools-6.2.1-py3-none-any.whl", hash = "sha256:09868944b6dde876dfd44e1d47e18484541eaf12f26f29b7af91b26cc892d701", size = 11280, upload-time = "2025-10-12T14:55:28.382Z" }, + { url = "https://files.pythonhosted.org/packages/e6/46/eb6eca305c77a4489affe1c5d8f4cae82f285d9addd8de4ec084a7184221/cachetools-6.2.2-py3-none-any.whl", hash = "sha256:6c09c98183bf58560c97b2abfcedcbaf6a896a490f534b031b661d3723b45ace", size = 11503, upload-time = "2025-11-13T17:42:50.232Z" }, ] [[package]] @@ -675,14 +675,14 @@ wheels = [ [[package]] name = "click" -version = "8.3.0" +version = "8.3.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "colorama", marker = "sys_platform == 'win32'" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/46/61/de6cd827efad202d7057d93e0fed9294b96952e188f7384832791c7b2254/click-8.3.0.tar.gz", hash = "sha256:e7b8232224eba16f4ebe410c25ced9f7875cb5f3263ffc93cc3e8da705e229c4", size = 276943, upload-time = "2025-09-18T17:32:23.696Z" } +sdist = { url = "https://files.pythonhosted.org/packages/3d/fa/656b739db8587d7b5dfa22e22ed02566950fbfbcdc20311993483657a5c0/click-8.3.1.tar.gz", hash = "sha256:12ff4785d337a1bb490bb7e9c2b1ee5da3112e94a8622f26a6c77f5d2fc6842a", size = 295065, upload-time = "2025-11-15T20:45:42.706Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/db/d3/9dcc0f5797f070ec8edf30fbadfb200e71d9db6b84d211e3b2085a7589a0/click-8.3.0-py3-none-any.whl", hash = "sha256:9b9f285302c6e3064f4330c05f05b81945b2a39544279343e6e7c5f27a9baddc", size = 107295, upload-time 
= "2025-09-18T17:32:22.42Z" }, + { url = "https://files.pythonhosted.org/packages/98/78/01c019cdb5d6498122777c1a43056ebb3ebfeef2076d9d026bfe15583b2b/click-8.3.1-py3-none-any.whl", hash = "sha256:981153a64e25f12d547d3426c367a4857371575ee7ad18df2a6183ab0545b2a6", size = 108274, upload-time = "2025-11-15T20:45:41.139Z" }, ] [[package]] @@ -984,45 +984,45 @@ wheels = [ [[package]] name = "cython" -version = "3.2.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/52/82/01f0b63287cb922e5ba96c5147c30f1e51f541ce91bd178025bb3518b1ba/cython-3.2.0.tar.gz", hash = "sha256:41fdce8237baee2d961c292ed0386903dfe126f131e450a62de0fd7a5280d4b2", size = 3267264, upload-time = "2025-11-05T13:35:04.231Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/57/8d/b2e9578d960d38b1b04a278bf66e13008486aa73e73967186f2015d63d1c/cython-3.2.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ee408125b2d218ec7d7a061e09d24715fcab9bf7ea1a4ac01907c3f8ec8730b3", size = 2953775, upload-time = "2025-11-05T13:35:22.291Z" }, - { url = "https://files.pythonhosted.org/packages/19/dd/cfd684f98bac9e0f505af1cbb7998498c59d713275e920a72b40dab03bfa/cython-3.2.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c93ce307b05fcd86a5bb0e4a7d7fab238e2f0e9936636097a60bc0e21f2def30", size = 3361627, upload-time = "2025-11-05T13:35:24.519Z" }, - { url = "https://files.pythonhosted.org/packages/9c/c1/75acdbe9f6292514f0bb92ab1b78df5eedd7049235f4cbd194d2c6c46bfc/cython-3.2.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:191cfc2fa84642ad41a52d5abaacfb330d9a6653a465e4bf0a5681f66197a967", size = 3529751, upload-time = "2025-11-05T13:35:26.341Z" }, - { url = "https://files.pythonhosted.org/packages/f2/ce/d0468eb6d87b956902b02909f5007ad61e3839d4c07ab235b514911d869b/cython-3.2.0-cp310-cp310-win_amd64.whl", hash = 
"sha256:a259053037ef82959b743b7fde238bd191ee43f88eb8e51101d5f3d8849f1e32", size = 2758839, upload-time = "2025-11-05T13:35:28.36Z" }, - { url = "https://files.pythonhosted.org/packages/ff/2b/904493fceda95747ba83971b40a66c8cc29ff009313429903f38ee620140/cython-3.2.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e9e4b2248dc3a98b86aeba65e9862d2cc881d072c163c0fb31b511d4d72e93c8", size = 2946248, upload-time = "2025-11-05T13:35:30.406Z" }, - { url = "https://files.pythonhosted.org/packages/89/fe/abe926699fe6c580967e30bc4035da54b5e31355ba9b1f4c0cf574228a84/cython-3.2.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:02fb4990a83d5d6f780dda18ed8baa8d587cb6523f57b4d72bc0b41ad3766c96", size = 3236384, upload-time = "2025-11-05T13:35:32.233Z" }, - { url = "https://files.pythonhosted.org/packages/1b/36/6b6266549802234286438298d494152deb19922a94928d9dcd256659ebd1/cython-3.2.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8a98925517819d62ea25d2cf40057df60a9bcf75fdd1d6ed3882e6ae0730d82f", size = 3372915, upload-time = "2025-11-05T13:35:34.082Z" }, - { url = "https://files.pythonhosted.org/packages/29/fa/5cf15466b428f9248e38a28515cf0fd98078ae869aa395cfb300315964c4/cython-3.2.0-cp311-cp311-win_amd64.whl", hash = "sha256:4c959a5d4cd6331e8498822ba47200bd2ff4bf74517c0c91475d5bc21da3b4d5", size = 2762735, upload-time = "2025-11-05T13:35:35.806Z" }, - { url = "https://files.pythonhosted.org/packages/57/d3/2e6f5f2552c860bb9c00653d092103521846114f6a2ae0648ecf84c0816c/cython-3.2.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:511d823d9f8a1b850178ec355d6df0a1731b9c20b08ee6d1a780f68215e9013f", size = 2959932, upload-time = "2025-11-05T13:35:37.518Z" }, - { url = "https://files.pythonhosted.org/packages/dd/bf/7bdc7f231fff6780f78586f939c1740475adecaa03bf256fcb62b2353952/cython-3.2.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = 
"sha256:bbadeedcb2d135655bcce7380fb28c9e2a75b6810426c12b6e5a6fe6106fafb4", size = 3218588, upload-time = "2025-11-05T13:35:39.642Z" }, - { url = "https://files.pythonhosted.org/packages/be/81/7d7a81010897dc5abee59691f5fc85849dcc4c8a7687b22ed01bc8d86a7a/cython-3.2.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:92d2394a3e3fe704210b5324eb8118333b514af72c98b1e02a6503945825b231", size = 3381940, upload-time = "2025-11-05T13:35:41.886Z" }, - { url = "https://files.pythonhosted.org/packages/4f/9d/35e7fb7b591bd9912685a772fcc773d7bb951a8feb6fb9be20addbc38928/cython-3.2.0-cp312-cp312-win_amd64.whl", hash = "sha256:73435e56654a34ece57d4c3304a4556a8402cc4ae2d0e30f71c237a985dc5246", size = 2750886, upload-time = "2025-11-05T13:35:43.629Z" }, - { url = "https://files.pythonhosted.org/packages/5d/d0/dc4b260e8fde81b23ab4dca56948b3e69617ef470247ec6a3e09370a9849/cython-3.2.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:d900e58e826f9a5a27b0e2b50e33473e9986a5bae375c39b0f2e19f2c545fa23", size = 2950437, upload-time = "2025-11-05T13:35:45.427Z" }, - { url = "https://files.pythonhosted.org/packages/c8/53/c322bf0486a938ad954a645866b67e978777d79183cf0a042bda6bea11de/cython-3.2.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a9d38cd3aab720d21fa6d6ee168228352f69aea0a95bd4fb84e8879c6ed38fbb", size = 3209331, upload-time = "2025-11-05T13:35:47.278Z" }, - { url = "https://files.pythonhosted.org/packages/cd/48/55d02dba0606768d3450afd088e2bbcd6f8a54977dce041c2c3c1894631c/cython-3.2.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:92b31d0b7b0a49b3d2aa94faaf75d44a03174cff2616b341a8853c919e511d51", size = 3370974, upload-time = "2025-11-05T13:35:49.534Z" }, - { url = "https://files.pythonhosted.org/packages/ce/bd/6dab19652b68464572b7a137d07a91ebe86db2a81c35842ff5e49ef23403/cython-3.2.0-cp313-cp313-win_amd64.whl", hash = 
"sha256:2847b74e76dbad612f6fc7182c12a5f78cffb0d05808fd2c4b638cf02d1aade6", size = 2746274, upload-time = "2025-11-05T13:35:51.522Z" }, - { url = "https://files.pythonhosted.org/packages/e2/db/de5331ca6489da1761078825709257e1f24e543b4040f86a2502a4b841f9/cython-3.2.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:a0a8274959d538d12f865193dcd67bb5630906e020190c890d2b7c13d31713c6", size = 2961164, upload-time = "2025-11-05T13:35:53.826Z" }, - { url = "https://files.pythonhosted.org/packages/54/3e/64e37e419331f7c4c540ad25c0b3e6d8f44d597f21ab8861afbc66aa7e02/cython-3.2.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0a1c800833c25195833805c7c3626a2c30b3baaaa9ba361a1af3bbc379662a8d", size = 3249627, upload-time = "2025-11-05T13:35:55.524Z" }, - { url = "https://files.pythonhosted.org/packages/9b/fc/9faedfcc2de807f77115d97a4910c260dd4693f4fa9e0e3be0d9ae89e260/cython-3.2.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:df15af08c21c18a2e848df5954d6fd3310735089b60405132fa4111e2cf7482a", size = 3375458, upload-time = "2025-11-05T13:35:57.279Z" }, - { url = "https://files.pythonhosted.org/packages/31/e0/30d449cd97ee0d6395aba18f2646b61b52ab3dc5a3851a346e2d363a7d85/cython-3.2.0-cp314-cp314-win_amd64.whl", hash = "sha256:9d6876af2132757fff1b42a2f4eaa72482f991863160e3f0dc8f2c812b300ebf", size = 2783210, upload-time = "2025-11-05T13:35:59.54Z" }, - { url = "https://files.pythonhosted.org/packages/dd/6b/9e1e171fe19274465d84dffa4610d46f434b1ae945e946802db396695d67/cython-3.2.0-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:04821ce06598a3aa5c9e0270d98960cfe6556dedbd1418c65e4479162b8ae74a", size = 2869249, upload-time = "2025-11-05T13:36:08.944Z" }, - { url = "https://files.pythonhosted.org/packages/c4/f1/f461726f664668a96072b2a245bdfae566d68e2eb1393ec72780cc59c21e/cython-3.2.0-cp39-abi3-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = 
"sha256:54b5b1c72a63da822b3f4739a0e31546c0a19f8e834b174906bf817ed5f9d65f", size = 3204332, upload-time = "2025-11-05T13:36:11.386Z" }, - { url = "https://files.pythonhosted.org/packages/78/d8/73c07ce64cae496e5f5a6dfe3e53574af1a8ef777e2a834d10dae8b67a4e/cython-3.2.0-cp39-abi3-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:6155a6c360e32af1aaa16fa10b0119b49deeadff42a1958973324150870af1b5", size = 2851317, upload-time = "2025-11-05T13:36:13.14Z" }, - { url = "https://files.pythonhosted.org/packages/bc/d9/d9f321637b8034b5028fa5fe7d1085ffa9351fea350af6510d5cb924c014/cython-3.2.0-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:861258ac3878b76c57b9b5a379787d772a0bc47fec9167b43986777de542c474", size = 2987155, upload-time = "2025-11-05T13:36:15.018Z" }, - { url = "https://files.pythonhosted.org/packages/f8/b5/9f9e7d261f083b4066d734b27a7872b0c584fd4c3578196652dbf72b3f62/cython-3.2.0-cp39-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:85dbf955e3193893d0288105afa0fa5f4e835ff587061681f240a4f0487c44fb", size = 2884219, upload-time = "2025-11-05T13:36:17.334Z" }, - { url = "https://files.pythonhosted.org/packages/88/64/5aeb6e43e0ded9efedc5a516f87a487fdca8e434491cc352e5a805380459/cython-3.2.0-cp39-abi3-musllinux_1_2_i686.whl", hash = "sha256:3b3f13822526726bac43275c0e92916bbcc2c30e9f559edc4c1132670b70498d", size = 3218067, upload-time = "2025-11-05T13:36:19.493Z" }, - { url = "https://files.pythonhosted.org/packages/c4/a0/1958f54cd79d8251a330b9c9652b2a5ceba6a3fcec10782dd03e2a23c74f/cython-3.2.0-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:ab18d09673d219008be5b6174bcbb6dbfd50904e66371f104a8a4698b791472d", size = 3108277, upload-time = "2025-11-05T13:36:21.203Z" }, - { url = "https://files.pythonhosted.org/packages/9c/84/9b8112160cab922b97edef00616ed18771567d88b5ba9d30d1736880c345/cython-3.2.0-cp39-abi3-win32.whl", hash = "sha256:c9fd986413fc52929b916187630a9abab9f876299951488c4b905ad5346afee6", size = 2430852, upload-time = 
"2025-11-05T13:36:23.049Z" }, - { url = "https://files.pythonhosted.org/packages/8f/57/65d3de140b51c45dd6892846bfabdfaaa032e2418f1cb1a2f46058c1fe42/cython-3.2.0-cp39-abi3-win_arm64.whl", hash = "sha256:ee2ea79ddeb721f912e7efea039b9db059c81767ff04fbf9a995f64e1187df99", size = 2435793, upload-time = "2025-11-05T13:36:25.139Z" }, - { url = "https://files.pythonhosted.org/packages/20/58/1f798ddb7fe6bfddf85f4f97d2d4ad63a491a7b643e85c1e274d0f09138e/cython-3.2.0-py3-none-any.whl", hash = "sha256:73f7f4c75acde5b5b4df05b11fdc2705ec637b99241d1bc2f4ebf345f7a2ea90", size = 1252818, upload-time = "2025-11-05T13:35:00.391Z" }, +version = "3.2.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/83/36/cce2972e13e83ffe58bc73bfd9d37340b5e5113e8243841a57511c7ae1c2/cython-3.2.1.tar.gz", hash = "sha256:2be1e4d0cbdf7f4cd4d9b8284a034e1989b59fd060f6bd4d24bf3729394d2ed8", size = 3270455, upload-time = "2025-11-12T19:02:59.847Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/87/74/f9fe9e7034f24aef407e7816880c012d8e863bedaa6b42b9ff33e79ea139/cython-3.2.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f1d10b3731171a33563ba81fdcba39c229e45087269dfbe07a1c00e7dcb2537f", size = 2957374, upload-time = "2025-11-12T19:03:10.132Z" }, + { url = "https://files.pythonhosted.org/packages/65/47/f9dd519117f520aaf4d723c88fd9e9139262a0379edc01e71a1e9825e082/cython-3.2.1-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:92b814b6066d178a5057b557d372e2a03854e947e41cb9dec21db732fbd14c3c", size = 3366838, upload-time = "2025-11-12T19:03:11.742Z" }, + { url = "https://files.pythonhosted.org/packages/5d/3e/d967acfafef00056c3ba832692b9bb358ede2919f641e4a2d24828adacc6/cython-3.2.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c9fc6abd0532007827d8c6143b2bfedf80c7cb89a3c1c12f058336663489ed2e", size = 3535901, upload-time = 
"2025-11-12T19:03:13.545Z" }, + { url = "https://files.pythonhosted.org/packages/68/79/bc46e714ecb010f80a8aa7f7eaf412c53cbabbe7489590d6aba5f4478ba5/cython-3.2.1-cp310-cp310-win_amd64.whl", hash = "sha256:14f1ed135347587cfddcd3c3219667cac4f0ea0b66aa1c4c0187d50a1b92c222", size = 2764043, upload-time = "2025-11-12T19:03:15.584Z" }, + { url = "https://files.pythonhosted.org/packages/48/d4/ba7b9f341ec168de78bd659600e04bb7de3b2d069bf98b2178a135e88ea4/cython-3.2.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:3cb32c650e7f4476941d1f735cae75a2067d5e3279576273bb8802e8ea907222", size = 2949720, upload-time = "2025-11-12T19:03:17.492Z" }, + { url = "https://files.pythonhosted.org/packages/ad/47/c42417f424c0b928361f48d7dd0ae72716ee21f647b73ceb16f66b98663e/cython-3.2.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8a2b306813d7f28aa0a2c3e4e63ada1427a8109917532df942cd5429db228252", size = 3242127, upload-time = "2025-11-12T19:03:19.227Z" }, + { url = "https://files.pythonhosted.org/packages/e6/fc/1040460889129551649ec35be45e05169871fbcf71bd8e13c533e86f9468/cython-3.2.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0959d9a36d4f004ce63acc1474b3c606745af98b65e8ae709efd0c10988e9d6b", size = 3377094, upload-time = "2025-11-12T19:03:21.25Z" }, + { url = "https://files.pythonhosted.org/packages/f8/f2/8c754298eefa40e21af0ae3592837c6e71254900d5aea1c8859e96b11de5/cython-3.2.1-cp311-cp311-win_amd64.whl", hash = "sha256:60c62e734421365135cc2842013d883136054a26c617c001be494235edfc447a", size = 2767824, upload-time = "2025-11-12T19:03:23.317Z" }, + { url = "https://files.pythonhosted.org/packages/ee/0e/19d5041b87f98ed19c94c388607cd27c1f7458078c3bad5de2dead55b2e1/cython-3.2.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ea5097d97afd2ab14e98637b7033eba5146de29a5dedf89f5e946076396ab891", size = 2966736, upload-time = "2025-11-12T19:03:25.064Z" }, + { url = 
"https://files.pythonhosted.org/packages/84/b8/bcc36d9d2464348106984956608a52a42a01ab44ea64031207dffdebc078/cython-3.2.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a4bf12de0475bb6a21e2336a4a04dc4a2b4dd0507a2a3c703e045f3484266605", size = 3221633, upload-time = "2025-11-12T19:03:26.754Z" }, + { url = "https://files.pythonhosted.org/packages/79/20/7d4807fe4ebcef9f20f2e5f93312d0f5d02f9f76524fd4e37706d04e83f7/cython-3.2.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:18c64a0f69a1b8164de70ec7efc72250c589fec21519170de21582300f6aaed9", size = 3389542, upload-time = "2025-11-12T19:03:28.656Z" }, + { url = "https://files.pythonhosted.org/packages/2a/92/b06ba6721299293bc41e89732070132c453bdbaaeabb8f8cc76851b75345/cython-3.2.1-cp312-cp312-win_amd64.whl", hash = "sha256:5ba14907d5826d8010e82306ce279a0d3650f5b50a4813c80836a17b2213c520", size = 2755307, upload-time = "2025-11-12T19:03:30.684Z" }, + { url = "https://files.pythonhosted.org/packages/40/28/c6e36c214baeb27ae45b518552e74457536c7c964b1a55b5900b047fa467/cython-3.2.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:b4e850fc7a2f72d19679dd083fe4d20bf66860fceabb4f3207112f240249d708", size = 2957307, upload-time = "2025-11-12T19:03:32.471Z" }, + { url = "https://files.pythonhosted.org/packages/c8/c8/b0b9ba64f81f2875c42aab5c0979d6454cd1ac6b3c1e2373ad552701565d/cython-3.2.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3d20ca4afe993f7dccad3aeddbf4c3536cb0fd3ad6dc7a225935a666a5655af2", size = 3210919, upload-time = "2025-11-12T19:03:34.274Z" }, + { url = "https://files.pythonhosted.org/packages/f9/33/5d9ca6abba0e77e1851b843dd1b3c4095fbc6373166935e83c4414f80e88/cython-3.2.1-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f5a54a757d01ca6a260b02ce5baf17d9db1c2253566ab5844ee4966ff2a69c19", size = 3373350, upload-time = 
"2025-11-12T19:03:35.927Z" }, + { url = "https://files.pythonhosted.org/packages/e4/29/4408c3486ff380a2d6ae0d4b71da5195efcef3c4360017113ee7d1cb7335/cython-3.2.1-cp313-cp313-win_amd64.whl", hash = "sha256:1b81e56584727a328e00d91c164f8f0f2c59b02bf6857c3f000cd830fa571453", size = 2753425, upload-time = "2025-11-12T19:03:38.157Z" }, + { url = "https://files.pythonhosted.org/packages/f0/32/c1aa03ccadda89487ff31b90d8651c3706ce2744bf4f2c2ae213147e89bd/cython-3.2.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:d7af6ad01c0fe1965d1d3badaeb6df53c1f37383ebae1ccb405b73f628f87713", size = 2967833, upload-time = "2025-11-12T19:03:40.233Z" }, + { url = "https://files.pythonhosted.org/packages/ff/dc/3488d3ade0635408a2ebb05561a3009e2f54616bfefd1f107088dfeb2c4c/cython-3.2.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e3ea7cd085b62acb67c0fbde5cd17a7d9e47992c965e81ec977cf9ea7c59cd65", size = 3256237, upload-time = "2025-11-12T19:03:42.005Z" }, + { url = "https://files.pythonhosted.org/packages/7b/ba/f3d35d3803c9a424fa8812893847114deb9e2440c1bc67a31ab9ec4b9355/cython-3.2.1-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:986aea38fdf231e78d73745f83271c5654852c822dc5141a1d3fba64429a6aa6", size = 3383100, upload-time = "2025-11-12T19:03:43.675Z" }, + { url = "https://files.pythonhosted.org/packages/86/dc/d72dbb2f8e7ca95d2d18fd86f32b2e385996576230e7ecddd7d250786825/cython-3.2.1-cp314-cp314-win_amd64.whl", hash = "sha256:4960e26cd34c1385f21646339f2e0361fcdd2ed3c01cdb50fe734add577ec56a", size = 2790322, upload-time = "2025-11-12T19:03:45.373Z" }, + { url = "https://files.pythonhosted.org/packages/5a/7e/1194f4ba98b981bbdca945a292e4f49e87ea09d69516b24445409e7cf611/cython-3.2.1-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:4e9167316bf6ecfea33dcca62f074605648fb93cc053ef46b5deb3e5d12fc0d3", size = 2872858, upload-time = "2025-11-12T19:03:55.074Z" }, + { url = 
"https://files.pythonhosted.org/packages/6b/1a/393ca8ffec7ad3f02b8e4bffaba3dba4fb62c4a1c4c0b6dbf3b80e709fe3/cython-3.2.1-cp39-abi3-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:3095df6cd470064742f428c937bed7200c5123b9e19ee04aa09ec61281e565a3", size = 3209664, upload-time = "2025-11-12T19:03:56.771Z" }, + { url = "https://files.pythonhosted.org/packages/37/57/f209f64c609d3d8fac60a572e56da2f621dc1789e399c58db61d5645a31f/cython-3.2.1-cp39-abi3-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:db3f53b2d9afb206075a2605f1150aa019f0733c7795a38eccc6119c2e9c3f7b", size = 2854607, upload-time = "2025-11-12T19:03:59.413Z" }, + { url = "https://files.pythonhosted.org/packages/fc/af/1e5c73fe52423f40776130b0be914fd9f9f8dc26c4f6ea4c2ed04772d558/cython-3.2.1-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:0fc5e7687ac8f8e2b2fb95648f43e9e074ebaa72fd5cb3d8e20e5f1e8b8e02d9", size = 2991567, upload-time = "2025-11-12T19:04:02.209Z" }, + { url = "https://files.pythonhosted.org/packages/39/2c/3ea175b6b1fdfb429f9e9c395240d894155b3c0615caced05fef43264cba/cython-3.2.1-cp39-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:bbb3bc152bc0de82b031c8d355418fa4890a92424209d59366c2c0bc9e6cf53c", size = 2889178, upload-time = "2025-11-12T19:04:05.272Z" }, + { url = "https://files.pythonhosted.org/packages/f1/88/b2ab22a3a3feac78c62354a823c5c0c33659909e9918f53aa05904532b4b/cython-3.2.1-cp39-abi3-musllinux_1_2_i686.whl", hash = "sha256:a2022bc48ad0c2c0e0485bf0b54902913a3d81086b7d435f4437620c667799f6", size = 3223755, upload-time = "2025-11-12T19:04:07.262Z" }, + { url = "https://files.pythonhosted.org/packages/0b/56/9ba58629a03cbffb5965a3c65ccd91fa683d95d588c21a875da72fdc249b/cython-3.2.1-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:99fdd4ffc2dcb513f4be9ce71c6fedd895b96b1f814655b6bbab196df497b090", size = 3113456, upload-time = "2025-11-12T19:04:09.175Z" }, + { url = 
"https://files.pythonhosted.org/packages/56/5b/148c1a7ea5aebe460a70cad716a77e5fd0205be2de9fc5250491eb13ad8c/cython-3.2.1-cp39-abi3-win32.whl", hash = "sha256:06071f85bd5ce040464d43b2f9f287742a79f905e81b709fe904567230f1ed51", size = 2434223, upload-time = "2025-11-12T19:04:11.294Z" }, + { url = "https://files.pythonhosted.org/packages/7a/54/bb9b0c9db2a92a5e93747ca3027cfc645741411f8f1c6af2fb2a7b82df5d/cython-3.2.1-cp39-abi3-win_arm64.whl", hash = "sha256:e87c131d59480aee1ebac622b64f287c0e1d665ad1a1b7d498ac48accdb36c6b", size = 2439268, upload-time = "2025-11-12T19:04:12.931Z" }, + { url = "https://files.pythonhosted.org/packages/aa/30/373775b8d933d781d055c1dd0f110f275a101f320dab724c8c63a7c1b945/cython-3.2.1-py3-none-any.whl", hash = "sha256:cd72c46e7bffe8250c52d400e72c8d5d3086437b6aeec5b0eca99ccd337f5834", size = 1254219, upload-time = "2025-11-12T19:02:56.14Z" }, ] [[package]] name = "daft" -version = "0.6.12" +version = "0.6.14" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "fsspec" }, @@ -1031,13 +1031,13 @@ dependencies = [ { name = "tqdm" }, { name = "typing-extensions", marker = "python_full_version < '3.11'" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/47/80/0013c2c027f0637f52a115bcd82db9cc4d522af402794670b841f287a28b/daft-0.6.12.tar.gz", hash = "sha256:68ce923c56e3cdfd0423a72d9206da59898c0175ff1345d9f646edb7d1d11dcd", size = 10755323, upload-time = "2025-11-10T15:12:16.046Z" } +sdist = { url = "https://files.pythonhosted.org/packages/f5/cc/1da6b22727afa04e9fb9b57e9b7c3416d1ef536d699eb79c4e358364fb39/daft-0.6.14.tar.gz", hash = "sha256:1466ae5b672ac640f1867f46e5d825ac213986cb5e2e9d14ebb04e95fea3ac63", size = 10785985, upload-time = "2025-11-17T20:02:58.572Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/91/c6/3ecd1a202965ff5c61f5a89758c91e1ae51b1a49cc48ef4ca67d60b0634c/daft-0.6.12-cp310-abi3-macosx_10_12_x86_64.whl", hash = 
"sha256:7a61431cc60495d68a2777709b6bf8960eb87a52f0d1e0876f54ca9662ccf940", size = 49773785, upload-time = "2025-11-10T15:11:45.64Z" }, - { url = "https://files.pythonhosted.org/packages/9a/f8/39619a2ad521d40075e7a1f862436e2df41fa278c5559f2e6277472cb28d/daft-0.6.12-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:b57dc6c98c0c56cf4b2d2fc4dea2e740084b2feb2bcdf2a3323f6a1e87f2caad", size = 46319479, upload-time = "2025-11-10T15:11:49.469Z" }, - { url = "https://files.pythonhosted.org/packages/3d/35/6024e93ced8f15090e29cad619e0d715e31a7139ed3f67f10e1d5f861ad4/daft-0.6.12-cp310-abi3-manylinux_2_24_aarch64.whl", hash = "sha256:7403c92f2694fb01a4956e6515ddaac637494c5f69875c19260aef19f34fc0f2", size = 47651557, upload-time = "2025-11-10T15:11:53.211Z" }, - { url = "https://files.pythonhosted.org/packages/7a/5b/54f92751a8dd3c0c4a1c58c530cb529322643a4d80a36abe1aba377228f6/daft-0.6.12-cp310-abi3-manylinux_2_24_x86_64.whl", hash = "sha256:5d15fe561a0ce4716512cc3e259b0ce1d485d8885a4abedfe62f1070b69a19bd", size = 50081130, upload-time = "2025-11-10T15:11:57.2Z" }, - { url = "https://files.pythonhosted.org/packages/d3/4c/6017ae13465cfe3bba98e890a8ee38e0c14890aed5aec216a2e8b699b808/daft-0.6.12-cp310-abi3-win_amd64.whl", hash = "sha256:03f59d3374c016680b9a0b36dfc284d20922a6e24ff6f8d3d9f76f42b69554d3", size = 48536875, upload-time = "2025-11-10T15:12:00.808Z" }, + { url = "https://files.pythonhosted.org/packages/b1/ac/367314da18ecd05c9b8567eebd9bbf1edd065b42df0cb70d20ec907d7df8/daft-0.6.14-cp310-abi3-macosx_10_12_x86_64.whl", hash = "sha256:56939ee5e195e36dc95659344e918876dd1101e9fde4398a44e3999741c0e9b4", size = 51908854, upload-time = "2025-11-17T20:02:27.642Z" }, + { url = "https://files.pythonhosted.org/packages/b7/c6/eac1db77a33ad760e1e8b173b0c1c97de0d638ff96266299dc70a9df33e3/daft-0.6.14-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:f9b82a095248fc2dc53b014ff99c3556fcb25a385987a28da35c235f9405632a", size = 48300243, upload-time = "2025-11-17T20:02:35.431Z" }, + { url = 
"https://files.pythonhosted.org/packages/05/5f/57f670d98a4f5a879e5ad7ba7c22b875efe887ffd47c38c9d6f97c9b22f6/daft-0.6.14-cp310-abi3-manylinux_2_24_aarch64.whl", hash = "sha256:19b029886752a24ef50f3948c541c4c4a01a87847d861ab51ea0be52e7352745", size = 49720024, upload-time = "2025-11-17T20:02:38.941Z" }, + { url = "https://files.pythonhosted.org/packages/e0/bd/ed3ae603955725be7af8575144f7e6cf2baaef12513838c783f998252161/daft-0.6.14-cp310-abi3-manylinux_2_24_x86_64.whl", hash = "sha256:49d54e31ea587413bfc5120a28d4a023a56f22fc304c6aa1e28c888cfada54b5", size = 52240413, upload-time = "2025-11-17T20:02:42.199Z" }, + { url = "https://files.pythonhosted.org/packages/ff/2c/170a87c35bfd8390c3b52cd9828c5d5d84fc986f0db7348d05391f5e3c32/daft-0.6.14-cp310-abi3-win_amd64.whl", hash = "sha256:be4cddf55d5a49a7fe15299e01ae9d41d74472a9834d5a7f5a91517b87ab6de0", size = 50655066, upload-time = "2025-11-17T20:02:45.593Z" }, ] [[package]] @@ -1143,34 +1143,40 @@ wheels = [ [[package]] name = "duckdb" -version = "1.4.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/ea/e7/21cf50a3d52ffceee1f0bcc3997fa96a5062e6bab705baee4f6c4e33cce5/duckdb-1.4.1.tar.gz", hash = "sha256:f903882f045d057ebccad12ac69975952832edfe133697694854bb784b8d6c76", size = 18461687, upload-time = "2025-10-07T10:37:28.605Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/68/cc/00a07de0e33d16763edd4132d7c8a2f9efd57a2f296a25a948f239a1fadf/duckdb-1.4.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:296b4fff3908fb4c47b0aa1d77bd1933375e75401009d2dc81af8e7a0b8a05b4", size = 29062814, upload-time = "2025-10-07T10:36:14.261Z" }, - { url = "https://files.pythonhosted.org/packages/17/ea/fb0fda8886d1928f1b2a53a1163ef94f6f4b41f6d8b29eee457acfc2fa67/duckdb-1.4.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0b4182800092115feee5d71a8691efb283d3c9f5eb0b36362b308ef007a12222", size = 16161652, upload-time = "2025-10-07T10:36:17.358Z" }, - { url = 
"https://files.pythonhosted.org/packages/b4/5f/052e6436a71f461e61cd3a982954c029145a84b58cefa1dfb3eb2d96e4fc/duckdb-1.4.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:67cc3b6c7f7ba07a69e9331b8ccea7a60cbcd4204bb473e5da9b71588bd2eca9", size = 13753030, upload-time = "2025-10-07T10:36:19.782Z" }, - { url = "https://files.pythonhosted.org/packages/c2/fd/3ae3c89d0f6ad54c0be4430e572306fbfc9f173c97b23c5025a540449325/duckdb-1.4.1-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0cef0cee7030b561640cb9af718f8841b19cdd2aa020d53561057b5743bea90b", size = 18487683, upload-time = "2025-10-07T10:36:22.375Z" }, - { url = "https://files.pythonhosted.org/packages/d4/3c/eef454cd7c3880c2d55b50e18a9c7a213bf91ded79efcfb573d8d6dd8a47/duckdb-1.4.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2bf93347f37a46bacce6ac859d651dbf5731e2c94a64ab358300425b09e3de23", size = 20487080, upload-time = "2025-10-07T10:36:24.692Z" }, - { url = "https://files.pythonhosted.org/packages/bb/5b/b619f4c986a1cb0b06315239da9ce5fd94a20c07a344d03e2635d56a6967/duckdb-1.4.1-cp310-cp310-win_amd64.whl", hash = "sha256:2e60d2361f978908a3d96eebaf1f4b346f283afcc467351aae50ea45ca293a2b", size = 12324436, upload-time = "2025-10-07T10:36:27.458Z" }, - { url = "https://files.pythonhosted.org/packages/d9/52/606f13fa9669a24166d2fe523e28982d8ef9039874b4de774255c7806d1f/duckdb-1.4.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:605d563c1d5203ca992497cd33fb386ac3d533deca970f9dcf539f62a34e22a9", size = 29065894, upload-time = "2025-10-07T10:36:29.837Z" }, - { url = "https://files.pythonhosted.org/packages/84/57/138241952ece868b9577e607858466315bed1739e1fbb47205df4dfdfd88/duckdb-1.4.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:d3305c7c4b70336171de7adfdb50431f23671c000f11839b580c4201d9ce6ef5", size = 16163720, upload-time = "2025-10-07T10:36:32.241Z" }, - { url = 
"https://files.pythonhosted.org/packages/a3/81/afa3a0a78498a6f4acfea75c48a70c5082032d9ac87822713d7c2d164af1/duckdb-1.4.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a063d6febbe34b32f1ad2e68822db4d0e4b1102036f49aaeeb22b844427a75df", size = 13756223, upload-time = "2025-10-07T10:36:34.673Z" }, - { url = "https://files.pythonhosted.org/packages/47/dd/5f6064fbd9248e37a3e806a244f81e0390ab8f989d231b584fb954f257fc/duckdb-1.4.1-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d1ffcaaf74f7d1df3684b54685cbf8d3ce732781c541def8e1ced304859733ae", size = 18487022, upload-time = "2025-10-07T10:36:36.759Z" }, - { url = "https://files.pythonhosted.org/packages/a1/10/b54969a1c42fd9344ad39228d671faceb8aa9f144b67cd9531a63551757f/duckdb-1.4.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:685d3d1599dc08160e0fa0cf09e93ac4ff8b8ed399cb69f8b5391cd46b5b207c", size = 20491004, upload-time = "2025-10-07T10:36:39.318Z" }, - { url = "https://files.pythonhosted.org/packages/ed/d5/7332ae8f804869a4e895937821b776199a283f8d9fc775fd3ae5a0558099/duckdb-1.4.1-cp311-cp311-win_amd64.whl", hash = "sha256:78f1d28a15ae73bd449c43f80233732adffa49be1840a32de8f1a6bb5b286764", size = 12327619, upload-time = "2025-10-07T10:36:41.509Z" }, - { url = "https://files.pythonhosted.org/packages/0e/6c/906a3fe41cd247b5638866fc1245226b528de196588802d4df4df1e6e819/duckdb-1.4.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:cd1765a7d180b7482874586859fc23bc9969d7d6c96ced83b245e6c6f49cde7f", size = 29076820, upload-time = "2025-10-07T10:36:43.782Z" }, - { url = "https://files.pythonhosted.org/packages/66/c7/01dd33083f01f618c2a29f6dd068baf16945b8cbdb132929d3766610bbbb/duckdb-1.4.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:8ed7a86725185470953410823762956606693c0813bb64e09c7d44dbd9253a64", size = 16167558, upload-time = "2025-10-07T10:36:46.003Z" }, - { url = 
"https://files.pythonhosted.org/packages/81/e2/f983b4b7ae1dfbdd2792dd31dee9a0d35f88554452cbfc6c9d65e22fdfa9/duckdb-1.4.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8a189bdfc64cfb9cc1adfbe4f2dcfde0a4992ec08505ad8ce33c886e4813f0bf", size = 13762226, upload-time = "2025-10-07T10:36:48.55Z" }, - { url = "https://files.pythonhosted.org/packages/ed/34/fb69a7be19b90f573b3cc890961be7b11870b77514769655657514f10a98/duckdb-1.4.1-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a9090089b6486f7319c92acdeed8acda022d4374032d78a465956f50fc52fabf", size = 18500901, upload-time = "2025-10-07T10:36:52.445Z" }, - { url = "https://files.pythonhosted.org/packages/e4/a5/1395d7b49d5589e85da9a9d7ffd8b50364c9d159c2807bef72d547f0ad1e/duckdb-1.4.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:142552ea3e768048e0e8c832077a545ca07792631c59edaee925e3e67401c2a0", size = 20514177, upload-time = "2025-10-07T10:36:55.358Z" }, - { url = "https://files.pythonhosted.org/packages/c0/21/08f10706d30252753349ec545833fc0cea67c11abd0b5223acf2827f1056/duckdb-1.4.1-cp312-cp312-win_amd64.whl", hash = "sha256:567f3b3a785a9e8650612461893c49ca799661d2345a6024dda48324ece89ded", size = 12336422, upload-time = "2025-10-07T10:36:57.521Z" }, - { url = "https://files.pythonhosted.org/packages/d7/08/705988c33e38665c969f7876b3ca4328be578554aa7e3dc0f34158da3e64/duckdb-1.4.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:46496a2518752ae0c6c5d75d4cdecf56ea23dd098746391176dd8e42cf157791", size = 29077070, upload-time = "2025-10-07T10:36:59.83Z" }, - { url = "https://files.pythonhosted.org/packages/99/c5/7c9165f1e6b9069441bcda4da1e19382d4a2357783d37ff9ae238c5c41ac/duckdb-1.4.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:1c65ae7e9b541cea07d8075343bcfebdecc29a3c0481aa6078ee63d51951cfcd", size = 16167506, upload-time = "2025-10-07T10:37:02.24Z" }, - { url = 
"https://files.pythonhosted.org/packages/38/46/267f4a570a0ee3ae6871ddc03435f9942884284e22a7ba9b7cb252ee69b6/duckdb-1.4.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:598d1a314e34b65d9399ddd066ccce1eeab6a60a2ef5885a84ce5ed62dbaf729", size = 13762330, upload-time = "2025-10-07T10:37:04.581Z" }, - { url = "https://files.pythonhosted.org/packages/15/7b/c4f272a40c36d82df20937d93a1780eb39ab0107fe42b62cba889151eab9/duckdb-1.4.1-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e2f16b8def782d484a9f035fc422bb6f06941ed0054b4511ddcdc514a7fb6a75", size = 18504687, upload-time = "2025-10-07T10:37:06.991Z" }, - { url = "https://files.pythonhosted.org/packages/17/fc/9b958751f0116d7b0406406b07fa6f5a10c22d699be27826d0b896f9bf51/duckdb-1.4.1-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a5a7d0aed068a5c33622a8848857947cab5cfb3f2a315b1251849bac2c74c492", size = 20513823, upload-time = "2025-10-07T10:37:09.349Z" }, - { url = "https://files.pythonhosted.org/packages/30/79/4f544d73fcc0513b71296cb3ebb28a227d22e80dec27204977039b9fa875/duckdb-1.4.1-cp313-cp313-win_amd64.whl", hash = "sha256:280fd663dacdd12bb3c3bf41f3e5b2e5b95e00b88120afabb8b8befa5f335c6f", size = 12336460, upload-time = "2025-10-07T10:37:12.154Z" }, +version = "1.4.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/81/99/ac6c105118751cc3ccae980b12e44847273f3402e647ec3197aff2251e23/duckdb-1.4.2.tar.gz", hash = "sha256:df81acee3b15ecb2c72eb8f8579fb5922f6f56c71f5c8892ea3bc6fab39aa2c4", size = 18469786, upload-time = "2025-11-12T13:18:04.203Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/de/db/de454dea845f263fe42fa176c12ba9befe86a87514a2e5a48494a8ca5003/duckdb-1.4.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:85f0c36c1b5f378d96dd7d8c6d312317f4f547a567e8b76cacb2590a31d931f3", size = 28999618, upload-time = "2025-11-12T13:16:29.558Z" }, + { url = 
"https://files.pythonhosted.org/packages/1a/39/644e8b130058188a15d4e5f2b955306ee486f3843d8479da1c846a85342f/duckdb-1.4.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:125cd89dbfd40846f216032b11e5eeaf2be13ee4d6745b82413ddd213ddc4d99", size = 15396589, upload-time = "2025-11-12T13:16:32.161Z" }, + { url = "https://files.pythonhosted.org/packages/50/f6/11446807f06dd65227f9817e04c01309ec8009b7fe6f0cf3fc0d7f6c7ea2/duckdb-1.4.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e1c80934cb15879844a752776a1ea3d1405635f307f5bb8b87c99f5a5564d33a", size = 13726628, upload-time = "2025-11-12T13:16:34.316Z" }, + { url = "https://files.pythonhosted.org/packages/a0/2c/6b2cf2d9df3776accb25ac375759c1d571fd730f216017c52cb5d4deffd6/duckdb-1.4.2-cp310-cp310-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2d3c39429b3ce1ee33d86daa94bed75a1f5b0fcf4d66d0839a6fcee398894548", size = 18455943, upload-time = "2025-11-12T13:16:36.967Z" }, + { url = "https://files.pythonhosted.org/packages/a6/b4/f213b764bd7f2c99aab20d25e4aaeda9ce54e1dc09b326c4da5a4fbe6bfd/duckdb-1.4.2-cp310-cp310-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4da7aafa94800f475d287814ad91993cf1f912c16f76ff4b411769da40c4b7da", size = 20454873, upload-time = "2025-11-12T13:16:39.801Z" }, + { url = "https://files.pythonhosted.org/packages/db/0d/5ae694d1779ec06beff624a5f59190c2f140e753cbdba0f5d0c7f3d44e37/duckdb-1.4.2-cp310-cp310-win_amd64.whl", hash = "sha256:c45e0e682ee9073c36dc34d7ad8033210bfea0cab80cc98d1eca516227b35fdf", size = 12320762, upload-time = "2025-11-12T13:16:42.085Z" }, + { url = "https://files.pythonhosted.org/packages/1a/76/5b79eac0abcb239806da1d26f20515882a8392d0729a031af9e61d494dd4/duckdb-1.4.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:b2d882672b61bc6117a2c524cf64ea519d2e829295951d214f04e126f1549b09", size = 29005908, upload-time = "2025-11-12T13:16:44.454Z" }, + { url = 
"https://files.pythonhosted.org/packages/73/1a/324d7787fdb0de96872ff7b48524830930494b45abf9501875be7456faa2/duckdb-1.4.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:995ec9c1fc3ce5fbfe5950b980ede2a9d51b35fdf2e3f873ce94c22fc3355fdc", size = 15398994, upload-time = "2025-11-12T13:16:46.802Z" }, + { url = "https://files.pythonhosted.org/packages/ad/c6/a2a072ca73f91a32c0db1254dd84fec30f4d673f9d57d853802aedf867fa/duckdb-1.4.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:19d2c2f3cdf0242cad42e803602bbc2636706fc1d2d260ffac815ea2e3a018e8", size = 13727492, upload-time = "2025-11-12T13:16:49.097Z" }, + { url = "https://files.pythonhosted.org/packages/d6/d5/8f84b3685a8730f47e68bce46dbce789cb85c915a8c6aafdf85830589eb3/duckdb-1.4.2-cp311-cp311-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7a496a04458590dcec8e928122ebe2ecbb42c3e1de4119f5461f7bf547acbe79", size = 18456479, upload-time = "2025-11-12T13:16:51.66Z" }, + { url = "https://files.pythonhosted.org/packages/30/7c/709a80e72a3bf013fa890fc767d2959a8a2a15abee4088559ddabcb9399f/duckdb-1.4.2-cp311-cp311-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0c2315b693f201787c9892f31eb9a0484d3c648edb3578a86dc8c1284dd2873a", size = 20458319, upload-time = "2025-11-12T13:16:54.24Z" }, + { url = "https://files.pythonhosted.org/packages/93/ff/e0b0dd10e6da48a262f3e054378a3781febf28af3381c0e1e901d0390b3c/duckdb-1.4.2-cp311-cp311-win_amd64.whl", hash = "sha256:bdd2d808806ceeeec33ba89665a0bb707af8815f2ca40e6c4c581966c0628ba1", size = 12320864, upload-time = "2025-11-12T13:16:56.798Z" }, + { url = "https://files.pythonhosted.org/packages/c9/29/2f68c57e7c4242fedbf4b3fdc24fce2ffcf60640c936621d8a645593a161/duckdb-1.4.2-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:9356fe17af2711e0a5ace4b20a0373e03163545fd7516e0c3c40428f44597052", size = 29015814, upload-time = "2025-11-12T13:16:59.329Z" }, + { url = 
"https://files.pythonhosted.org/packages/34/b7/030cc278a4ae788800a833b2901b9a7da7a6993121053c4155c359328531/duckdb-1.4.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:946a8374c0252db3fa41165ab9952b48adc8de06561a6b5fd62025ac700e492f", size = 15403892, upload-time = "2025-11-12T13:17:02.141Z" }, + { url = "https://files.pythonhosted.org/packages/f7/a2/67f4798a7a29bd0813f8a1e94a83e857e57f5d1ba14cf3edc5551aad0095/duckdb-1.4.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:389fa9abe4ca37d091332a2f8c3ebd713f18e87dc4cb5e8efd3e5aa8ddf8885f", size = 13733622, upload-time = "2025-11-12T13:17:04.502Z" }, + { url = "https://files.pythonhosted.org/packages/6e/ac/d0d0e3feae9663334b2336f15785d280b54a56c3ffa10334e20a51a87ecd/duckdb-1.4.2-cp312-cp312-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7be8c0c40f2264b91500b89c688f743e1c7764966e988f680b1f19416b00052e", size = 18470220, upload-time = "2025-11-12T13:17:07.049Z" }, + { url = "https://files.pythonhosted.org/packages/a5/52/7570a50430cbffc8bd702443ac28a446b0fa4f77747a3821d4b37a852b15/duckdb-1.4.2-cp312-cp312-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c6a21732dd52a76f1e61484c06d65800b18f57fe29e8102a7466c201a2221604", size = 20481138, upload-time = "2025-11-12T13:17:09.459Z" }, + { url = "https://files.pythonhosted.org/packages/95/5e/be05f46a290ea27630c112ff9e01fd01f585e599967fc52fe2edc7bc2039/duckdb-1.4.2-cp312-cp312-win_amd64.whl", hash = "sha256:769440f4507c20542ae2e5b87f6c6c6d3f148c0aa8f912528f6c97e9aedf6a21", size = 12330737, upload-time = "2025-11-12T13:17:12.02Z" }, + { url = "https://files.pythonhosted.org/packages/70/c4/5054dbe79cf570b0c97db0c2eba7eb541cc561037360479059a3b57e4a32/duckdb-1.4.2-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:de646227fc2c53101ac84e86e444e7561aa077387aca8b37052f3803ee690a17", size = 29015784, upload-time = "2025-11-12T13:17:14.409Z" }, + { url = 
"https://files.pythonhosted.org/packages/2c/b8/97f4f07d9459f5d262751cccfb2f4256debb8fe5ca92370cebe21aab1ee2/duckdb-1.4.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f1fac31babda2045d4cdefe6d0fd2ebdd8d4c2a333fbcc11607cfeaec202d18d", size = 15403788, upload-time = "2025-11-12T13:17:16.864Z" }, + { url = "https://files.pythonhosted.org/packages/a4/ea/112f33ace03682bafd4aaf0a3336da689b9834663e7032b3d678fd2902c9/duckdb-1.4.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:43ac632f40ab1aede9b4ce3c09ea043f26f3db97b83c07c632c84ebd7f7c0f4a", size = 13733603, upload-time = "2025-11-12T13:17:20.884Z" }, + { url = "https://files.pythonhosted.org/packages/34/83/8d6f845a9a946e8b47b6253b9edb084c45670763e815feed6cfefc957e89/duckdb-1.4.2-cp313-cp313-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:77db030b48321bf785767b7b1800bf657dd2584f6df0a77e05201ecd22017da2", size = 18473725, upload-time = "2025-11-12T13:17:23.074Z" }, + { url = "https://files.pythonhosted.org/packages/82/29/153d1b4fc14c68e6766d7712d35a7ab6272a801c52160126ac7df681f758/duckdb-1.4.2-cp313-cp313-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a456adbc3459c9dcd99052fad20bd5f8ef642be5b04d09590376b2eb3eb84f5c", size = 20481971, upload-time = "2025-11-12T13:17:26.703Z" }, + { url = "https://files.pythonhosted.org/packages/58/b7/8d3a58b5ebfb9e79ed4030a0f2fbd7e404c52602e977b1e7ab51651816c7/duckdb-1.4.2-cp313-cp313-win_amd64.whl", hash = "sha256:2f7c61617d2b1da3da5d7e215be616ad45aa3221c4b9e2c4d1c28ed09bc3c1c4", size = 12330535, upload-time = "2025-11-12T13:17:29.175Z" }, + { url = "https://files.pythonhosted.org/packages/25/46/0f316e4d0d6bada350b9da06691a2537c329c8948c78e8b5e0c4874bc5e2/duckdb-1.4.2-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:422be8c6bdc98366c97f464b204b81b892bf962abeae6b0184104b8233da4f19", size = 29028616, upload-time = "2025-11-12T13:17:31.599Z" }, + { url = 
"https://files.pythonhosted.org/packages/82/ab/e04a8f97865251b544aee9501088d4f0cb8e8b37339bd465c0d33857d411/duckdb-1.4.2-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:459b1855bd06a226a2838da4f14c8863fd87a62e63d414a7f7f682a7c616511a", size = 15410382, upload-time = "2025-11-12T13:17:34.14Z" }, + { url = "https://files.pythonhosted.org/packages/47/ec/b8229517c2f9fe88a38bb1a172a2da4d0ff34996d319d74554fda80b6358/duckdb-1.4.2-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:20c45b4ead1ea4d23a1be1cd4f1dfc635e58b55f0dd11e38781369be6c549903", size = 13737588, upload-time = "2025-11-12T13:17:36.515Z" }, + { url = "https://files.pythonhosted.org/packages/f2/9a/63d26da9011890a5b893e0c21845c0c0b43c634bf263af3bbca64be0db76/duckdb-1.4.2-cp314-cp314-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2e552451054534970dc999e69ca5ae5c606458548c43fb66d772117760485096", size = 18477886, upload-time = "2025-11-12T13:17:39.136Z" }, + { url = "https://files.pythonhosted.org/packages/23/35/b1fae4c5245697837f6f63e407fa81e7ccc7948f6ef2b124cd38736f4d1d/duckdb-1.4.2-cp314-cp314-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:128c97dab574a438d7c8d020670b21c68792267d88e65a7773667b556541fa9b", size = 20483292, upload-time = "2025-11-12T13:17:41.501Z" }, + { url = "https://files.pythonhosted.org/packages/25/5e/6f5ebaabc12c6db62f471f86b5c9c8debd57f11aa1b2acbbcc4c68683238/duckdb-1.4.2-cp314-cp314-win_amd64.whl", hash = "sha256:dfcc56a83420c0dec0b83e97a6b33addac1b7554b8828894f9d203955591218c", size = 12830520, upload-time = "2025-11-12T13:17:43.93Z" }, ] [[package]] @@ -1812,7 +1818,7 @@ wheels = [ [[package]] name = "huggingface-hub" -version = "1.1.2" +version = "1.1.4" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "filelock" }, @@ -1826,9 +1832,9 @@ dependencies = [ { name = "typer-slim" }, { name = "typing-extensions" }, ] -sdist = { url = 
"https://files.pythonhosted.org/packages/b8/63/eeea214a6b456d8e91ac2ea73ebb83da3af9aa64716dfb6e28dd9b2e6223/huggingface_hub-1.1.2.tar.gz", hash = "sha256:7bdafc432dc12fa1f15211bdfa689a02531d2a47a3cc0d74935f5726cdbcab8e", size = 606173, upload-time = "2025-11-06T10:04:38.398Z" } +sdist = { url = "https://files.pythonhosted.org/packages/44/8a/3cba668d9cd1b4e3eb6c1c3ff7bf0f74a7809bdbb5c327bcdbdbac802d23/huggingface_hub-1.1.4.tar.gz", hash = "sha256:a7424a766fffa1a11e4c1ac2040a1557e2101f86050fdf06627e7b74cc9d2ad6", size = 606842, upload-time = "2025-11-13T10:51:57.602Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/33/21/e15d90fd09b56938502a0348d566f1915f9789c5bb6c00c1402dc7259b6e/huggingface_hub-1.1.2-py3-none-any.whl", hash = "sha256:dfcfa84a043466fac60573c3e4af475490a7b0d7375b22e3817706d6659f61f7", size = 514955, upload-time = "2025-11-06T10:04:36.674Z" }, + { url = "https://files.pythonhosted.org/packages/33/3f/969137c9d9428ed8bf171d27604243dd950a47cac82414826e2aebbc0a4c/huggingface_hub-1.1.4-py3-none-any.whl", hash = "sha256:867799fbd2ef338b7f8b03d038d9c0e09415dfe45bb2893b48a510d1d746daa5", size = 515580, upload-time = "2025-11-13T10:51:55.742Z" }, ] [[package]] @@ -2312,7 +2318,7 @@ wheels = [ [[package]] name = "mkdocs-material" -version = "9.6.23" +version = "9.7.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "babel" }, @@ -2327,9 +2333,9 @@ dependencies = [ { name = "pymdown-extensions" }, { name = "requests" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/57/de/cc1d5139c2782b1a49e1ed1845b3298ed6076b9ba1c740ad7c952d8ffcf9/mkdocs_material-9.6.23.tar.gz", hash = "sha256:62ebc9cdbe90e1ae4f4e9b16a6aa5c69b93474c7b9e79ebc0b11b87f9f055e00", size = 4048130, upload-time = "2025-11-01T16:33:11.782Z" } +sdist = { url = "https://files.pythonhosted.org/packages/9c/3b/111b84cd6ff28d9e955b5f799ef217a17bc1684ac346af333e6100e413cb/mkdocs_material-9.7.0.tar.gz", hash = 
"sha256:602b359844e906ee402b7ed9640340cf8a474420d02d8891451733b6b02314ec", size = 4094546, upload-time = "2025-11-11T08:49:09.73Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/f5/df/bc583e857174b0dc6df67d555123533f09e7e1ac0f3fae7693fb6840c0a3/mkdocs_material-9.6.23-py3-none-any.whl", hash = "sha256:3bf3f1d82d269f3a14ed6897bfc3a844cc05e1dc38045386691b91d7e6945332", size = 9210689, upload-time = "2025-11-01T16:33:08.196Z" }, + { url = "https://files.pythonhosted.org/packages/04/87/eefe8d5e764f4cf50ed91b943f8e8f96b5efd65489d8303b7a36e2e79834/mkdocs_material-9.7.0-py3-none-any.whl", hash = "sha256:da2866ea53601125ff5baa8aa06404c6e07af3c5ce3d5de95e3b52b80b442887", size = 9283770, upload-time = "2025-11-11T08:49:06.26Z" }, ] [[package]] @@ -2783,14 +2789,14 @@ wheels = [ [[package]] name = "mypy-boto3-glue" -version = "1.40.63" +version = "1.40.75" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "typing-extensions", marker = "python_full_version < '3.12'" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/76/17/b7cc39f18c251db234b882637f8cd9730a85a3beeca144640ec7f36df2c9/mypy_boto3_glue-1.40.63.tar.gz", hash = "sha256:d67b88f976487f1adc0b1772c8ad3dd06aaae94a23046924abd4d1bb53db24b9", size = 127788, upload-time = "2025-10-30T19:45:02.474Z" } +sdist = { url = "https://files.pythonhosted.org/packages/c1/f7/31990901ea529484d3b6733b2b6ea2ba6406fc61f91dddf9a879f7b3b34c/mypy_boto3_glue-1.40.75.tar.gz", hash = "sha256:9400cb6524a2ce00d229fd8287588e638df3283c3d19161fe729a812b29c8adc", size = 127538, upload-time = "2025-11-17T22:12:34.87Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/8f/72/e2cb0c7005f2f9e409c7ebc02e5aaf4f3fd6d0d2213819f09fa5e42b8007/mypy_boto3_glue-1.40.63-py3-none-any.whl", hash = "sha256:695a702f7059c71a414d02618380f7d7cf3a6e90282a314c09f9cfe4a7be01f6", size = 135997, upload-time = "2025-10-30T19:45:00.36Z" }, + { url = 
"https://files.pythonhosted.org/packages/aa/74/4900df37cf4df01d646ab7fdf54b64b5fced70e7d2564c4cd07354eaecb6/mypy_boto3_glue-1.40.75-py3-none-any.whl", hash = "sha256:50d12feab27289f7dd77902810bcd9b360c1d4fb4e260b2650268fe6e100608f", size = 136034, upload-time = "2025-11-17T22:12:32.347Z" }, ] [[package]] @@ -3113,28 +3119,28 @@ wheels = [ [[package]] name = "prek" -version = "0.2.13" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/f8/57/a0d9d2d39c8477ae44096f9c8b962395428309019875d65e130f60478015/prek-0.2.13.tar.gz", hash = "sha256:eca64c201938cd71ca09eec9b3e31ad031a251f4aa22a5132eb1c1640d86114f", size = 341674, upload-time = "2025-11-04T14:16:53.338Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/29/64/51372417ffdbef0cc41ed2fa22be064fe247246fff364dbc86ac43364fe2/prek-0.2.13-py3-none-linux_armv6l.whl", hash = "sha256:9f55198e4b0f08c544e790184898459973755d5b83eb21e3527d789f3fc6855e", size = 4480650, upload-time = "2025-11-04T14:16:20.351Z" }, - { url = "https://files.pythonhosted.org/packages/80/f3/6375bd1b4e4786d26329613a115284ec9d363ccf366236764c168a81a2cf/prek-0.2.13-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:a8724e40242a4ad3cb3f14d17764c287d9b0a5ea61ac91a861e5a676bfe21c99", size = 4598486, upload-time = "2025-11-04T14:16:22.391Z" }, - { url = "https://files.pythonhosted.org/packages/c7/9a/75770f16044b6cf1f07bdaa9a7966adbf09919531e86d890964799194397/prek-0.2.13-py3-none-macosx_11_0_arm64.whl", hash = "sha256:dd5e0ddc4e4c72633ee7d400c3be7df53b7bb0e61ba7c9ea4cb490d476824f79", size = 4279103, upload-time = "2025-11-04T14:16:23.965Z" }, - { url = "https://files.pythonhosted.org/packages/6b/05/91c44cb60758d3c781ea057b70b59e1870c667cfb64d12e636f54d364538/prek-0.2.13-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.musllinux_1_1_aarch64.whl", hash = "sha256:8a869f9c98da675efc947e0aa4c7a8d8434d57fa5bad20e27e54855160771053", size = 4466156, upload-time = "2025-11-04T14:16:25.425Z" 
}, - { url = "https://files.pythonhosted.org/packages/0c/7c/41ada766ca3a5cd5f286222ae30b909edac5cb95f076f6af2ff40f3cc3b6/prek-0.2.13-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:0f5eac126203c89e38062e665f1427207dece178f9e19bacbc37056ab8504148", size = 4422162, upload-time = "2025-11-04T14:16:27.586Z" }, - { url = "https://files.pythonhosted.org/packages/e9/ee/1017ae7207e3542a7ecfbabf2dab910c7691996956fcc124f94c1219ae1c/prek-0.2.13-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8cea9a0a46fc5e62ebf4cde1e700993737b6828228cf60f1ccfa1c91092e7e7f", size = 4716470, upload-time = "2025-11-04T14:16:29.401Z" }, - { url = "https://files.pythonhosted.org/packages/22/4a/ff74278b426cda5dee5d6280dc87fd47f428693d3fadb98a5fbfb0f2b329/prek-0.2.13-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:dde77f9ee904df60fa9cecfbfde04257d26c3f5d0c5ee55211738b3990ff740f", size = 5130294, upload-time = "2025-11-04T14:16:30.882Z" }, - { url = "https://files.pythonhosted.org/packages/b3/91/e4e6eddaae3bb794ca1f98948e10c8137da7523a0d728253522a1e4a83f6/prek-0.2.13-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8ae60b5d753272c1051667a257cbb13cfb197ef32900aee4cefa98352d5e7576", size = 5071134, upload-time = "2025-11-04T14:16:32.54Z" }, - { url = "https://files.pythonhosted.org/packages/8d/4f/2d3d707d93bf4f48b8b95d4bcafe90df7d4f99b147442b7bbaac5e7f9838/prek-0.2.13-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e7cb2a54d55c35b57548fc1cb74fb66126ed1437a06a33416c6484e0eb4dd80e", size = 5166984, upload-time = "2025-11-04T14:16:34.978Z" }, - { url = "https://files.pythonhosted.org/packages/61/e5/b44f3ee2d75f1c0a2a844e3a0980ba712bca5adefb4c65942f85e22156eb/prek-0.2.13-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b7d0c1b1d6e315dd51cdb4e68957d5ef38a67e2c5f0dab53168cb6e842539bbf", size = 4777513, upload-time = "2025-11-04T14:16:36.881Z" }, - { url = 
"https://files.pythonhosted.org/packages/1d/a0/0d366c35eeca0f40080f9afa4f7a0bdf4b4103a3d0ece7965a41e3e56986/prek-0.2.13-py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:06dc43f2d7219f2bad2c01085ad444de517b5d28e5ef361274ff39a46b68f2cc", size = 4480175, upload-time = "2025-11-04T14:16:39.489Z" }, - { url = "https://files.pythonhosted.org/packages/5d/54/7c5dac73f2f825ef54f61b9c96f7a6c8369b17746b208b007b7e50a86a54/prek-0.2.13-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:e206979c3d1834fc1683c79e8c72b7e18be3923ca5695de0642d0c9d23e2010a", size = 4572975, upload-time = "2025-11-04T14:16:41.149Z" }, - { url = "https://files.pythonhosted.org/packages/b9/de/451fdfdb07e40274998127166683ab828d4940f16ba2f3ceaa074b2bf1ae/prek-0.2.13-py3-none-musllinux_1_1_armv7l.whl", hash = "sha256:fa3667c641b77c9cb100d428e615f69cf45018615af32b8c63bb4fa2cbbb8769", size = 4399278, upload-time = "2025-11-04T14:16:42.583Z" }, - { url = "https://files.pythonhosted.org/packages/d6/fe/47edb36d854fafcad8548fa847986cf5acb7c82c8d00ced41bd3c36a4d97/prek-0.2.13-py3-none-musllinux_1_1_i686.whl", hash = "sha256:82555ede81a7ca058ffe450e0cf7aab85db2063aeeb00d4b1704b32ccb3a4e23", size = 4614483, upload-time = "2025-11-04T14:16:45.675Z" }, - { url = "https://files.pythonhosted.org/packages/94/db/dce44c94ee50514053996f8f9b36f11b07bdf8beb67c2420b7a36b3cafb5/prek-0.2.13-py3-none-musllinux_1_1_x86_64.whl", hash = "sha256:a72f909dd737aeda07d3768aab39d8ed2574ddf05842b8a324434810d2e7160f", size = 4884485, upload-time = "2025-11-04T14:16:47.131Z" }, - { url = "https://files.pythonhosted.org/packages/bc/c4/56fa24632dac132e08ce1f2a68fb27ac4c4463a19c79e902acfbf12ecc4d/prek-0.2.13-py3-none-win32.whl", hash = "sha256:beb5cffb1575b645022305a601bdd395b5b005c42368fedb34bfc6aebed24b36", size = 4299803, upload-time = "2025-11-04T14:16:48.591Z" }, - { url = "https://files.pythonhosted.org/packages/ab/5a/0ac93e96da38a61ae1c116a819b66520cfcb252e3c790a2726a21cefbb90/prek-0.2.13-py3-none-win_amd64.whl", hash = 
"sha256:75fe11e6689431b5a3f818276dfbcbb3502cd2a4b07a3efaf3460204adaa9a89", size = 4881406, upload-time = "2025-11-04T14:16:50.244Z" }, - { url = "https://files.pythonhosted.org/packages/05/de/8832527ecce043cdfe28cd6a5cb580c79e64f2abe1099f87e03a71614060/prek-0.2.13-py3-none-win_arm64.whl", hash = "sha256:6b3b0e07a2da4e67e7110399b5dbd8d9205df8ff39fbe80bd37ffa194c639990", size = 4552103, upload-time = "2025-11-04T14:16:52.168Z" }, +version = "0.2.15" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/bb/cf/a7d1e1ff837efcf77ee9755146e0ddaa278893b4b23c2430bac2723aad68/prek-0.2.15.tar.gz", hash = "sha256:df7687671f87512fbd2657aac901ba57ba9cf3f19266dc0a9b8812c37595bbb5", size = 340449, upload-time = "2025-11-17T10:50:23.488Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f6/75/a7eef800b523d662961976f75ffb5df79341d1228d16c1833e56d9d7abaa/prek-0.2.15-py3-none-linux_armv6l.whl", hash = "sha256:99f05f37a3fa464639a2813e86fc5bc66095b285d6355ff4c6a24449edfca6e0", size = 4704193, upload-time = "2025-11-17T10:49:56.138Z" }, + { url = "https://files.pythonhosted.org/packages/b3/99/e962b2353a6848d36ecc29acb63a0966b4860690f2ff28c5b1cdf0f411f2/prek-0.2.15-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:38f4676070cc2001d3cff7a7e20dd6a26a2e6e350b5697130f6a7bdea3472116", size = 4791817, upload-time = "2025-11-17T10:49:57.771Z" }, + { url = "https://files.pythonhosted.org/packages/71/f3/42146abb8f98bded31d346550be4104dbdcf5c9894f1eca485c712b7c3c9/prek-0.2.15-py3-none-macosx_11_0_arm64.whl", hash = "sha256:c518871cf67baaaef43ae4ca9494731a935c40668acf7594cb2026d17c11069e", size = 4518681, upload-time = "2025-11-17T10:49:59.072Z" }, + { url = "https://files.pythonhosted.org/packages/6c/33/e57f8a1e5429c123c997fb550b119f55979280afbab3caf9c62b8066212f/prek-0.2.15-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.musllinux_1_1_aarch64.whl", hash = 
"sha256:355f8c6f13e28895630921aa99c56751aee4c913150ee3c2633c827670263a62", size = 4717570, upload-time = "2025-11-17T10:50:00.827Z" }, + { url = "https://files.pythonhosted.org/packages/9d/9e/913d361a917dc40eb2b508f36f853c0775b5028beba23fffbcc526acb081/prek-0.2.15-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:a63a40aec1afee6ad7b0f7d5bc2b3c4e0c1e7128b9652721bce18deb5a66991b", size = 4634527, upload-time = "2025-11-17T10:50:02.227Z" }, + { url = "https://files.pythonhosted.org/packages/14/d2/ea96256f872772f6741927e42ed3444a3138beb17df78f9be8546538a9c9/prek-0.2.15-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2913159c5c58741c325e150036f78b00f645c5a9c90b4a8c138dfe8295c82d0e", size = 4918249, upload-time = "2025-11-17T10:50:04.032Z" }, + { url = "https://files.pythonhosted.org/packages/cf/b2/3a1bda0d9a0354174aecaf7052b954edaedc3c5935d24d0bc6258f03ef2f/prek-0.2.15-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:5702d363adddb23664a00e0dbc3c6c138c9496411d83dbce29d1d2a1d1824439", size = 5348745, upload-time = "2025-11-17T10:50:05.206Z" }, + { url = "https://files.pythonhosted.org/packages/28/5d/a2c11538666389fecf41b061e579a2a70e2b830c3d5680b2258b19b1f8f5/prek-0.2.15-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7c3a1896c18ed8172ebcc45949542a839720a62479e6039b665b01e6361bc170", size = 5298275, upload-time = "2025-11-17T10:50:06.447Z" }, + { url = "https://files.pythonhosted.org/packages/82/82/931789710ba2e1e556c111168b32e446177031383ab0307656a488ffb4f8/prek-0.2.15-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0ada513579e6602cdd5b2279bce481e9062fa8790e695b35d7886948f519c4d9", size = 5352195, upload-time = "2025-11-17T10:50:07.742Z" }, + { url = "https://files.pythonhosted.org/packages/35/90/821a31f6561637b69ca79cb8a5ba740bd6fe0d2a02be5c5f513995adcc50/prek-0.2.15-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:7dfb436ac218668465462ee15d39d4face026ab77bd7b4eded46b252ff791897", size = 4977371, upload-time = "2025-11-17T10:50:10.283Z" }, + { url = "https://files.pythonhosted.org/packages/1b/9b/56ebb895619490b92a2ec1b6d7aee39b68b1f1a514674de5308fa63f3557/prek-0.2.15-py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:5c01c9a58d08ce607644d1b19722d04ba6324264f6a85a42e78cda17bd5c9f49", size = 4727638, upload-time = "2025-11-17T10:50:12.089Z" }, + { url = "https://files.pythonhosted.org/packages/3c/90/49ed5902943d4a10585b3986b0a03a243007cbdbf23d3292d0ea16488fd4/prek-0.2.15-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:35383dd0a0d3eae7cf61894421347bb69c98b99e6d482e0e67d9203e8f1f1920", size = 4742717, upload-time = "2025-11-17T10:50:13.818Z" }, + { url = "https://files.pythonhosted.org/packages/cb/d8/a5d0a39c84ece34fd3cb68baa7432bdf5b796875e585a68e96c47099e025/prek-0.2.15-py3-none-musllinux_1_1_armv7l.whl", hash = "sha256:59427f11eae316565f886437901b2d23753f46c3b68c294c09eba96dcd96f8e2", size = 4618345, upload-time = "2025-11-17T10:50:15.417Z" }, + { url = "https://files.pythonhosted.org/packages/4b/35/728a629be8cac74bd8b290460126841faa94c6baf91555119146cd264171/prek-0.2.15-py3-none-musllinux_1_1_i686.whl", hash = "sha256:ebbadd04a5dc650af803d1f57fa44c086cc74a6f07fb56a0543f774d5d1afea2", size = 4814225, upload-time = "2025-11-17T10:50:16.68Z" }, + { url = "https://files.pythonhosted.org/packages/b2/5b/f1456dd5169aea46b5955aa9fbd394bfac4dd973e9456d66f8eb4523cf4f/prek-0.2.15-py3-none-musllinux_1_1_x86_64.whl", hash = "sha256:6dc6f0ea37f51a198ad8735480bfb31c78237b8dac61274a69aef9bb4fc81a41", size = 5089205, upload-time = "2025-11-17T10:50:18.334Z" }, + { url = "https://files.pythonhosted.org/packages/78/86/85f6f1bc43309a951841d58b5a9061e733fbb1f752b04602b6a749cb8b7c/prek-0.2.15-py3-none-win32.whl", hash = "sha256:4d4445843cc4a5231d70ddf245cf29b8b41df58ac78580d94bc8b863f9e09e98", size = 4466042, upload-time = "2025-11-17T10:50:19.673Z" }, + { url = 
"https://files.pythonhosted.org/packages/35/82/6a2c64c5961c9b667695aa8a5d28f8e32089c6f7b46c53cff2cf3feebe0c/prek-0.2.15-py3-none-win_amd64.whl", hash = "sha256:2eb5b339804c7c7415a5308647dfdaf0e0763f2a12e6638d117921d97e3254cc", size = 5139035, upload-time = "2025-11-17T10:50:21.056Z" }, + { url = "https://files.pythonhosted.org/packages/53/b1/a0911fd9aeda9d0e584ed7c94a52a90339d290fa085ca7ea71584b0fb3ef/prek-0.2.15-py3-none-win_arm64.whl", hash = "sha256:95289292900b37856b563e5ec8e54e0de69f5b20309f51deefbf81be01e9a245", size = 4825577, upload-time = "2025-11-17T10:50:22.405Z" }, ] [[package]] @@ -3265,17 +3271,17 @@ wheels = [ [[package]] name = "protobuf" -version = "6.33.0" +version = "6.33.1" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/19/ff/64a6c8f420818bb873713988ca5492cba3a7946be57e027ac63495157d97/protobuf-6.33.0.tar.gz", hash = "sha256:140303d5c8d2037730c548f8c7b93b20bb1dc301be280c378b82b8894589c954", size = 443463, upload-time = "2025-10-15T20:39:52.159Z" } +sdist = { url = "https://files.pythonhosted.org/packages/0a/03/a1440979a3f74f16cab3b75b0da1a1a7f922d56a8ddea96092391998edc0/protobuf-6.33.1.tar.gz", hash = "sha256:97f65757e8d09870de6fd973aeddb92f85435607235d20b2dfed93405d00c85b", size = 443432, upload-time = "2025-11-13T16:44:18.895Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/7e/ee/52b3fa8feb6db4a833dfea4943e175ce645144532e8a90f72571ad85df4e/protobuf-6.33.0-cp310-abi3-win32.whl", hash = "sha256:d6101ded078042a8f17959eccd9236fb7a9ca20d3b0098bbcb91533a5680d035", size = 425593, upload-time = "2025-10-15T20:39:40.29Z" }, - { url = "https://files.pythonhosted.org/packages/7b/c6/7a465f1825872c55e0341ff4a80198743f73b69ce5d43ab18043699d1d81/protobuf-6.33.0-cp310-abi3-win_amd64.whl", hash = "sha256:9a031d10f703f03768f2743a1c403af050b6ae1f3480e9c140f39c45f81b13ee", size = 436882, upload-time = "2025-10-15T20:39:42.841Z" }, - { url = 
"https://files.pythonhosted.org/packages/e1/a9/b6eee662a6951b9c3640e8e452ab3e09f117d99fc10baa32d1581a0d4099/protobuf-6.33.0-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:905b07a65f1a4b72412314082c7dbfae91a9e8b68a0cc1577515f8df58ecf455", size = 427521, upload-time = "2025-10-15T20:39:43.803Z" }, - { url = "https://files.pythonhosted.org/packages/10/35/16d31e0f92c6d2f0e77c2a3ba93185130ea13053dd16200a57434c882f2b/protobuf-6.33.0-cp39-abi3-manylinux2014_aarch64.whl", hash = "sha256:e0697ece353e6239b90ee43a9231318302ad8353c70e6e45499fa52396debf90", size = 324445, upload-time = "2025-10-15T20:39:44.932Z" }, - { url = "https://files.pythonhosted.org/packages/e6/eb/2a981a13e35cda8b75b5585aaffae2eb904f8f351bdd3870769692acbd8a/protobuf-6.33.0-cp39-abi3-manylinux2014_s390x.whl", hash = "sha256:e0a1715e4f27355afd9570f3ea369735afc853a6c3951a6afe1f80d8569ad298", size = 339159, upload-time = "2025-10-15T20:39:46.186Z" }, - { url = "https://files.pythonhosted.org/packages/21/51/0b1cbad62074439b867b4e04cc09b93f6699d78fd191bed2bbb44562e077/protobuf-6.33.0-cp39-abi3-manylinux2014_x86_64.whl", hash = "sha256:35be49fd3f4fefa4e6e2aacc35e8b837d6703c37a2168a55ac21e9b1bc7559ef", size = 323172, upload-time = "2025-10-15T20:39:47.465Z" }, - { url = "https://files.pythonhosted.org/packages/07/d1/0a28c21707807c6aacd5dc9c3704b2aa1effbf37adebd8caeaf68b17a636/protobuf-6.33.0-py3-none-any.whl", hash = "sha256:25c9e1963c6734448ea2d308cfa610e692b801304ba0908d7bfa564ac5132995", size = 170477, upload-time = "2025-10-15T20:39:51.311Z" }, + { url = "https://files.pythonhosted.org/packages/06/f1/446a9bbd2c60772ca36556bac8bfde40eceb28d9cc7838755bc41e001d8f/protobuf-6.33.1-cp310-abi3-win32.whl", hash = "sha256:f8d3fdbc966aaab1d05046d0240dd94d40f2a8c62856d41eaa141ff64a79de6b", size = 425593, upload-time = "2025-11-13T16:44:06.275Z" }, + { url = "https://files.pythonhosted.org/packages/a6/79/8780a378c650e3df849b73de8b13cf5412f521ca2ff9b78a45c247029440/protobuf-6.33.1-cp310-abi3-win_amd64.whl", hash = 
"sha256:923aa6d27a92bf44394f6abf7ea0500f38769d4b07f4be41cb52bd8b1123b9ed", size = 436883, upload-time = "2025-11-13T16:44:09.222Z" }, + { url = "https://files.pythonhosted.org/packages/cd/93/26213ff72b103ae55bb0d73e7fb91ea570ef407c3ab4fd2f1f27cac16044/protobuf-6.33.1-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:fe34575f2bdde76ac429ec7b570235bf0c788883e70aee90068e9981806f2490", size = 427522, upload-time = "2025-11-13T16:44:10.475Z" }, + { url = "https://files.pythonhosted.org/packages/c2/32/df4a35247923393aa6b887c3b3244a8c941c32a25681775f96e2b418f90e/protobuf-6.33.1-cp39-abi3-manylinux2014_aarch64.whl", hash = "sha256:f8adba2e44cde2d7618996b3fc02341f03f5bc3f2748be72dc7b063319276178", size = 324445, upload-time = "2025-11-13T16:44:11.869Z" }, + { url = "https://files.pythonhosted.org/packages/8e/d0/d796e419e2ec93d2f3fa44888861c3f88f722cde02b7c3488fcc6a166820/protobuf-6.33.1-cp39-abi3-manylinux2014_s390x.whl", hash = "sha256:0f4cf01222c0d959c2b399142deb526de420be8236f22c71356e2a544e153c53", size = 339161, upload-time = "2025-11-13T16:44:12.778Z" }, + { url = "https://files.pythonhosted.org/packages/1d/2a/3c5f05a4af06649547027d288747f68525755de692a26a7720dced3652c0/protobuf-6.33.1-cp39-abi3-manylinux2014_x86_64.whl", hash = "sha256:8fd7d5e0eb08cd5b87fd3df49bc193f5cfd778701f47e11d127d0afc6c39f1d1", size = 323171, upload-time = "2025-11-13T16:44:14.035Z" }, + { url = "https://files.pythonhosted.org/packages/08/b4/46310463b4f6ceef310f8348786f3cff181cea671578e3d9743ba61a459e/protobuf-6.33.1-py3-none-any.whl", hash = "sha256:d595a9fd694fdeb061a62fbe10eb039cc1e444df81ec9bb70c7fc59ebcb1eafa", size = 170477, upload-time = "2025-11-13T16:44:17.633Z" }, ] [[package]] @@ -3434,6 +3440,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e5/4e/519c1bc1876625fe6b71e9a28287c43ec2f20f73c658b9ae1d485c0c206e/pyarrow-21.0.0-cp313-cp313t-win_amd64.whl", hash = "sha256:222c39e2c70113543982c6b34f3077962b44fca38c0bd9e68bb6781534425c10", size = 26371006, upload-time = 
"2025-07-18T00:56:56.379Z" }, ] +[[package]] +name = "pyarrow-stubs" +version = "20.0.0.20251107" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pyarrow" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/9d/f1/40fa277fe20dfc6253f8e11edb96120050174209afc84019cd52386c5769/pyarrow_stubs-20.0.0.20251107.tar.gz", hash = "sha256:c0885c09f63e2be51bacb6b0e20b39083f43da1cb214d31e406f982e874bcb5a", size = 236584, upload-time = "2025-11-07T03:46:59.872Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/86/fa/a8ebb2cc3a301604f5ca2628e399232ba2d53c1590c1a9b7f8695667db4e/pyarrow_stubs-20.0.0.20251107-py3-none-any.whl", hash = "sha256:09da6809f37cc6dbbbf59c9c8e42269290d19ac09f65d2b3456c671f1c3a8765", size = 235744, upload-time = "2025-11-07T03:47:00.868Z" }, +] + [[package]] name = "pyasn1" version = "0.6.1" @@ -3645,7 +3663,6 @@ datafusion = [ duckdb = [ { name = "duckdb" }, { name = "pyarrow" }, - { name = "pyiceberg-core" }, ] dynamodb = [ { name = "boto3" }, @@ -3673,7 +3690,6 @@ hive-kerberos = [ pandas = [ { name = "pandas" }, { name = "pyarrow" }, - { name = "pyiceberg-core" }, ] polars = [ { name = "polars" }, @@ -3688,7 +3704,6 @@ pyiceberg-core = [ ray = [ { name = "pandas" }, { name = "pyarrow" }, - { name = "pyiceberg-core" }, { name = "ray" }, ] rest-sigv4 = [ @@ -3720,6 +3735,7 @@ dev = [ { name = "mypy-boto3-glue" }, { name = "prek" }, { name = "protobuf" }, + { name = "pyarrow-stubs" }, { name = "pyspark", extra = ["connect"] }, { name = "pytest" }, { name = "pytest-checkdocs" }, @@ -3770,11 +3786,8 @@ requires-dist = [ { name = "pyarrow", marker = "extra == 'pyarrow'", specifier = ">=17.0.0" }, { name = "pyarrow", marker = "extra == 'ray'", specifier = ">=17.0.0" }, { name = "pydantic", specifier = ">=2.0,!=2.4.0,!=2.4.1,!=2.12.0,!=2.12.1,<3.0" }, - { name = "pyiceberg-core", marker = "extra == 'duckdb'", specifier = ">=0.5.1,<0.8.0" }, - { name = "pyiceberg-core", marker = "extra == 'pandas'", 
specifier = ">=0.5.1,<0.8.0" }, { name = "pyiceberg-core", marker = "extra == 'pyarrow'", specifier = ">=0.5.1,<0.8.0" }, { name = "pyiceberg-core", marker = "extra == 'pyiceberg-core'", specifier = ">=0.5.1,<0.8.0" }, - { name = "pyiceberg-core", marker = "extra == 'ray'", specifier = ">=0.5.1,<0.8.0" }, { name = "pyparsing", specifier = ">=3.1.0,<4.0.0" }, { name = "pyroaring", specifier = ">=1.0.0,<2.0.0" }, { name = "python-snappy", marker = "extra == 'snappy'", specifier = ">=0.6.0,<1.0.0" }, @@ -3797,7 +3810,7 @@ provides-extras = ["pyarrow", "pandas", "duckdb", "ray", "bodo", "daft", "polars [package.metadata.requires-dev] dev = [ { name = "coverage", extras = ["toml"], specifier = ">=7.4.2,<8" }, - { name = "cython", specifier = "==3.2.0" }, + { name = "cython", specifier = "==3.2.1" }, { name = "deptry", specifier = ">=0.14,<0.24" }, { name = "docutils", specifier = "!=0.21.post1" }, { name = "fastavro", specifier = "==1.12.1" }, @@ -3805,7 +3818,8 @@ dev = [ { name = "mypy-boto3-dynamodb", specifier = ">=1.28.18" }, { name = "mypy-boto3-glue", specifier = ">=1.28.18" }, { name = "prek", specifier = ">=0.2.1,<0.3" }, - { name = "protobuf", specifier = "==6.33.0" }, + { name = "protobuf", specifier = "==6.33.1" }, + { name = "pyarrow-stubs", specifier = ">=20.0.0.20251107" }, { name = "pyspark", extras = ["connect"], specifier = "==4.0.1" }, { name = "pytest", specifier = "==7.4.4" }, { name = "pytest-checkdocs", specifier = "==2.13.0" }, @@ -3821,7 +3835,7 @@ docs = [ { name = "mkdocs-autorefs", specifier = "==1.4.3" }, { name = "mkdocs-gen-files", specifier = "==0.5.0" }, { name = "mkdocs-literate-nav", specifier = "==0.6.2" }, - { name = "mkdocs-material", specifier = "==9.6.23" }, + { name = "mkdocs-material", specifier = "==9.7.0" }, { name = "mkdocs-material-extensions", specifier = "==1.3.1" }, { name = "mkdocs-section-index", specifier = "==0.3.10" }, { name = "mkdocstrings", specifier = "==0.30.1" },