Commit b7cfb4a

Merge branch 'main' into feat/metadata_tables
2 parents fe51b41 + dc6d242

30 files changed: +825 -301 lines

.github/ISSUE_TEMPLATE/iceberg_bug_report.yml

Lines changed: 2 additions & 1 deletion
@@ -9,7 +9,8 @@ body:
       description: What Apache Iceberg version are you using?
       multiple: false
       options:
-        - "0.7.0 (latest release)"
+        - "0.7.1 (latest release)"
+        - "0.7.0"
         - "0.6.1"
         - "0.6.0"
         - "0.5.0"

.markdownlint.yaml

Lines changed: 26 additions & 0 deletions
@@ -0,0 +1,26 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Default state for all rules
+default: true
+
+# MD013/line-length - Line length
+MD013: false
+
+# MD007/ul-indent - Unordered list indentation
+MD007:
+  indent: 4
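The new `.markdownlint.yaml` disables MD013 (line length) and pins MD007's unordered-list indent to 4 spaces. As a rough illustration of what MD007 checks, here is a simplified sketch in Python; it is not markdownlint's actual implementation, and it ignores nesting depth and code blocks:

```python
import re

def check_ul_indent(markdown: str, indent: int = 4) -> list[int]:
    """Toy version of markdownlint's MD007 (ul-indent) rule: flag
    lines whose unordered-list bullet indentation is not a multiple
    of `indent`. Returns the offending 1-based line numbers."""
    bad = []
    for lineno, line in enumerate(markdown.splitlines(), start=1):
        match = re.match(r"^( *)[-*+] ", line)
        if match and len(match.group(1)) % indent != 0:
            bad.append(lineno)
    return bad

doc = "- top level\n    - indented four spaces\n  - indented two spaces\n"
print(check_ul_indent(doc))  # [3] -- the two-space item violates indent: 4
```

With `indent: 4` in the config, nested bullets indented two spaces (the markdownlint default) would be reported and, since the hook runs with `--fix`, re-indented.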

.pre-commit-config.yaml

Lines changed: 4 additions & 10 deletions
@@ -46,17 +46,11 @@ repos:
     hooks:
       - id: pycln
         args: [--config=pyproject.toml]
-  - repo: https://github.com/executablebooks/mdformat
-    rev: 0.7.17
+  - repo: https://github.com/igorshubovych/markdownlint-cli
+    rev: v0.41.0
     hooks:
-      - id: mdformat
-        additional_dependencies:
-          - mdformat-black==0.1.1
-          - mdformat-config==0.1.3
-          - mdformat-beautysh==0.1.1
-          - mdformat-admon==1.0.1
-          - mdformat-mkdocs==1.0.1
-          - mdformat-frontmatter==2.0.1
+      - id: markdownlint
+        args: ["--fix"]
   - repo: https://github.com/pycqa/pydocstyle
     rev: 6.3.0
     hooks:

Makefile

Lines changed: 24 additions & 15 deletions
@@ -15,28 +15,37 @@
 # specific language governing permissions and limitations
 # under the License.
 
-install-poetry:
-	pip install poetry==1.8.3
 
-install-dependencies:
-	poetry install -E pyarrow -E hive -E s3fs -E glue -E adlfs -E duckdb -E ray -E sql-postgres -E gcsfs -E sql-sqlite -E daft
+help: ## Display this help
+	@awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n  make \033[36m\033[0m\n"} /^[a-zA-Z_-]+:.*?##/ { printf "  \033[36m%-20s\033[0m %s\n", $$1, $$2 } /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST)
+
+install-poetry: ## Install poetry if the user has not done that yet.
+	@if ! command -v poetry &> /dev/null; then \
+		echo "Poetry could not be found. Installing..."; \
+		pip install --user poetry==1.8.3; \
+	else \
+		echo "Poetry is already installed."; \
+	fi
+
+install-dependencies: ## Install dependencies including dev and all extras
+	poetry install --all-extras
 
 install: | install-poetry install-dependencies
 
-check-license:
+check-license: ## Check license headers
 	./dev/check-license
 
-lint:
+lint: ## lint
 	poetry run pre-commit run --all-files
 
-test:
+test: ## Run all unit tests, can add arguments with PYTEST_ARGS="-vv"
 	poetry run pytest tests/ -m "(unmarked or parametrize) and not integration" ${PYTEST_ARGS}
 
-test-s3:
+test-s3: # Run tests marked with s3, can add arguments with PYTEST_ARGS="-vv"
 	sh ./dev/run-minio.sh
 	poetry run pytest tests/ -m s3 ${PYTEST_ARGS}
 
-test-integration:
+test-integration: ## Run all integration tests, can add arguments with PYTEST_ARGS="-vv"
 	docker compose -f dev/docker-compose-integration.yml kill
 	docker compose -f dev/docker-compose-integration.yml rm -f
 	docker compose -f dev/docker-compose-integration.yml up -d
@@ -50,18 +59,18 @@ test-integration-rebuild:
 	docker compose -f dev/docker-compose-integration.yml rm -f
 	docker compose -f dev/docker-compose-integration.yml build --no-cache
 
-test-adlfs:
+test-adlfs: ## Run tests marked with adlfs, can add arguments with PYTEST_ARGS="-vv"
 	sh ./dev/run-azurite.sh
 	poetry run pytest tests/ -m adlfs ${PYTEST_ARGS}
 
-test-gcs:
+test-gcs: ## Run tests marked with gcs, can add arguments with PYTEST_ARGS="-vv"
 	sh ./dev/run-gcs-server.sh
 	poetry run pytest tests/ -m gcs ${PYTEST_ARGS}
 
-test-coverage-unit:
+test-coverage-unit: # Run test with coverage for unit tests, can add arguments with PYTEST_ARGS="-vv"
 	poetry run coverage run --source=pyiceberg/ --data-file=.coverage.unit -m pytest tests/ -v -m "(unmarked or parametrize) and not integration" ${PYTEST_ARGS}
 
-test-coverage-integration:
+test-coverage-integration: # Run test with coverage for integration tests, can add arguments with PYTEST_ARGS="-vv"
 	docker compose -f dev/docker-compose-integration.yml kill
 	docker compose -f dev/docker-compose-integration.yml rm -f
 	docker compose -f dev/docker-compose-integration.yml up -d
@@ -72,14 +81,14 @@ test-coverage-integration:
 	docker compose -f dev/docker-compose-integration.yml exec -T spark-iceberg ipython ./provision.py
 	poetry run coverage run --source=pyiceberg/ --data-file=.coverage.integration -m pytest tests/ -v -m integration ${PYTEST_ARGS}
 
-test-coverage: | test-coverage-unit test-coverage-integration
+test-coverage: | test-coverage-unit test-coverage-integration ## Run all tests with coverage including unit and integration tests
 	poetry run coverage combine .coverage.unit .coverage.integration
 	poetry run coverage report -m --fail-under=90
 	poetry run coverage html
 	poetry run coverage xml
 
 
-clean:
+clean: ## Clean up the project Python working environment
 	@echo "Cleaning up Cython and Python cached files"
 	@rm -rf build dist *.egg-info
 	@find . -name "*.so" -exec echo Deleting {} \; -delete
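The new `help` target uses the common self-documenting-Makefile awk idiom: it scans for `target: ## description` lines and prints them as a usage table. The same parsing can be sketched in Python (an illustration of the idiom, not part of this repo; the awk's ANSI colours and `##@` section headers are omitted):

```python
import re

def makefile_help(makefile_text: str) -> dict[str, str]:
    """Collect `target: ## description` pairs from a Makefile, the way
    the awk one-liner in the `help` target does."""
    targets = {}
    for line in makefile_text.splitlines():
        match = re.match(r"^([a-zA-Z_-]+):.*?## (.*)$", line)
        if match:
            targets[match.group(1)] = match.group(2)
    return targets

sample = "lint: ## lint\n\tpoetry run pre-commit run --all-files\ntest-s3:\n"
print(makefile_help(sample))  # {'lint': 'lint'}
```

Note that targets documented with a single `#` (such as `test-s3` and `test-coverage-unit` in the diff above) will not appear in `make help` output, since the pattern requires `##`.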

dev/Dockerfile

Lines changed: 1 addition & 1 deletion
@@ -39,7 +39,7 @@ WORKDIR ${SPARK_HOME}
 ENV SPARK_VERSION=3.5.0
 ENV ICEBERG_SPARK_RUNTIME_VERSION=3.5_2.12
 ENV ICEBERG_VERSION=1.6.0
-ENV PYICEBERG_VERSION=0.7.0
+ENV PYICEBERG_VERSION=0.7.1
 
 RUN curl --retry 3 -s -C - https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop3.tgz -o spark-${SPARK_VERSION}-bin-hadoop3.tgz \
     && tar xzf spark-${SPARK_VERSION}-bin-hadoop3.tgz --directory /opt/spark --strip-components 1 \

mkdocs/docs/SUMMARY.md

Lines changed: 1 addition & 0 deletions
@@ -18,6 +18,7 @@
 <!-- prettier-ignore-start -->
 
 <!-- markdown-link-check-disable -->
+# Summary
 
 - [Getting started](index.md)
 - [Configuration](configuration.md)

mkdocs/docs/api.md

Lines changed: 20 additions & 19 deletions
@@ -146,6 +146,8 @@ catalog.create_table(
 )
 ```
 
+When the table is created, all IDs in the schema are re-assigned to ensure uniqueness.
+
 To create a table using a pyarrow schema:
 
 ```python
@@ -278,7 +280,7 @@ tbl.overwrite(df)
 
 The data is written to the table, and when the table is read using `tbl.scan().to_arrow()`:
 
-```
+```python
 pyarrow.Table
 city: string
 lat: double
@@ -301,7 +303,7 @@ tbl.append(df)
 
 When reading the table `tbl.scan().to_arrow()` you can see that `Groningen` is now also part of the table:
 
-```
+```python
 pyarrow.Table
 city: string
 lat: double
@@ -340,7 +342,7 @@ tbl.delete(delete_filter="city == 'Paris'")
 In the above example, any records where the city field value equals to `Paris` will be deleted.
 Running `tbl.scan().to_arrow()` will now yield:
 
-```
+```python
 pyarrow.Table
 city: string
 lat: double
@@ -360,7 +362,6 @@ To explore the table metadata, tables can be inspected.
 !!! tip "Time Travel"
     To inspect a tables's metadata with the time travel feature, call the inspect table method with the `snapshot_id` argument.
     Time travel is supported on all metadata tables except `snapshots` and `refs`.
-
     ```python
     table.inspect.entries(snapshot_id=805611270568163028)
     ```
@@ -375,7 +376,7 @@ Inspect the snapshots of the table:
 table.inspect.snapshots()
 ```
 
-```
+```python
 pyarrow.Table
 committed_at: timestamp[ms] not null
 snapshot_id: int64 not null
@@ -403,7 +404,7 @@ Inspect the partitions of the table:
 table.inspect.partitions()
 ```
 
-```
+```python
 pyarrow.Table
 partition: struct<dt_month: int32, dt_day: date32[day]> not null
   child 0, dt_month: int32
@@ -444,7 +445,7 @@ To show all the table's current manifest entries for both data and delete files.
 table.inspect.entries()
 ```
 
-```
+```python
 pyarrow.Table
 status: int8 not null
 snapshot_id: int64 not null
@@ -602,7 +603,7 @@ To show a table's known snapshot references:
 table.inspect.refs()
 ```
 
-```
+```python
 pyarrow.Table
 name: string not null
 type: string not null
@@ -627,7 +628,7 @@ To show a table's current file manifests:
 table.inspect.manifests()
 ```
 
-```
+```python
 pyarrow.Table
 content: int8 not null
 path: string not null
@@ -677,7 +678,7 @@ To show table metadata log entries:
 table.inspect.metadata_log_entries()
 ```
 
-```
+```python
 pyarrow.Table
 timestamp: timestamp[ms] not null
 file: string not null
@@ -700,7 +701,7 @@ To show a table's history:
 table.inspect.history()
 ```
 
-```
+```python
 pyarrow.Table
 made_current_at: timestamp[ms] not null
 snapshot_id: int64 not null
@@ -721,7 +722,7 @@ Inspect the data files in the current snapshot of the table:
 table.inspect.files()
 ```
 
-```
+```python
 pyarrow.Table
 content: int8 not null
 file_path: string not null
@@ -861,7 +862,7 @@ To show only data files or delete files in the current snapshot, use `table.insp
 
 Expert Iceberg users may choose to commit existing parquet files to the Iceberg table as data files, without rewriting them.
 
-```
+```python
 # Given that these parquet files have schema consistent with the Iceberg table
 
 file_paths = [
@@ -941,7 +942,7 @@ with table.update_schema() as update:
 
 Now the table has the union of the two schemas `print(table.schema())`:
 
-```
+```python
 table {
   1: city: optional string
   2: lat: optional double
@@ -1191,7 +1192,7 @@ table.scan(
 
 This will return a PyArrow table:
 
-```
+```python
 pyarrow.Table
 VendorID: int64
 tpep_pickup_datetime: timestamp[us, tz=+00:00]
@@ -1233,7 +1234,7 @@ table.scan(
 
 This will return a Pandas dataframe:
 
-```
+```python
    VendorID tpep_pickup_datetime tpep_dropoff_datetime
 0         2 2021-04-01 00:28:05+00:00 2021-04-01 00:47:59+00:00
 1         1 2021-04-01 00:39:01+00:00 2021-04-01 00:57:39+00:00
@@ -1306,7 +1307,7 @@ ray_dataset = table.scan(
 
 This will return a Ray dataset:
 
-```
+```python
 Dataset(
   num_blocks=1,
   num_rows=1168798,
@@ -1357,7 +1358,7 @@ df = df.select("VendorID", "tpep_pickup_datetime", "tpep_dropoff_datetime")
 
 This returns a Daft Dataframe which is lazily materialized. Printing `df` will display the schema:
 
-```
+```python
 ╭──────────┬───────────────────────────────┬───────────────────────────────╮
 │ VendorID ┆ tpep_pickup_datetime ┆ tpep_dropoff_datetime │
 ---------
@@ -1375,7 +1376,7 @@ This is correctly optimized to take advantage of Iceberg features such as hidden
 df.show(2)
 ```
 
-```
+```python
 ╭──────────┬───────────────────────────────┬───────────────────────────────╮
 │ VendorID ┆ tpep_pickup_datetime ┆ tpep_dropoff_datetime │
 ---------
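The api.md change above documents that, on `create_table`, all field IDs in the supplied schema are re-assigned to ensure uniqueness. The idea can be sketched in plain Python; this is a conceptual illustration only, not pyiceberg's internal code, and the dict-based schema representation here is hypothetical:

```python
from itertools import count

def reassign_field_ids(fields: list[dict]) -> list[dict]:
    """Give every field a fresh, unique ID regardless of the IDs the
    caller supplied, so duplicate or stale IDs in the input schema
    cannot leak into the created table (conceptual sketch)."""
    fresh = count(1)
    return [{**field, "id": next(fresh)} for field in fields]

# Two fields accidentally sharing an ID are disambiguated on creation.
schema = [{"id": 7, "name": "city"}, {"id": 7, "name": "lat"}]
print(reassign_field_ids(schema))
# [{'id': 1, 'name': 'city'}, {'id': 2, 'name': 'lat'}]
```

This is why round-tripping a schema through `create_table` may return different field IDs than the ones passed in.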

0 commit comments
