Skip to content

Commit ca777d6

Browse files
committed
Added test for count() method and documentation for count()
1 parent 52d810e commit ca777d6

File tree

2 files changed

+97
-0
lines changed

2 files changed

+97
-0
lines changed

mkdocs/docs/recipe-count.md

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
---
2+
title: Count Recipe
3+
---
4+
5+
# Counting Rows in an Iceberg Table
6+
7+
This recipe demonstrates how to use the `count()` method to efficiently count the rows of an Iceberg table with PyIceberg.
8+
9+
## Basic Usage
10+
11+
To count all rows in a table:
12+
13+
```python
14+
from pyiceberg.catalog import load_catalog
15+
16+
catalog = load_catalog("default")
17+
table = catalog.load_table("default.cities")
18+
19+
row_count = table.scan().count()
20+
print(f"Total rows in table: {row_count}")
21+
```
22+
23+
## Count with a Filter
24+
25+
To count only rows matching a filter:
26+
27+
```python
28+
from pyiceberg.expressions import EqualTo
29+
30+
count = table.scan(row_filter=EqualTo("city", "Amsterdam")).count()
31+
print(f"Rows with city == 'Amsterdam': {count}")
32+
```
33+
34+
## Notes
35+
- The `count()` method works for both catalog and static tables.
36+
- Filters can be applied using the `scan` API for more granular counts.
37+
- Deleted records are excluded from the count.
38+
39+
For more details, see the [API documentation](api.md).

tests/table/test_count.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
import pytest
2+
from unittest.mock import MagicMock, Mock, patch
3+
from pyiceberg.table import DataScan
4+
from pyiceberg.expressions import AlwaysTrue
5+
6+
class DummyFile:
    """Minimal stand-in for a data file that exposes only ``record_count``."""

    def __init__(self, record_count):
        # The only attribute the dummy tasks in this module read from a file.
        self.record_count = record_count
9+
10+
class DummyTask:
    """Minimal stand-in for a scan task, returned by the mocked ``plan_files``.

    Args:
        record_count: Row count stored on the task's ``DummyFile``.
        residual: Residual filter expression; ``AlwaysTrue()`` when omitted.
        delete_files: Delete files attached to the task; an empty list when
            omitted (or when any falsy value is passed).
    """

    def __init__(self, record_count, residual=None, delete_files=None):
        self.file = DummyFile(record_count)
        # Only ``None`` triggers the default, so any explicit expression is kept.
        self.residual = residual if residual is not None else AlwaysTrue()
        # A fresh list per instance avoids sharing a mutable default.
        self.delete_files = delete_files or []
15+
16+
def test_count_basic():
    """A single planned task with 42 records makes ``count()`` return 42."""
    # The mock stands in for a DataScan (not a Table); ``spec`` limits it to
    # attributes that DataScan actually defines.
    scan = Mock(spec=DataScan)

    # plan_files is stubbed so no catalog, metadata or file IO is touched.
    scan.plan_files = MagicMock(
        return_value=[DummyTask(42, residual=AlwaysTrue(), delete_files=[])]
    )

    # Run the real DataScan.count implementation with the mock as ``self``.
    assert DataScan.count(scan) == 42
29+
30+
def test_count_empty():
    """With no planned tasks, ``count()`` returns 0."""
    # The mock stands in for a DataScan (not a Table); ``spec`` limits it to
    # attributes that DataScan actually defines.
    scan = Mock(spec=DataScan)

    # plan_files is stubbed to report that there is nothing to scan.
    scan.plan_files = MagicMock(return_value=[])

    # Run the real DataScan.count implementation with the mock as ``self``.
    assert DataScan.count(scan) == 0
42+
43+
def test_count_large():
    """Record counts of several planned tasks are summed by ``count()``."""
    # The mock stands in for a DataScan (not a Table); ``spec`` limits it to
    # attributes that DataScan actually defines.
    scan = Mock(spec=DataScan)

    # Two tasks of 500000 records each; plan_files is stubbed to return both.
    tasks = [
        DummyTask(500000, residual=AlwaysTrue(), delete_files=[]),
        DummyTask(500000, residual=AlwaysTrue(), delete_files=[]),
    ]
    scan.plan_files = MagicMock(return_value=tasks)

    # Run the real DataScan.count implementation with the mock as ``self``.
    assert DataScan.count(scan) == 1000000

0 commit comments

Comments
 (0)