# Source: python-bigquery-dataframes/bigframes/display/anywidget.py
# (googleapis/python-bigquery-dataframes @ c312169f63f89cfdb33eeeff0e507319289d8f39)
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Interactive, paginated table widget for BigFrames DataFrames."""

from __future__ import annotations

import dataclasses
from importlib import resources
import functools
import math
import threading
from typing import Any, Iterator, Optional
import uuid

import pandas as pd

import bigframes
from bigframes.core import blocks
import bigframes.dataframe
import bigframes.display.html
import bigframes.dtypes as dtypes

# anywidget and traitlets are optional extras. A failed import is recorded in
# a flag instead of propagating, and the widget base class falls back to
# ``object``. Keeping the optional packages' footprint this small makes unit
# testing easier and guards against them silently becoming hard requirements.
# NOTE(review): the TableWidget class body below still evaluates `traitlets.*`
# when the class statement runs — presumably importers of this module guard
# against the packages being absent; confirm.
try:
    import anywidget
    import traitlets
except Exception:
    _ANYWIDGET_INSTALLED = False
else:
    _ANYWIDGET_INSTALLED = True

# Base class for TableWidget: the real widget class when available, else object.
_WIDGET_BASE: type[Any] = anywidget.AnyWidget if _ANYWIDGET_INSTALLED else object


@dataclasses.dataclass(frozen=True)
class _SortState:
    """Immutable snapshot of the sort settings the current batches were built with."""

    # Name of the column sorted on; TableWidget passes "" when no sort is active.
    column: str
    # Value forwarded as ``ascending=`` to ``sort_values`` by TableWidget.
    ascending: bool


class TableWidget(_WIDGET_BASE):
    """An interactive, paginated table widget for BigFrames DataFrames.

    This widget provides a user-friendly way to display and navigate through
    large BigQuery DataFrames within a Jupyter environment.

    Rows are pulled lazily from ``DataFrame.to_pandas_batches`` and kept in an
    in-memory cache, so rendering a page that is already cached does not
    advance the batch iterator. Traits tagged ``sync=True`` are the state
    shared with the frontend.
    """

    # Zero-indexed page currently displayed.
    page = traitlets.Int(0).tag(sync=True)
    # Rows per page; stays 0 until ``__init__`` applies the display option.
    page_size = traitlets.Int(0).tag(sync=True)
    # Total number of rows, or None when it is unknown.
    row_count = traitlets.Int(allow_none=True, default_value=None).tag(sync=True)
    # Rendered HTML for the current page (or the error message markup).
    table_html = traitlets.Unicode("").tag(sync=True)
    # Column to sort by; the empty string means "no sort".
    sort_column = traitlets.Unicode("").tag(sync=True)
    sort_ascending = traitlets.Bool(True).tag(sync=True)
    # Names of the columns the frontend may offer for sorting.
    orderable_columns = traitlets.List(traitlets.Unicode(), []).tag(sync=True)
    # Flipped to True at the end of ``__init__``; also guards observers.
    _initial_load_complete = traitlets.Bool(False).tag(sync=True)
    # Batch source for the current sort order / page size (plain attribute).
    _batches: Optional[blocks.PandasBatches] = None
    # When set, rendered instead of the table.
    _error_message = traitlets.Unicode(allow_none=True, default_value=None).tag(
        sync=True
    )

    def __init__(self, dataframe: bigframes.dataframe.DataFrame):
        """Initialize the TableWidget.

        Args:
            dataframe: The Bigframes Dataframe to display in the widget.

        Raises:
            ImportError: If anywidget / traitlets could not be imported.
        """
        if not _ANYWIDGET_INSTALLED:
            raise ImportError(
                "Please `pip install anywidget traitlets` or "
                "`pip install 'bigframes[anywidget]'` to use TableWidget."
            )

        self._dataframe = dataframe

        super().__init__()

        # Initialize attributes that might be needed by observers first
        self._table_id = str(uuid.uuid4())
        self._all_data_loaded = False
        self._batch_iter: Optional[Iterator[pd.DataFrame]] = None
        self._cached_batches: list[pd.DataFrame] = []
        self._last_sort_state: Optional[_SortState] = None
        # Lock to ensure only one thread at a time is updating the table HTML.
        self._setting_html_lock = threading.Lock()

        # respect display options for initial page size
        initial_page_size = bigframes.options.display.max_rows

        # set traitlets properties that trigger observers
        # TODO(b/462525985): Investigate and improve TableWidget UX for DataFrames with a large number of columns.
        self.page_size = initial_page_size
        # TODO(b/469861913): Nested columns from structs (e.g., 'struct_col.name') are not currently sortable.
        # TODO(b/463754889): Support non-string column labels for sorting.
        if all(isinstance(col, str) for col in dataframe.columns):
            self.orderable_columns = [
                str(col_name)
                for col_name, dtype in dataframe.dtypes.items()
                if dtypes.is_orderable(dtype)
            ]
        else:
            self.orderable_columns = []

        self._initial_load()

        # Signals to the frontend that the initial data load is complete.
        # Also used as a guard to prevent observers from firing during initialization.
        self._initial_load_complete = True

    def _initial_load(self) -> None:
        """Get initial data and row count."""
        # obtain the row counts
        # TODO(b/428238610): Start iterating over the result of `to_pandas_batches()`
        # before we get here so that the count might already be cached.
        self._reset_batches_for_new_page_size()

        if self._batches is None:
            self._error_message = (
                "Could not retrieve data batches. Data might be unavailable or "
                "an error occurred."
            )
            self.row_count = None
        elif self._batches.total_rows is None:
            # Total rows is unknown, this is an expected state.
            # TODO(b/461536343): Cheaply discover if we have exactly 1 page.
            # There are cases where total rows is not set, but there are no additional
            # pages. We could disable the "next" button in these cases.
            self.row_count = None
        else:
            self.row_count = self._batches.total_rows

        # get the initial page
        self._set_table_html()

    @traitlets.observe("_initial_load_complete")
    def _on_initial_load_complete(self, change: dict[str, Any]) -> None:
        """Re-render the table once the initial load flag becomes true."""
        if change["new"]:
            self._set_table_html()

    @functools.cached_property
    def _esm(self):
        """Load JavaScript code from external file."""
        return resources.read_text(bigframes.display, "table_widget.js")

    @functools.cached_property
    def _css(self):
        """Load CSS code from external file."""
        return resources.read_text(bigframes.display, "table_widget.css")

    @traitlets.validate("page")
    def _validate_page(self, proposal: dict[str, Any]) -> int:
        """Validate and clamp the page number to a valid range.

        Args:
            proposal: A dictionary from the traitlets library containing the
                proposed change. The new value is in proposal["value"].

        Returns:
            The validated and clamped page number as an integer.

        Raises:
            ValueError: If the proposed page number is negative.
        """
        value = proposal["value"]

        if value < 0:
            raise ValueError("Page number cannot be negative.")

        # If truly empty or invalid page size, stay on page 0.
        # This handles cases where row_count is 0 or page_size is 0, preventing
        # division by zero or nonsensical pagination, regardless of row_count being None.
        if self.row_count == 0 or self.page_size == 0:
            return 0

        # If row count is unknown, allow any non-negative page. The previous check
        # ensures that invalid page_size (0) is already handled.
        if self.row_count is None:
            return value

        # Calculate the zero-indexed maximum page number.
        max_page = max(0, math.ceil(self.row_count / self.page_size) - 1)

        # Clamp the proposed value to the valid range [0, max_page].
        return max(0, min(value, max_page))

    @traitlets.validate("page_size")
    def _validate_page_size(self, proposal: dict[str, Any]) -> int:
        """Validate page size to ensure it's positive and reasonable.

        Args:
            proposal: A dictionary from the traitlets library containing the
                proposed change. The new value is in proposal["value"].

        Returns:
            The validated page size as an integer.
        """
        value = proposal["value"]

        # Ensure page size is positive and within reasonable bounds
        if value <= 0:
            return self.page_size  # Keep current value

        # Cap at reasonable maximum to prevent performance issues
        max_page_size = 1000
        return min(value, max_page_size)

    def _get_next_batch(self) -> bool:
        """
        Gets the next batch of data from the generator and appends to cache.

        Returns:
            True if a batch was successfully loaded, False otherwise.
        """
        if self._all_data_loaded:
            return False

        try:
            iterator = self._batch_iterator
            batch = next(iterator)
            self._cached_batches.append(batch)
            return True
        except StopIteration:
            self._all_data_loaded = True
            return False

    @property
    def _batch_iterator(self) -> Iterator[pd.DataFrame]:
        """Lazily initializes and returns the batch iterator."""
        if self._batch_iter is None:
            if self._batches is None:
                self._batch_iter = iter([])
            else:
                self._batch_iter = iter(self._batches)
        return self._batch_iter

    @property
    def _cached_data(self) -> pd.DataFrame:
        """Combine all cached batches into a single DataFrame."""
        if not self._cached_batches:
            return pd.DataFrame(columns=self._dataframe.columns)
        return pd.concat(self._cached_batches)

    def _reset_batch_cache(self) -> None:
        """Resets batch caching attributes."""
        self._cached_batches = []
        self._batch_iter = None
        self._all_data_loaded = False

    def _reset_batches_for_new_page_size(self) -> None:
        """Reset the batch iterator when page size changes.

        The batches are rebuilt from the unsorted DataFrame using the current
        ``page_size``.
        """
        self._batches = self._dataframe.to_pandas_batches(page_size=self.page_size)
        # These batches come from the unsorted frame. Recording that lets
        # ``_set_table_html`` reuse them when no sort is active instead of
        # immediately calling ``to_pandas_batches`` a second time; if a sort is
        # active, the mismatch makes it rebuild sorted batches as before.
        self._last_sort_state = _SortState("", True)

        self._reset_batch_cache()

    def _set_table_html(self) -> None:
        """Sets the current html data based on the current page and page size.

        Rebuilds the batches when the sort settings changed, fetches batches
        until the requested page is covered, and writes the rendered page to
        ``table_html``. If the page must change instead (sort changed while
        not on page 0, or the user paged past the end of data with an unknown
        row count), ``page`` is reassigned after the lock is released, which
        re-enters this method through the ``page`` observer.
        """
        # Function-scope import of the standard-library module, aliased so it
        # is not confused with ``bigframes.display.html`` used below.
        import html as html_lib

        new_page = None
        with self._setting_html_lock:
            if self._error_message:
                # Escape the message: it is a synced trait, so it must not be
                # trusted to be HTML-safe.
                self.table_html = (
                    f"<div class='bigframes-error-message'>"
                    f"{html_lib.escape(self._error_message)}</div>"
                )
                return

            # Reset batches when sorting changes
            sort_state = _SortState(self.sort_column, self.sort_ascending)
            if self._last_sort_state != sort_state:
                # Apply sorting if a column is selected. The sorted frame is
                # only needed when the batches are rebuilt.
                df_to_display = self._dataframe
                if self.sort_column:
                    # TODO(b/463715504): Support sorting by index columns.
                    df_to_display = df_to_display.sort_values(
                        by=self.sort_column, ascending=self.sort_ascending
                    )
                self._batches = df_to_display.to_pandas_batches(
                    page_size=self.page_size
                )
                self._reset_batch_cache()
                self._last_sort_state = sort_state
                if self.page != 0:
                    new_page = 0  # Reset to first page

            if new_page is None:
                start = self.page * self.page_size
                end = start + self.page_size

                # fetch more data if the requested page is outside our cache.
                # Track the cached row count incrementally so the batches are
                # concatenated once, not after every fetched batch.
                cached_rows = sum(len(batch) for batch in self._cached_batches)
                while cached_rows < end and self._get_next_batch():
                    cached_rows += len(self._cached_batches[-1])
                cached_data = self._cached_data

                # Get the data for the current page
                page_data = cached_data.iloc[start:end].copy()

                # Handle case where user navigated beyond available data with unknown row count
                is_unknown_count = self.row_count is None
                is_beyond_data = (
                    self._all_data_loaded and len(page_data) == 0 and self.page > 0
                )
                if is_unknown_count and is_beyond_data:
                    # Calculate the last valid page (zero-indexed)
                    total_rows = len(cached_data)
                    last_valid_page = max(0, math.ceil(total_rows / self.page_size) - 1)
                    if self.page != last_valid_page:
                        new_page = last_valid_page

            if new_page is None:
                # Handle index display
                if self._dataframe._block.has_index:
                    is_unnamed_single_index = (
                        page_data.index.name is None
                        and not isinstance(page_data.index, pd.MultiIndex)
                    )
                    page_data = page_data.reset_index()
                    if is_unnamed_single_index and "index" in page_data.columns:
                        page_data.rename(columns={"index": ""}, inplace=True)

                # Default index - include as "Row" column if no index was present originally
                # NOTE(review): DataFrame.insert raises ValueError when a "Row"
                # column already exists — confirm whether such frames can
                # reach this branch.
                if not self._dataframe._block.has_index:
                    page_data.insert(
                        0, "Row", range(start + 1, start + len(page_data) + 1)
                    )

                # Generate HTML table
                self.table_html = bigframes.display.html.render_html(
                    dataframe=page_data,
                    table_id=f"table-{self._table_id}",
                )

        if new_page is not None:
            # Navigate to the new page. This triggers the observer, which will
            # re-enter _set_table_html. Since we've released the lock, this is safe.
            self.page = new_page

    @traitlets.observe("sort_column", "sort_ascending")
    def _sort_changed(self, _change: dict[str, Any]) -> None:
        """Handler for when sorting parameters change from the frontend."""
        self._set_table_html()

    @traitlets.observe("page")
    def _page_changed(self, _change: dict[str, Any]) -> None:
        """Handler for when the page number is changed from the frontend."""
        if not self._initial_load_complete:
            return
        self._set_table_html()

    @traitlets.observe("page_size")
    def _page_size_changed(self, _change: dict[str, Any]) -> None:
        """Handler for when the page size is changed from the frontend."""
        if not self._initial_load_complete:
            return
        # Reset the page to 0 when page size changes to avoid invalid page states
        self.page = 0
        # Reset the sort state to default (no sort)
        self.sort_column = ""
        self.sort_ascending = True

        # Reset batches to use new page size for future data fetching
        self._reset_batches_for_new_page_size()

        # Update the table display
        self._set_table_html()