Fix progress bar against non-UTF-8 encodings

LeSingh1 · LeSingh1 · commit 7c3dd30dfd12 · 2026-05-18T16:29:36.000-07:00
For #439. `file_progress()` sets the progress bar length to the file size in bytes (`os.path.getsize(file.name)`), but `UpdateWrapper.__iter__` called `update(len(line))`, which is the decoded character count. With UTF-16-LE input every character is 2 bytes, so the bar capped at 50%; UTF-32 capped at 25%; etc. Simon noted in the issue that the obvious fix (calling `.tell()` on the wrapped text stream) doesn't work because text mode disables it during iteration. The underlying binary buffer doesn't have that restriction though, so this tracks progress against `TextIOWrapper.buffer.tell()` when the wrapped object exposes one. For raw binary streams (no `.buffer` attribute) we keep the old behaviour, which was already byte-accurate. Added six regression tests in tests/test_utils.py covering UTF-8, UTF-16-LE, BOM-prefixed UTF-16, the sniff-style `BufferedReader` chain, a raw binary fallback, and the `.read()` path used by the JSON loader. Each asserts that the sum of update() calls equals the on-disk file size, which is what `click.progressbar` needs to reach 100%.
diff --git a/sqlite_utils/utils.py b/sqlite_utils/utils.py
@@ -215,15 +215,52 @@ class UpdateWrapper:
     def __init__(self, wrapped: io.IOBase, update: Callable[[int], None]) -> None:
         self._wrapped = wrapped
         self._update = update
+        # `file_progress` sets the progress bar length to the file size in
+        # bytes, but iterating a text-mode stream yields decoded characters,
+        # so reporting `len(line)` undercounts for any multi-byte encoding
+        # (UTF-16-LE caps the bar at ~50%, UTF-32 at ~25%, etc.). When the
+        # wrapped object is a text wrapper, track progress against the
+        # underlying binary buffer's position instead. See #439.
+        self._byte_source = getattr(wrapped, "buffer", None)
+        self._last_byte_pos = 0
+        if self._byte_source is not None:
+            try:
+                self._last_byte_pos = self._byte_source.tell()
+            except (io.UnsupportedOperation, OSError):
+                self._byte_source = None
+
+    def _advance_to_buffer_pos(self) -> None:
+        # Bring the progress bar up to the current byte position of the
+        # underlying binary buffer (which may have read ahead).
+        assert self._byte_source is not None
+        try:
+            pos = self._byte_source.tell()
+        except OSError:
+            return
+        delta = pos - self._last_byte_pos
+        if delta > 0:
+            self._update(delta)
+            self._last_byte_pos = pos
 
     def __iter__(self) -> Iterator[bytes]:
+        if self._byte_source is None:
+            for line in self._wrapped:
+                self._update(len(line))
+                yield line
+            return
         for line in self._wrapped:
-            self._update(len(line))
+            self._advance_to_buffer_pos()
             yield line
+        # The wrapper may have buffered the last chunk without emitting any
+        # more lines; flush the remaining bytes so the bar reaches 100%.
+        self._advance_to_buffer_pos()
 
     def read(self, size: int = -1) -> bytes:
         data = self._wrapped.read(size)
-        self._update(len(data))
+        if self._byte_source is not None:
+            self._advance_to_buffer_pos()
+        else:
+            self._update(len(data))
         return data
 
 
diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -83,3 +83,96 @@ def test_maximize_csv_field_size_limit():
 )
 def test_flatten(input, expected):
     assert utils.flatten(input) == expected
+
+
+# Regression tests for #439: progress bar against multi-byte encodings
+
+
+def _collect_updates(rows):
+    """Iterate the wrapper, capturing every update() value."""
+    return list(rows)
+
+
+def _make_temp(content_bytes, tmp_path, name):
+    path = tmp_path / name
+    path.write_bytes(content_bytes)
+    return path
+
+
+def test_updatewrapper_utf8_reports_byte_lengths(tmp_path):
+    # Sanity: ASCII / UTF-8 still hits 100% (this was already correct,
+    # but we want a baseline to protect.)
+    raw = b"a,b\n1,2\n3,4\n"
+    path = _make_temp(raw, tmp_path, "in.csv")
+    updates = []
+    with open(path, "rb") as fp:
+        wrapper = utils.UpdateWrapper(io.TextIOWrapper(fp, encoding="utf-8"), updates.append)
+        _collect_updates(wrapper)
+    assert sum(updates) == len(raw)
+
+
+def test_updatewrapper_utf16le_reports_byte_lengths(tmp_path):
+    # Without the fix this test fails: the bar only reaches len(decoded)
+    # which is half the raw byte length for UTF-16-LE.
+    raw = "a,b\n1,2\n3,4\n".encode("utf-16-le")
+    path = _make_temp(raw, tmp_path, "in.csv")
+    updates = []
+    with open(path, "rb") as fp:
+        wrapper = utils.UpdateWrapper(io.TextIOWrapper(fp, encoding="utf-16-le"), updates.append)
+        _collect_updates(wrapper)
+    assert sum(updates) == len(raw)
+
+
+def test_updatewrapper_utf16le_with_bom_reaches_total_bytes(tmp_path):
+    # BOM-prefixed UTF-16. The BOM byte is consumed by the TextIOWrapper
+    # before iteration starts; we should still account for the full file
+    # size so the bar reaches 100%.
+    raw = "﻿" + "a,b\n1,2\n3,4\n"
+    raw_bytes = raw.encode("utf-16-le")
+    path = _make_temp(raw_bytes, tmp_path, "in.csv")
+    updates = []
+    with open(path, "rb") as fp:
+        wrapper = utils.UpdateWrapper(io.TextIOWrapper(fp, encoding="utf-16"), updates.append)
+        _collect_updates(wrapper)
+    assert sum(updates) == len(raw_bytes)
+
+
+def test_updatewrapper_through_buffered_reader(tmp_path):
+    # The --sniff path wraps the raw file in io.BufferedReader before the
+    # TextIOWrapper. Progress reporting must still resolve to the binary
+    # file's byte count.
+    raw = "a,b\n1,2\n3,4\n".encode("utf-16-le")
+    path = _make_temp(raw, tmp_path, "in.csv")
+    updates = []
+    with open(path, "rb") as fp:
+        buffered = io.BufferedReader(fp, buffer_size=4096)
+        wrapper = utils.UpdateWrapper(
+            io.TextIOWrapper(buffered, encoding="utf-16-le"), updates.append
+        )
+        _collect_updates(wrapper)
+    assert sum(updates) == len(raw)
+
+
+def test_updatewrapper_binary_file_unchanged(tmp_path):
+    # If the wrapped object is itself a raw binary file (no .buffer attr),
+    # we should keep the old behaviour: iterate yields bytes and len() is
+    # already the byte count.
+    raw = b"a,b\n1,2\n3,4\n"
+    path = _make_temp(raw, tmp_path, "in.csv")
+    updates = []
+    with open(path, "rb") as fp:
+        wrapper = utils.UpdateWrapper(fp, updates.append)
+        _collect_updates(wrapper)
+    assert sum(updates) == len(raw)
+
+
+def test_updatewrapper_read_path_utf16le(tmp_path):
+    # The .read() path is used by the JSON loader (not the CSV iterator),
+    # but must agree with the iterator path on byte accounting.
+    raw = '{"a": 1}'.encode("utf-16-le")
+    path = _make_temp(raw, tmp_path, "in.json")
+    updates = []
+    with open(path, "rb") as fp:
+        wrapper = utils.UpdateWrapper(io.TextIOWrapper(fp, encoding="utf-16-le"), updates.append)
+        wrapper.read()
+    assert sum(updates) == len(raw)