Skip to content

Commit 95942ad

Browse files
Eli
authored and committed
Further enforce utf-8
1 parent a0865a9 commit 95942ad

2 files changed

Lines changed: 72 additions & 42 deletions

File tree

dms_datastore/read_ts.py

Lines changed: 58 additions & 37 deletions
Original file line number | Diff line number | Diff line change
@@ -1629,6 +1629,13 @@ def infer_freq_robust(
16291629
)
16301630
return f
16311631

1632+
def _decode_context(path, pos, window=32):
1633+
with open(path, "rb") as f:
1634+
start = max(0, pos - window)
1635+
f.seek(start)
1636+
chunk = f.read(2 * window)
1637+
return " ".join(f"{b:02x}" for b in chunk)
1638+
16321639

16331640
def csv_retrieve_ts(
16341641
fpath_pattern,
@@ -1767,26 +1774,31 @@ def csv_retrieve_ts(
17671774
dset = None # to prevent holdover in memory
17681775
if column_names is None:
17691776
# warnings.filterwarnings('error')
1770-
# try:
1771-
dset = pd.read_csv(
1772-
m,
1773-
index_col=indexcol,
1774-
header=header,
1775-
skiprows=skiprows_spec,
1776-
sep=sep,
1777-
parse_dates=parsedates,
1778-
date_format=dateformat,
1779-
na_values=extra_na,
1780-
keep_default_na=True,
1781-
dtype=coltypes,
1782-
skipinitialspace=True,
1783-
nrows=nrows,
1784-
**dargs,
1785-
)
1786-
# print(dset.tail())
1787-
# except:
1788-
# print(f"Warning for: {m}")
1789-
# # reformat --inpath raw --outpath formatted
1777+
try:
1778+
dset = pd.read_csv(
1779+
m,
1780+
index_col=indexcol,
1781+
header=header,
1782+
skiprows=skiprows_spec,
1783+
sep=sep,
1784+
parse_dates=parsedates,
1785+
date_format=dateformat,
1786+
na_values=extra_na,
1787+
keep_default_na=True,
1788+
dtype=coltypes,
1789+
skipinitialspace=True,
1790+
nrows=nrows,
1791+
**dargs,
1792+
)
1793+
except UnicodeDecodeError as e:
1794+
context = _decode_context(m, e.start)
1795+
raise RuntimeError(
1796+
f"Invalid UTF-8 in file: {m}\n"
1797+
f"Byte offset: {e.start}\n"
1798+
f"Nearby bytes: {context}\n\n"
1799+
"File is likely cp1252/latin-1 encoded."
1800+
) from e
1801+
17901802

17911803
if header is None:
17921804
# This is essentially a fixup for vtide, which I'm not
@@ -1801,23 +1813,32 @@ def csv_retrieve_ts(
18011813
dset.columns = [x.strip() for x in dset.columns]
18021814

18031815
else:
1804-
dset = pd.read_csv(
1805-
m,
1806-
index_col=indexcol,
1807-
header=header,
1808-
skiprows=skiprows_spec,
1809-
sep=sep,
1810-
parse_dates=parsedates,
1811-
date_format=dateformat,
1812-
na_values=extra_na,
1813-
keep_default_na=True,
1814-
dtype=coltypes,
1815-
names=column_names,
1816-
skipinitialspace=True,
1817-
nrows=nrows,
1818-
**dargs,
1819-
)
1820-
1816+
try:
1817+
dset = pd.read_csv(
1818+
m,
1819+
index_col=indexcol,
1820+
header=header,
1821+
skiprows=skiprows_spec,
1822+
sep=sep,
1823+
parse_dates=parsedates,
1824+
date_format=dateformat,
1825+
na_values=extra_na,
1826+
keep_default_na=True,
1827+
dtype=coltypes,
1828+
names=column_names,
1829+
skipinitialspace=True,
1830+
nrows=nrows,
1831+
**dargs,
1832+
)
1833+
except UnicodeDecodeError as e:
1834+
context = _decode_context(m, e.start)
1835+
raise RuntimeError(
1836+
f"Invalid UTF-8 in file: {m}\n"
1837+
f"Byte offset: {e.start}\n"
1838+
f"Nearby bytes: {context}\n\n"
1839+
"File is likely cp1252/latin-1 encoded."
1840+
) from e
1841+
18211842
if qaqc_selector is not None:
18221843
# It is costly to try to handle blanks differently for both data
18231844
# (for which we usually want blanks to be NaN and alphanumeric flags.

dms_datastore/reconcile_data.py

Lines changed: 14 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -137,22 +137,31 @@ class VettedWrite:
137137

138138

139139
def file_empty(fname: str, comment: str = "#") -> bool:
140-
"""Check if a CSV file is empty or contains only comments/blank lines."""
140+
"""Check if a CSV file is empty or contains only comments/blank lines.
141+
142+
This function is intentionally binary-safe so it can be used even when a file
143+
has an encoding problem.
144+
"""
141145
if not os.path.exists(fname):
142146
return True
143147

144148
if os.path.getsize(fname) == 0:
145149
return True
146150

147-
with open(fname, "r", encoding="utf-8") as f:
151+
comment_b = comment.encode("ascii")
152+
153+
with open(fname, "rb") as f:
148154
for line in f:
149-
if not line.strip().startswith(comment) and line.strip():
150-
return False
155+
stripped = line.strip()
156+
if not stripped:
157+
continue
158+
if stripped.startswith(comment_b):
159+
continue
160+
return False
151161

152162
return True
153163

154164

155-
156165
def _quarantine_file(fname, quarantine_dir="quarantine"):
157166
if not os.path.exists(quarantine_dir):
158167
os.makedirs(quarantine_dir)

0 commit comments

Comments
 (0)