@@ -1629,6 +1629,13 @@ def infer_freq_robust(
16291629 )
16301630 return f
16311631
1632+ def _decode_context (path , pos , window = 32 ):
1633+ with open (path , "rb" ) as f :
1634+ start = max (0 , pos - window )
1635+ f .seek (start )
1636+ chunk = f .read (2 * window )
1637+ return " " .join (f"{ b :02x} " for b in chunk )
1638+
16321639
16331640def csv_retrieve_ts (
16341641 fpath_pattern ,
@@ -1767,26 +1774,31 @@ def csv_retrieve_ts(
17671774 dset = None # to prevent holdover in memory
17681775 if column_names is None :
17691776 # warnings.filterwarnings('error')
1770- # try:
1771- dset = pd .read_csv (
1772- m ,
1773- index_col = indexcol ,
1774- header = header ,
1775- skiprows = skiprows_spec ,
1776- sep = sep ,
1777- parse_dates = parsedates ,
1778- date_format = dateformat ,
1779- na_values = extra_na ,
1780- keep_default_na = True ,
1781- dtype = coltypes ,
1782- skipinitialspace = True ,
1783- nrows = nrows ,
1784- ** dargs ,
1785- )
1786- # print(dset.tail())
1787- # except:
1788- # print(f"Warning for: {m}")
1789- # # reformat --inpath raw --outpath formatted
1777+ try :
1778+ dset = pd .read_csv (
1779+ m ,
1780+ index_col = indexcol ,
1781+ header = header ,
1782+ skiprows = skiprows_spec ,
1783+ sep = sep ,
1784+ parse_dates = parsedates ,
1785+ date_format = dateformat ,
1786+ na_values = extra_na ,
1787+ keep_default_na = True ,
1788+ dtype = coltypes ,
1789+ skipinitialspace = True ,
1790+ nrows = nrows ,
1791+ ** dargs ,
1792+ )
1793+ except UnicodeDecodeError as e :
1794+ context = _decode_context (m , e .start )
1795+ raise RuntimeError (
1796+ f"Invalid UTF-8 in file: { m } \n "
1797+ f"Byte offset: { e .start } \n "
1798+ f"Nearby bytes: { context } \n \n "
1799+ "File is likely cp1252/latin-1 encoded."
1800+ ) from e
1801+
17901802
17911803 if header is None :
17921804 # This is essentially a fixup for vtide, which I'm not
@@ -1801,23 +1813,32 @@ def csv_retrieve_ts(
18011813 dset .columns = [x .strip () for x in dset .columns ]
18021814
18031815 else :
1804- dset = pd .read_csv (
1805- m ,
1806- index_col = indexcol ,
1807- header = header ,
1808- skiprows = skiprows_spec ,
1809- sep = sep ,
1810- parse_dates = parsedates ,
1811- date_format = dateformat ,
1812- na_values = extra_na ,
1813- keep_default_na = True ,
1814- dtype = coltypes ,
1815- names = column_names ,
1816- skipinitialspace = True ,
1817- nrows = nrows ,
1818- ** dargs ,
1819- )
1820-
1816+ try :
1817+ dset = pd .read_csv (
1818+ m ,
1819+ index_col = indexcol ,
1820+ header = header ,
1821+ skiprows = skiprows_spec ,
1822+ sep = sep ,
1823+ parse_dates = parsedates ,
1824+ date_format = dateformat ,
1825+ na_values = extra_na ,
1826+ keep_default_na = True ,
1827+ dtype = coltypes ,
1828+ names = column_names ,
1829+ skipinitialspace = True ,
1830+ nrows = nrows ,
1831+ ** dargs ,
1832+ )
1833+ except UnicodeDecodeError as e :
1834+ context = _decode_context (m , e .start )
1835+ raise RuntimeError (
1836+ f"Invalid UTF-8 in file: { m } \n "
1837+ f"Byte offset: { e .start } \n "
1838+ f"Nearby bytes: { context } \n \n "
1839+ "File is likely cp1252/latin-1 encoded."
1840+ ) from e
1841+
18211842 if qaqc_selector is not None :
18221843 # It is costly to try to handle blanks differently for both data
18231844 # (for which we usually want blanks to be NaN and alphanumeric flags.
0 commit comments