diff --git a/DEVELOPERS.md b/DEVELOPERS.md index 0b7aa22e..06e85708 100644 --- a/DEVELOPERS.md +++ b/DEVELOPERS.md @@ -29,9 +29,16 @@ python -m build --outdir ./build ``` sphinx-build -W -b html ./docs ./build/docs ``` - +## Install deps for running tests +``` +pip install -e . +``` ## Run tests ``` tox ``` +or +``` +pytest +``` diff --git a/dev-requirements.txt b/dev-requirements.txt index 03be9b18..b55238f6 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -6,4 +6,5 @@ pydata-sphinx-theme == 0.11.0 pytest pytest-cov sphinx == 5.3.0 -myst-parser<2.0 \ No newline at end of file +myst-parser<2.0 +tox diff --git a/fourinsight/engineroom/utils/_core.py b/fourinsight/engineroom/utils/_core.py index 13f370ec..ec4585ce 100644 --- a/fourinsight/engineroom/utils/_core.py +++ b/fourinsight/engineroom/utils/_core.py @@ -45,7 +45,7 @@ def pull(self, raise_on_missing=True): current_pos = self.tell() self.seek(0) try: - characters_written = self._pull() + self._pull() except self._SOURCE_NOT_FOUND_ERROR as e: if raise_on_missing: self.seek(current_pos) @@ -53,7 +53,9 @@ def pull(self, raise_on_missing=True): else: self.truncate(0) else: - self.truncate(characters_written) + self.flush() + pos = self.tell() + self.truncate(pos) def push(self): """ @@ -143,11 +145,11 @@ def __repr__(self): return f"LocalFileHandler {self._path.resolve()}" def _pull(self): - return self.write(open(self._path, mode="r").read()) + return self.write(self._path.read_text(encoding=self.encoding)) def _push(self): self._path.parent.mkdir(parents=True, exist_ok=True) - with open(self._path, mode="w") as f: + with open(self._path, mode="w", encoding=self.encoding) as f: f.write(self.getvalue()) @@ -343,6 +345,7 @@ def __init__(self, headers, handler=None, indexing_mode="auto"): raise ValueError("Indexing mode must be 'auto' or 'timestamp'.") self._dataframe = pd.DataFrame(columns=headers.keys()).astype(self._headers) + self.encoding = getattr(self._handler, "encoding", "utf-8") 
def __repr__(self): return repr(self._dataframe) @@ -470,6 +473,7 @@ def pull(self, raise_on_missing=True, strict=True): parse_dates=True, dtype=self._headers, date_format="ISO8601", + encoding=self.encoding, ) if strict and set(df_source.columns) != set(self._headers.keys()): @@ -501,11 +505,19 @@ def push(self): self._handler.truncate() try: self._dataframe.to_csv( - self._handler, sep=",", index=True, lineterminator="\n" + self._handler, + sep=",", + index=True, + lineterminator="\n", + encoding=self.encoding, ) except TypeError: # for backward compatibility (remove after 2024-06-01) self._dataframe.to_csv( - self._handler, sep=",", index=True, line_terminator="\n" + self._handler, + sep=",", + index=True, + line_terminator="\n", + encoding=self.encoding, ) self._handler.push() diff --git a/tests/test_core.py b/tests/test_core.py index ed3b59bd..e4601821 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1239,6 +1239,60 @@ def test_delete_rows_truncate_int_both_none(self): pd.testing.assert_frame_equal(df_out, df_expect) + def test_csv_parsing_matches_pandas(self): + header_names = [ + "OrganizationName", + "timestamp", + "timestamp_end", + "dcount_ExternalId", + "serviceAccount", + ] + + file_name = Path(__file__).parent / "testdata/drio_sdk_usage_mod.csv" + + headers = {header: str for header in header_names} + handler = LocalFileHandler(file_name) + collector = ResultCollector(headers, handler=handler) + collector.pull(raise_on_missing=True, strict=True) + df = collector.dataframe + + df_expected = pd.read_csv( + file_name, index_col=0, encoding="utf-8", dtype=headers + ) + + assert ( + df_expected.iloc[-1]["dcount_ExternalId"] + == df.iloc[-1]["dcount_ExternalId"] + ) + + assert df_expected.iloc[0]["OrganizationName"] == df.iloc[0]["OrganizationName"] + + def test_parsing_norwegian_letters(self): + header_names = [ + "OrganizationName", + "timestamp", + "timestamp_end", + "dcount_ExternalId", + "serviceAccount", + ] + + file_name = 
Path(__file__).parent / "testdata/drio_sdk_usage_mod2.csv" + + headers = {header: str for header in header_names} + handler = LocalFileHandler(file_name) + collector = ResultCollector(headers, handler=handler) + collector.pull(raise_on_missing=True, strict=True) + df = collector.dataframe + + df_expected = pd.read_csv( + file_name, + index_col=0, + dtype=headers, + encoding="utf-8", + ) + + assert df_expected.iloc[-1]["serviceAccount"] == df.iloc[-1]["serviceAccount"] + def test__build_download_url(previous_file_names): app_id = "12345" diff --git a/tests/testdata/drio_sdk_usage_mod.csv b/tests/testdata/drio_sdk_usage_mod.csv new file mode 100644 index 00000000..c5ccc091 --- /dev/null +++ b/tests/testdata/drio_sdk_usage_mod.csv @@ -0,0 +1,7 @@ +,OrganizationName,timestamp,timestamp_end,dcount_ExternalId,serviceAccount +851,Vår Energi,2025-12-01 00:00:00+00:00,2025-12-31 00:00:00+00:00,1,False +855,Subsea 7,2025-12-01 00:00:00+00:00,2025-12-31 00:00:00+00:00,7,False +873,Subsea 7,2025-12-01 00:00:00+00:00,2025-12-31 00:00:00+00:00,7,False +874,Vår Energi,2025-12-01 00:00:00+00:00,2025-12-31 00:00:00+00:00,1,False +879,4Subsea,2026-01-01 00:00:00+00:00,2026-01-31 00:00:00+00:00,1,False +880,Unknown,2026-01-01 00:00:00+00:00,2026-01-31 00:00:00+00:00,1, diff --git a/tests/testdata/drio_sdk_usage_mod2.csv b/tests/testdata/drio_sdk_usage_mod2.csv new file mode 100644 index 00000000..5dc67823 --- /dev/null +++ b/tests/testdata/drio_sdk_usage_mod2.csv @@ -0,0 +1,7 @@ +,OrganizationName,timestamp,timestamp_end,dcount_ExternalId,serviceAccount +851,Vår Energi,2025-12-01 00:00:00+00:00,2025-12-31 00:00:00+00:00,1,False +855,Subsea 7,2025-12-01 00:00:00+00:00,2025-12-31 00:00:00+00:00,7,False +873,Subsea 7,2025-12-01 00:00:00+00:00,2025-12-31 00:00:00+00:00,7,False +874,Vår Energi,2025-12-01 00:00:00+00:00,2025-12-31 00:00:00+00:00,1,False +879,4Subsea,2026-01-01 00:00:00+00:00,2026-01-31 00:00:00+00:00,1,False +880,Unknown,2026-01-01 00:00:00+00:00,2026-01-31 
00:00:00+00:00,1,False