Skip to content

Commit 30e0bff

Browse files
authored
Resolve timezone alias links via tzdata.zi when loading transition tables (rapidsai#22293)
`make_timezone_transition_table` fails for IANA alias zones like `US/Pacific` that modern tzdata ships only as `Link` entries in `tzdata.zi` rather than as standalone TZif files. As a result, the ORC reader fails to read non-UTC files on some systems. With this PR, when the direct open fails, libcudf parses `tzdata.zi` and follows `Link` entries to find the name of a real TZif file. Added C++ tests for the timezone API (none existed previously) and expanded the Python tests to cover the code added in this PR. Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Bradley Dice (https://github.com/bdice) - Muhammad Haseeb (https://github.com/mhaseeb123) - Matthew Roeschke (https://github.com/mroeschke) - Shruti Shivakumar (https://github.com/shrshi) URL: rapidsai#22293
1 parent addd9c9 commit 30e0bff

5 files changed

Lines changed: 368 additions & 13 deletions

File tree

cpp/include/cudf/timezone.hpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION.
2+
* SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
33
* SPDX-License-Identifier: Apache-2.0
44
*/
55
#pragma once
@@ -33,6 +33,10 @@ static constexpr uint32_t solar_cycle_entry_count = 2 * solar_cycle_years;
3333
* transitions per year from Daylight Saving Time. If the timezone does not have DST, the table will
3434
* still include the future entries, which will all have the same offset.
3535
*
36+
* If `timezone_name` does not resolve to a TZif file in `tzif_dir`, the directory's `tzdata.zi`
37+
* index file is consulted (if present) and `Link` entries are followed to a canonical zone name
38+
* whose TZif file exists on disk.
39+
*
3640
* @param tzif_dir The directory where the TZif files are located
3741
* @param timezone_name standard timezone name (for example, "America/Los_Angeles")
3842
* @param stream CUDA stream used for device memory operations and kernel launches

cpp/src/datetime/timezone.cpp

Lines changed: 91 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* SPDX-FileCopyrightText: Copyright (c) 2018-2024, NVIDIA CORPORATION.
2+
* SPDX-FileCopyrightText: Copyright (c) 2018-2026, NVIDIA CORPORATION.
33
* SPDX-License-Identifier: Apache-2.0
44
*/
55
#include <cudf/detail/nvtx/ranges.hpp>
@@ -10,6 +10,12 @@
1010
#include <algorithm>
1111
#include <filesystem>
1212
#include <fstream>
13+
#include <optional>
14+
#include <string>
15+
#include <string_view>
16+
#include <system_error>
17+
#include <unordered_map>
18+
#include <unordered_set>
1319

1420
namespace cudf {
1521

@@ -18,6 +24,67 @@ namespace {
1824
constexpr uint32_t tzif_magic = ('T' << 0) | ('Z' << 8) | ('i' << 16) | ('f' << 24);
1925
std::string const tzif_system_directory = "/usr/share/zoneinfo/";
2026

27+
/**
 * @brief Resolves a timezone alias via `Link` entries in `<tzif_dir>/tzdata.zi`.
 *
 * Each `Link TARGET LINK-NAME` line (also accepted in the abbreviated `L` form) maps the alias
 * `LINK-NAME` to `TARGET`. Starting from `timezone_name`, the chain of links is followed and the
 * first name along it that refers to a regular file in `tzif_dir` is returned. Stopping at the
 * first usable file means an alias still resolves when a later link in the chain points to a
 * zone that is not installed.
 *
 * @param tzif_dir Directory that holds the TZif files and the `tzdata.zi` index
 * @param timezone_name Alias to resolve; its own file is not probed, only the names it links to
 * @return Name (relative to `tzif_dir`) of the first link target that is a regular file, or
 * `std::nullopt` if `tzdata.zi` cannot be opened, `timezone_name` has no `Link` entry, the chain
 * loops back on itself, or no target in the chain exists as a regular file
 */
std::optional<std::string> resolve_tz_alias(std::filesystem::path const& tzif_dir,
                                            std::string_view timezone_name)
{
  std::ifstream fin{tzif_dir / "tzdata.zi"};
  if (!fin) { return std::nullopt; }

  // alias -> target, one entry per `Link` line; the first definition of an alias wins
  std::unordered_map<std::string, std::string> links;
  for (std::string line; std::getline(fin, line);) {
    // Handle CRLF line endings: `std::getline` keeps a trailing `\r`.
    if (!line.empty() && line.back() == '\r') { line.pop_back(); }

    // Pops the next space/tab-delimited token off the front of `v`; empty when none is left.
    std::string_view v{line};
    auto const take_token = [&v]() -> std::string_view {
      auto const ws_start = v.find_first_not_of(" \t");
      if (ws_start == std::string_view::npos) {
        v = {};
        return {};
      }
      v.remove_prefix(ws_start);
      auto const tok_end = v.find_first_of(" \t");
      auto const tok     = v.substr(0, tok_end);
      v.remove_prefix(tok_end == std::string_view::npos ? v.size() : tok_end);
      return tok;
    };

    auto const directive = take_token();
    if (directive.empty() || directive.front() == '#') { continue; }
    if (directive != "Link" && directive != "L") { continue; }

    auto const target = take_token();
    auto const alias  = take_token();
    if (target.empty() || alias.empty()) { continue; }
    links.emplace(std::string{alias}, std::string{target});
  }

  // Walk the alias chain, detecting cycles via the set of names already visited.
  std::string name{timezone_name};
  std::unordered_set<std::string> visited{name};
  for (auto it = links.find(name); it != links.end(); it = links.find(name)) {
    // Cycle in tzdata.zi; bail out instead of looping.
    if (!visited.insert(it->second).second) { return std::nullopt; }
    name = it->second;

    // Only a regular file can be parsed as TZif; a directory of the same name must not match.
    std::error_code ec;
    if (std::filesystem::is_regular_file(tzif_dir / name, ec)) { return name; }
  }

  // Either no link was followed or nothing along the chain exists on disk.
  return std::nullopt;
}
87+
2188
#pragma pack(push, 1)
2289
/**
2390
* @brief 32-bit TZif header
@@ -120,12 +187,30 @@ struct timezone_file {
120187
{
121188
using std::ios_base;
122189

123-
// Open the input file
124-
auto const tz_filename =
125-
std::filesystem::path{tzif_dir.value_or(tzif_system_directory)} / timezone_name;
190+
auto const tz_dir = std::filesystem::path{tzif_dir.value_or(tzif_system_directory)};
191+
auto const tz_filename = tz_dir / timezone_name;
192+
auto const open_flags = ios_base::in | ios_base::binary | ios_base::ate;
126193
std::ifstream fin;
127-
fin.open(tz_filename, ios_base::in | ios_base::binary | ios_base::ate);
128-
CUDF_EXPECTS(fin, "Failed to open the timezone file '" + tz_filename.string() + "'");
194+
fin.open(tz_filename, open_flags);
195+
196+
// Fall back to resolving the alias through the tzdata.zi index file
197+
if (!fin) {
198+
if (auto const resolved = resolve_tz_alias(tz_dir, timezone_name)) {
199+
fin.clear();
200+
fin.open(tz_dir / *resolved, open_flags);
201+
}
202+
}
203+
if (!fin) {
204+
auto err = std::string{"Failed to open the timezone file '"} + tz_filename.string() + "'";
205+
std::error_code ec;
206+
if (!std::filesystem::exists(tz_dir / "tzdata.zi", ec)) {
207+
err += " (no tzdata.zi present in '" + tz_dir.string() + "' to resolve aliases)";
208+
} else {
209+
err += " (no matching Link entry in '" + (tz_dir / "tzdata.zi").string() +
210+
"', or its target is also missing)";
211+
}
212+
CUDF_FAIL(err);
213+
}
129214
auto const file_size = fin.tellg();
130215
fin.seekg(0);
131216

cpp/tests/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,7 @@ ConfigureTest(IS_SORTED_TEST sort/is_sorted_tests.cpp)
186186
# ##################################################################################################
187187
# * datetime tests --------------------------------------------------------------------------------
188188
ConfigureTest(DATETIME_OPS_TEST datetime/datetime_ops_test.cpp)
189+
ConfigureTest(TIMEZONE_TEST datetime/timezone_test.cpp)
189190

190191
# ##################################################################################################
191192
# * hashing tests ---------------------------------------------------------------------------------
Lines changed: 231 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,231 @@
1+
/*
2+
* SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
6+
#include <cudf_test/base_fixture.hpp>
7+
#include <cudf_test/table_utilities.hpp>
8+
#include <cudf_test/testing_main.hpp>
9+
10+
#include <cudf/table/table.hpp>
11+
#include <cudf/timezone.hpp>
12+
#include <cudf/utilities/error.hpp>
13+
14+
#include <array>
15+
#include <cerrno>
16+
#include <cstdlib>
17+
#include <cstring>
18+
#include <filesystem>
19+
#include <fstream>
20+
#include <optional>
21+
#include <string>
22+
#include <string_view>
23+
#include <system_error>
24+
#include <vector>
25+
26+
namespace {

// Zone whose TZif file must be present for a directory to count as a usable zoneinfo database.
constexpr std::string_view canonical_zone_name = "America/Los_Angeles";

// Locations where the system TZif database may be installed, in the order they are tried.
constexpr std::array<std::string_view, 3> candidate_tz_dirs{
  "/usr/share/zoneinfo",
  "/usr/lib/zoneinfo",
  "/etc/zoneinfo",
};

/**
 * @brief Locates a zoneinfo directory that contains the canonical zone file.
 *
 * The directory named by the `TZDIR` environment variable is tried first, followed by each entry
 * of `candidate_tz_dirs`. The search runs on the first call only; the result is stored in a
 * function-local static and returned by every later call.
 *
 * @return First directory in which `canonical_zone_name` is a regular file, or `std::nullopt`
 * if none qualifies
 */
std::optional<std::filesystem::path> find_system_tz_dir()
{
  namespace fs = std::filesystem;

  static auto const located = []() -> std::optional<fs::path> {
    // Non-throwing probe: filesystem errors simply count as "not usable".
    auto const has_canonical_zone = [](fs::path const& dir) {
      std::error_code ec;
      return fs::is_regular_file(dir / canonical_zone_name, ec);
    };

    if (char const* tzdir_env = std::getenv("TZDIR"); tzdir_env != nullptr) {
      fs::path const from_env{tzdir_env};
      if (has_canonical_zone(from_env)) { return from_env; }
    }
    for (auto const candidate : candidate_tz_dirs) {
      fs::path const dir{candidate};
      if (has_canonical_zone(dir)) { return dir; }
    }
    return std::nullopt;
  }();

  return located;
}

}  // namespace
57+
58+
// Fixture for the `cudf::make_timezone_transition_table` tests below; adds no state of its own.
class TimezoneTransitionTableTest : public cudf::test::BaseFixture {};
59+
60+
// "UTC" with no tzif directory supplied is expected to yield a table with no rows or columns.
TEST_F(TimezoneTransitionTableTest, UtcShortCircuitsWithoutReadingFile)
{
  auto const utc_table = cudf::make_timezone_transition_table(std::nullopt, "UTC");
  EXPECT_EQ(utc_table->num_rows(), 0);
  EXPECT_EQ(utc_table->num_columns(), 0);
}
66+
67+
// An empty zone name is expected to behave like "UTC": a table with no rows or columns.
TEST_F(TimezoneTransitionTableTest, EmptyZoneNameShortCircuitsWithoutReadingFile)
{
  auto const empty_name_table = cudf::make_timezone_transition_table(std::nullopt, "");
  EXPECT_EQ(empty_name_table->num_rows(), 0);
  EXPECT_EQ(empty_name_table->num_columns(), 0);
}
73+
74+
// Loads the canonical zone straight from the system database; skipped when none is installed.
TEST_F(TimezoneTransitionTableTest, CanonicalZoneProducesTwoColumnTable)
{
  auto const system_dir = find_system_tz_dir();
  if (!system_dir.has_value()) {
    GTEST_SKIP() << "No system zoneinfo directory with " << canonical_zone_name;
  }

  auto const transitions =
    cudf::make_timezone_transition_table(system_dir->string(), canonical_zone_name);
  ASSERT_EQ(transitions->num_columns(), 2);
  // The future-cycle entries alone should push the row count into the hundreds.
  EXPECT_GT(transitions->num_rows(), 100);
}
84+
85+
// A zone name that matches neither a file nor an alias must be reported as `cudf::logic_error`.
TEST_F(TimezoneTransitionTableTest, UnknownZoneThrows)
{
  auto const system_dir = find_system_tz_dir();
  if (!system_dir.has_value()) {
    GTEST_SKIP() << "No system zoneinfo directory with " << canonical_zone_name;
  }

  EXPECT_THROW(cudf::make_timezone_transition_table(system_dir->string(), "Not_A/Real_Zone_bXYZ"),
               cudf::logic_error);
}
93+
94+
class TimezoneAliasResolutionTest : public cudf::test::BaseFixture {
95+
protected:
96+
void SetUp() override
97+
{
98+
// make the directory name process-unique
99+
auto const tmpl =
100+
(std::filesystem::temp_directory_path() / (std::string{"cudf_tz_alias_test_"} + ".XXXXXX"))
101+
.string();
102+
std::vector<char> buf(tmpl.begin(), tmpl.end());
103+
buf.push_back('\0');
104+
ASSERT_NE(::mkdtemp(buf.data()), nullptr) << "mkdtemp failed: " << std::strerror(errno);
105+
tz_dir_ = buf.data();
106+
}
107+
108+
void TearDown() override
109+
{
110+
std::error_code ec;
111+
std::filesystem::remove_all(tz_dir_, ec);
112+
}
113+
114+
[[nodiscard]] bool install_zone(std::string_view zone_name) const
115+
{
116+
auto const src_dir = find_system_tz_dir();
117+
if (!src_dir) { return false; }
118+
119+
std::error_code ec;
120+
auto const dst = tz_dir_ / zone_name;
121+
std::filesystem::create_directories(dst.parent_path(), ec);
122+
if (ec) {
123+
ADD_FAILURE() << "create_directories(" << dst.parent_path() << ") failed: " << ec.message();
124+
return false;
125+
}
126+
std::filesystem::copy_file(
127+
*src_dir / canonical_zone_name, dst, std::filesystem::copy_options::overwrite_existing, ec);
128+
if (ec) {
129+
ADD_FAILURE() << "copy_file(" << (*src_dir / canonical_zone_name) << " -> " << dst
130+
<< ") failed: " << ec.message();
131+
return false;
132+
}
133+
return true;
134+
}
135+
136+
void write_tzdata_zi(std::string_view contents) const
137+
{
138+
std::ofstream{tz_dir_ / "tzdata.zi"} << contents;
139+
}
140+
141+
[[nodiscard]] std::string dir() const { return tz_dir_.string(); }
142+
143+
private:
144+
std::filesystem::path tz_dir_;
145+
};
146+
147+
// With the zone file present and no tzdata.zi at all, the plain lookup must still succeed.
TEST_F(TimezoneAliasResolutionTest, DirectLookupUnaffectedByNewFallback)
{
  bool const installed = install_zone(canonical_zone_name);
  if (!installed) { GTEST_SKIP() << "No system zoneinfo directory with " << canonical_zone_name; }

  auto const transitions = cudf::make_timezone_transition_table(dir(), canonical_zone_name);
  EXPECT_GT(transitions->num_rows(), 0);
  EXPECT_EQ(transitions->num_columns(), 2);
}
157+
158+
// An abbreviated `L` entry must make the alias load the same table as the canonical name.
TEST_F(TimezoneAliasResolutionTest, ResolvesShortFormLinkFromTzdataZi)
{
  bool const installed = install_zone(canonical_zone_name);
  if (!installed) { GTEST_SKIP() << "No system zoneinfo directory with " << canonical_zone_name; }

  write_tzdata_zi(
    "# synthetic tzdata.zi for libcudf tests\n"
    "L America/Los_Angeles US/Pacific\n");

  auto const expected = cudf::make_timezone_transition_table(dir(), canonical_zone_name);
  auto const actual   = cudf::make_timezone_transition_table(dir(), "US/Pacific");
  CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), actual->view());
}
171+
172+
// The spelled-out `Link` directive must be honoured exactly like the abbreviated `L` form.
TEST_F(TimezoneAliasResolutionTest, ResolvesLongFormLinkFromTzdataZi)
{
  bool const installed = install_zone(canonical_zone_name);
  if (!installed) { GTEST_SKIP() << "No system zoneinfo directory with " << canonical_zone_name; }

  write_tzdata_zi("Link America/Los_Angeles US/Pacific\n");

  auto const expected = cudf::make_timezone_transition_table(dir(), canonical_zone_name);
  auto const actual   = cudf::make_timezone_transition_table(dir(), "US/Pacific");
  CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), actual->view());
}
183+
184+
// Two hops are needed here: only the canonical zone file is installed, so the lookup has to go
// US/Pacific -> intermediate -> America/Los_Angeles before it reaches a file on disk.
TEST_F(TimezoneAliasResolutionTest, ResolvesChainedLinks)
{
  bool const installed = install_zone(canonical_zone_name);
  if (!installed) { GTEST_SKIP() << "No system zoneinfo directory with " << canonical_zone_name; }

  write_tzdata_zi(
    "L America/Los_Angeles intermediate\n"
    "L intermediate US/Pacific\n");

  auto const expected = cudf::make_timezone_transition_table(dir(), canonical_zone_name);
  auto const actual   = cudf::make_timezone_transition_table(dir(), "US/Pacific");
  CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), actual->view());
}
199+
200+
// The scratch directory holds a tzdata.zi but no zone files, so the link leads nowhere and the
// call must fail with `cudf::logic_error`.
TEST_F(TimezoneAliasResolutionTest, ThrowsWhenLinkTargetIsAlsoMissing)
{
  write_tzdata_zi("L Also/Missing US/Pacific\n");

  auto const scratch_dir = dir();
  EXPECT_THROW(cudf::make_timezone_transition_table(scratch_dir, "US/Pacific"),
               cudf::logic_error);
}
207+
208+
// An empty scratch directory (no zone file, no tzdata.zi) must produce `cudf::logic_error`.
TEST_F(TimezoneAliasResolutionTest, ThrowsWhenNoTzdataZiPresent)
{
  auto const scratch_dir = dir();
  EXPECT_THROW(cudf::make_timezone_transition_table(scratch_dir, "US/Pacific"),
               cudf::logic_error);
}
212+
213+
// Comments, blank lines and Rule/Zone entries surround an indented link; only the link may
// influence the result.
TEST_F(TimezoneAliasResolutionTest, IgnoresCommentsAndNonLinkDirectives)
{
  bool const installed = install_zone(canonical_zone_name);
  if (!installed) { GTEST_SKIP() << "No system zoneinfo directory with " << canonical_zone_name; }

  write_tzdata_zi(
    "# a leading comment\n"
    "\n"
    "R SomeRule 1970 o - Jan 1 0 0 S\n"        // Rule line: not a link, skipped
    "Z Fake/Zone 0 - LMT\n"                    // Zone line: not a link, skipped
    "   L America/Los_Angeles US/Pacific\n"    // the link itself, preceded by spaces
    "# trailing comment\n");

  auto const expected = cudf::make_timezone_transition_table(dir(), canonical_zone_name);
  auto const actual   = cudf::make_timezone_transition_table(dir(), "US/Pacific");
  CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), actual->view());
}
230+
231+
CUDF_TEST_PROGRAM_MAIN()

0 commit comments

Comments
 (0)