diff --git a/tree/ntuple/CMakeLists.txt b/tree/ntuple/CMakeLists.txt index 5ceb4e3115345..24ab0556adb13 100644 --- a/tree/ntuple/CMakeLists.txt +++ b/tree/ntuple/CMakeLists.txt @@ -121,6 +121,13 @@ if(daos OR daos_mock) endif() endif() +# Enable RNTuple support for S3-compatible object storage +if(curl) + set(ROOTNTuple_EXTRA_HEADERS ${ROOTNTuple_EXTRA_HEADERS} ROOT/RPageStorageS3.hxx) + target_sources(ROOTNTuple PRIVATE src/RPageStorageS3.cxx) + target_link_libraries(ROOTNTuple PRIVATE nlohmann_json::nlohmann_json) +endif() + if(MSVC) target_compile_definitions(ROOTNTuple PRIVATE _USE_MATH_DEFINES) endif() diff --git a/tree/ntuple/inc/ROOT/RPageStorageS3.hxx b/tree/ntuple/inc/ROOT/RPageStorageS3.hxx new file mode 100644 index 0000000000000..33632c11c3d07 --- /dev/null +++ b/tree/ntuple/inc/ROOT/RPageStorageS3.hxx @@ -0,0 +1,74 @@ +/// \file ROOT/RPageStorageS3.hxx +/// \author Jas Mehta +/// \date 2026-06-01 + +/************************************************************************* + * Copyright (C) 1995-2026, Rene Brun and Fons Rademakers. * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +#ifndef ROOT_RPageStorageS3 +#define ROOT_RPageStorageS3 + +#include +#include + +#include +#include + +namespace ROOT { +namespace Experimental { +namespace Internal { + +// clang-format off +/** +\class ROOT::Experimental::Internal::RNTupleAnchorS3 +\ingroup NTuple +\brief Entry point for an RNTuple stored in S3-compatible object storage. + +The anchor is serialized as a JSON object and stored at the base URL of the ntuple. +It contains the information needed to locate and read the header and footer envelopes. +The anchor is always the last object written during CommitDatasetImpl, ensuring atomicity: +if the anchor exists, the entire ntuple is complete. 
+*/ +// clang-format on +struct RNTupleAnchorS3 { + /// Version of the anchor JSON schema, allowing it to evolve; CreateFromJSON rejects any value other than this default + std::uint32_t fVersionAnchor = 0; + /// Version of the RNTuple binary format supported by the writer + std::uint16_t fVersionEpoch = RNTuple::kVersionEpoch; + std::uint16_t fVersionMajor = RNTuple::kVersionMajor; + std::uint16_t fVersionMinor = RNTuple::kVersionMinor; + std::uint16_t fVersionPatch = RNTuple::kVersionPatch; + /// Pattern for resolving object IDs to full S3 URLs. + /// ${baseurl} is replaced with the anchor URL, ${objid} with the numeric object ID. + std::string fUrlTemplate; + /// Object ID and byte offset of the compressed header within the S3 object + std::uint64_t fHeaderObjId = 0; + std::uint64_t fHeaderOffset = 0; + /// Compressed and uncompressed sizes, respectively, of the header envelope + std::uint64_t fNBytesHeader = 0; + std::uint64_t fLenHeader = 0; + /// Object ID and byte offset of the compressed footer within the S3 object + std::uint64_t fFooterObjId = 0; + std::uint64_t fFooterOffset = 0; + /// Compressed and uncompressed sizes, respectively, of the footer envelope + std::uint64_t fNBytesFooter = 0; + std::uint64_t fLenFooter = 0; + + /// Member-wise equality over all fields of the anchor + bool operator==(const RNTupleAnchorS3 &other) const; + + /// Serialize the anchor to a JSON string suitable for storage at the base URL + std::string ToJSON() const; + /// Deserialize the anchor from a JSON string. Returns an error if the text is not valid JSON, the anchor version is unsupported, or a field is missing or has the wrong type. 
+ static RResult CreateFromJSON(const std::string &json); +}; + +} // namespace Internal +} // namespace Experimental +} // namespace ROOT + +#endif diff --git a/tree/ntuple/src/RPageStorageS3.cxx b/tree/ntuple/src/RPageStorageS3.cxx new file mode 100644 index 0000000000000..0817eecc56a7c --- /dev/null +++ b/tree/ntuple/src/RPageStorageS3.cxx @@ -0,0 +1,99 @@ +/// \file RPageStorageS3.cxx +/// \author Jas Mehta +/// \date 2026-06-01 + +/************************************************************************* + * Copyright (C) 1995-2026, Rene Brun and Fons Rademakers. * + * All rights reserved. * + * * + * For the licensing terms see $ROOTSYS/LICENSE. * + * For the list of contributors see $ROOTSYS/README/CREDITS. * + *************************************************************************/ + +#include + +#include + +#include + +/// Field-by-field equality: true only if all 14 anchor members (5 version numbers, the URL template, and the 8 header/footer locators) compare equal. +/// Used to verify round-trip correctness in tests. +bool ROOT::Experimental::Internal::RNTupleAnchorS3::operator==(const RNTupleAnchorS3 &other) const +{ + return fVersionAnchor == other.fVersionAnchor && fVersionEpoch == other.fVersionEpoch && + fVersionMajor == other.fVersionMajor && fVersionMinor == other.fVersionMinor && + fVersionPatch == other.fVersionPatch && fUrlTemplate == other.fUrlTemplate && + fHeaderObjId == other.fHeaderObjId && fHeaderOffset == other.fHeaderOffset && + fNBytesHeader == other.fNBytesHeader && fLenHeader == other.fLenHeader && + fFooterObjId == other.fFooterObjId && fFooterOffset == other.fFooterOffset && + fNBytesFooter == other.fNBytesFooter && fLenFooter == other.fLenFooter; +} + +/// Serialize the anchor to a pretty-printed JSON string (2-space indent). +/// nlohmann/json handles type conversion, string escaping, and uint64 precision. +/// The member keys written here are the ones CreateFromJSON looks up when reading the anchor back. +/// The output is suitable for direct upload to S3 as the anchor object. 
+std::string ROOT::Experimental::Internal::RNTupleAnchorS3::ToJSON() const
+{
+   // Number of spaces per nesting level in the pretty-printed output.
+   constexpr int kIndentWidth = 2;
+
+   // Describe the whole JSON object in one expression: a list of {key, value} pairs
+   // is turned into a JSON object holding one member per anchor field.
+   const nlohmann::json serialized = {
+      {"anchorVersion", fVersionAnchor},
+      {"formatVersionEpoch", fVersionEpoch},
+      {"formatVersionMajor", fVersionMajor},
+      {"formatVersionMinor", fVersionMinor},
+      {"formatVersionPatch", fVersionPatch},
+      {"urlTemplate", fUrlTemplate},
+      {"headerObjId", fHeaderObjId},
+      {"headerOffset", fHeaderOffset},
+      {"nBytesHeader", fNBytesHeader},
+      {"lenHeader", fLenHeader},
+      {"footerObjId", fFooterObjId},
+      {"footerOffset", fFooterOffset},
+      {"nBytesFooter", fNBytesFooter},
+      {"lenFooter", fLenFooter},
+   };
+   return serialized.dump(kIndentWidth);
+}
+
+/// Build an anchor from its JSON text representation.
+/// The "anchorVersion" member is read and validated before anything else; a value other than the
+/// version this code writes makes the call fail right away. Every other member is then looked up
+/// with jsonAnchor.at(); a missing key or an unusable value is reported as an error result.
+ROOT::RResult<ROOT::Experimental::Internal::RNTupleAnchorS3>
+ROOT::Experimental::Internal::RNTupleAnchorS3::CreateFromJSON(const std::string &json)
+{
+   // Step 1: the text must be syntactically valid JSON.
+   nlohmann::json jsonAnchor;
+   try {
+      jsonAnchor = nlohmann::json::parse(json);
+   } catch (const nlohmann::json::parse_error &e) {
+      return R__FAIL("cannot parse S3 anchor JSON: " + std::string(e.what()));
+   }
+
+   // Reads the member `key` into the unsigned integer field `out`.
+   // nlohmann's get<T>() converts between number types with a plain static_cast, so a negative,
+   // fractional or too-large value would be silently wrapped or truncated instead of rejected.
+   // The anchor is read back from remote storage, so only a JSON unsigned integer that fits the
+   // destination field is accepted. Returns false and remembers the key otherwise; at() still
+   // throws when the key is missing or the document is not a JSON object.
+   const char *rejectedKey = nullptr;
+   const auto readUnsigned = [&jsonAnchor, &rejectedKey](const char *key, auto &out) {
+      const auto &value = jsonAnchor.at(key);
+      if (value.is_number_unsigned()) {
+         const auto wide = value.get<std::uint64_t>();
+         auto narrowed = out; // copied only to obtain the field's own integer type
+         narrowed = static_cast<decltype(narrowed)>(wide);
+         if (static_cast<std::uint64_t>(narrowed) == wide) {
+            out = narrowed;
+            return true;
+         }
+      }
+      rejectedKey = key;
+      return false;
+   };
+
+   RNTupleAnchorS3 anchor;
+
+   // Step 2: the schema version gates everything else, so it is read and checked on its own.
+   try {
+      if (!readUnsigned("anchorVersion", anchor.fVersionAnchor))
+         return R__FAIL("missing or invalid 'anchorVersion' in S3 anchor: expected an unsigned 32-bit integer");
+   } catch (const nlohmann::json::exception &e) {
+      return R__FAIL("missing or invalid 'anchorVersion' in S3 anchor: " + std::string(e.what()));
+   }
+
+   // The only supported schema version is the one a default-constructed anchor carries.
+   if (anchor.fVersionAnchor != RNTupleAnchorS3().fVersionAnchor)
+      return R__FAIL("unsupported S3 anchor version: " + std::to_string(anchor.fVersionAnchor));
+
+   // Step 3: every remaining member is mandatory. The && chain stops at the first rejected value.
+   try {
+      const bool numbersOk = readUnsigned("formatVersionEpoch", anchor.fVersionEpoch) &&
+                             readUnsigned("formatVersionMajor", anchor.fVersionMajor) &&
+                             readUnsigned("formatVersionMinor", anchor.fVersionMinor) &&
+                             readUnsigned("formatVersionPatch", anchor.fVersionPatch) &&
+                             readUnsigned("headerObjId", anchor.fHeaderObjId) &&
+                             readUnsigned("headerOffset", anchor.fHeaderOffset) &&
+                             readUnsigned("nBytesHeader", anchor.fNBytesHeader) &&
+                             readUnsigned("lenHeader", anchor.fLenHeader) &&
+                             readUnsigned("footerObjId", anchor.fFooterObjId) &&
+                             readUnsigned("footerOffset", anchor.fFooterOffset) &&
+                             readUnsigned("nBytesFooter", anchor.fNBytesFooter) &&
+                             readUnsigned("lenFooter", anchor.fLenFooter);
+      if (!numbersOk) {
+         return R__FAIL("missing or invalid field in S3 anchor: '" + std::string(rejectedKey) +
+                        "' is not an unsigned integer in the range of its field");
+      }
+      // get<std::string>() throws a type error for any non-string value.
+      anchor.fUrlTemplate = jsonAnchor.at("urlTemplate").get<std::string>();
+   } catch (const nlohmann::json::exception &e) {
+      return R__FAIL("missing or invalid field in S3 anchor: " + std::string(e.what()));
+   }
+
+   return anchor;
+}
diff --git a/tree/ntuple/test/CMakeLists.txt b/tree/ntuple/test/CMakeLists.txt index 34e8b373149be..7534cf4544efa 100644 --- a/tree/ntuple/test/CMakeLists.txt +++ b/tree/ntuple/test/CMakeLists.txt @@ -176,6 +176,10 @@ if(daos OR daos_mock) endif() endif() +if(curl) + 
ROOT_ADD_GTEST(ntuple_storage_s3 ntuple_storage_s3.cxx LIBRARIES ROOTNTuple) +endif() + # RNTuple Python interface tests if(pyroot) diff --git a/tree/ntuple/test/ntuple_storage_s3.cxx b/tree/ntuple/test/ntuple_storage_s3.cxx new file mode 100644 index 0000000000000..e145ef9f6eee0 --- /dev/null +++ b/tree/ntuple/test/ntuple_storage_s3.cxx @@ -0,0 +1,351 @@ +/// \file ntuple_storage_s3.cxx +/// \author Jas Mehta +/// \date 2026-06-01 +/// \brief Unit tests for the S3 storage backend components (anchor serialization). + +#include "ntuple_test.hxx" +#include + +using RNTupleAnchorS3 = ROOT::Experimental::Internal::RNTupleAnchorS3; + +// ==================== RNTupleAnchorS3 Tests ==================== + +// A fully populated anchor must come back unchanged from ToJSON followed by CreateFromJSON. +TEST(RNTupleAnchorS3, RoundTrip) +{ + RNTupleAnchorS3 orig; + orig.fVersionAnchor = 0; + orig.fVersionEpoch = 1; + orig.fVersionMajor = 0; + orig.fVersionMinor = 2; + orig.fVersionPatch = 0; + orig.fUrlTemplate = "https://bucket.s3.us-east-1.amazonaws.com/data/${objid}"; + orig.fHeaderObjId = 1; + orig.fHeaderOffset = 0; + orig.fNBytesHeader = 1200; + orig.fLenHeader = 4096; + orig.fFooterObjId = 42; + orig.fFooterOffset = 0; + orig.fNBytesFooter = 800; + orig.fLenFooter = 2048; + + auto json = orig.ToJSON(); + EXPECT_FALSE(json.empty()); + + auto result = RNTupleAnchorS3::CreateFromJSON(json); + ASSERT_TRUE(bool(result)) << result.GetError()->GetReport(); + const auto &parsed = result.Inspect(); + + EXPECT_EQ(orig, parsed); + EXPECT_EQ(0u, parsed.fVersionAnchor); + EXPECT_EQ(1u, parsed.fVersionEpoch); + EXPECT_EQ(0u, parsed.fVersionMajor); + EXPECT_EQ(2u, parsed.fVersionMinor); + EXPECT_EQ(0u, parsed.fVersionPatch); + EXPECT_EQ("https://bucket.s3.us-east-1.amazonaws.com/data/${objid}", parsed.fUrlTemplate); + EXPECT_EQ(1u, parsed.fHeaderObjId); + EXPECT_EQ(0u, parsed.fHeaderOffset); + EXPECT_EQ(1200u, parsed.fNBytesHeader); + EXPECT_EQ(4096u, parsed.fLenHeader); + EXPECT_EQ(42u, parsed.fFooterObjId); + EXPECT_EQ(0u, parsed.fFooterOffset); + EXPECT_EQ(800u, 
parsed.fNBytesFooter); + EXPECT_EQ(2048u, parsed.fLenFooter); +} + +TEST(RNTupleAnchorS3, UnsupportedVersion) +{ + std::string json = R"({"anchorVersion": 99, "formatVersionEpoch": 1})"; + auto result = RNTupleAnchorS3::CreateFromJSON(json); + EXPECT_FALSE(bool(result)); +} + +TEST(RNTupleAnchorS3, MissingField) +{ + // Valid JSON but missing all four footer fields (footerObjId, footerOffset, nBytesFooter, lenFooter) + std::string json = R"({ + "anchorVersion": 0, + "formatVersionEpoch": 1, + "formatVersionMajor": 0, + "formatVersionMinor": 2, + "formatVersionPatch": 0, + "urlTemplate": "test", + "headerObjId": 1, + "headerOffset": 0, + "nBytesHeader": 100, + "lenHeader": 200 + })"; + auto result = RNTupleAnchorS3::CreateFromJSON(json); + EXPECT_FALSE(bool(result)); +} + +// A double quote inside the URL template must survive the JSON round trip. +TEST(RNTupleAnchorS3, SpecialCharsInUrl) +{ + RNTupleAnchorS3 orig; + orig.fUrlTemplate = "https://example.com/path/with\"quotes/${objid}"; + orig.fHeaderObjId = 1; + orig.fNBytesHeader = 100; + orig.fLenHeader = 200; + orig.fFooterObjId = 2; + orig.fNBytesFooter = 50; + orig.fLenFooter = 100; + + auto json = orig.ToJSON(); + auto result = RNTupleAnchorS3::CreateFromJSON(json); + ASSERT_TRUE(bool(result)) << result.GetError()->GetReport(); + EXPECT_EQ(orig.fUrlTemplate, result.Inspect().fUrlTemplate); +} + +// Inputs that are not valid JSON text, including empty and whitespace-only strings, must yield an error. +TEST(RNTupleAnchorS3, MalformedJson) +{ + auto result = RNTupleAnchorS3::CreateFromJSON("not json at all"); + EXPECT_FALSE(bool(result)); + + result = RNTupleAnchorS3::CreateFromJSON("{incomplete"); + EXPECT_FALSE(bool(result)); + + result = RNTupleAnchorS3::CreateFromJSON(""); + EXPECT_FALSE(bool(result)); + + result = RNTupleAnchorS3::CreateFromJSON(" "); + EXPECT_FALSE(bool(result)); +} + +TEST(RNTupleAnchorS3, ExtraFieldsIgnored) +{ + RNTupleAnchorS3 orig; + orig.fUrlTemplate = "${baseurl}/${objid}"; + orig.fHeaderObjId = 1; + orig.fNBytesHeader = 500; + orig.fLenHeader = 1000; + orig.fFooterObjId = 10; + orig.fNBytesFooter = 300; + orig.fLenFooter = 600; + + auto json = orig.ToJSON(); + // Inject an unknown field before the closing brace + auto pos = 
json.rfind('}'); + json.insert(pos, ",\n \"future_field\": 999"); + + auto result = RNTupleAnchorS3::CreateFromJSON(json); + ASSERT_TRUE(bool(result)) << result.GetError()->GetReport(); + EXPECT_EQ(orig, result.Inspect()); +} + +TEST(RNTupleAnchorS3, LargeObjectIds) +{ + RNTupleAnchorS3 orig; + orig.fUrlTemplate = "${baseurl}/${objid}"; + orig.fHeaderObjId = 4294967296ULL; // 2^32 -- beyond uint32 range + orig.fHeaderOffset = 0; + orig.fNBytesHeader = 100; + orig.fLenHeader = 200; + orig.fFooterObjId = 9007199254740993ULL; // 2^53 + 1 -- not exactly representable as a double + orig.fFooterOffset = 1099511627776ULL; // 2^40 + orig.fNBytesFooter = 50; + orig.fLenFooter = 100; + + auto json = orig.ToJSON(); + auto result = RNTupleAnchorS3::CreateFromJSON(json); + ASSERT_TRUE(bool(result)) << result.GetError()->GetReport(); + const auto &parsed = result.Inspect(); + EXPECT_EQ(4294967296ULL, parsed.fHeaderObjId); + EXPECT_EQ(9007199254740993ULL, parsed.fFooterObjId); + EXPECT_EQ(1099511627776ULL, parsed.fFooterOffset); +} + +// Numeric fields left at their default of zero must round-trip as zero. +TEST(RNTupleAnchorS3, DefaultValues) +{ + RNTupleAnchorS3 orig; + orig.fUrlTemplate = "${baseurl}/${objid}"; + + auto json = orig.ToJSON(); + auto result = RNTupleAnchorS3::CreateFromJSON(json); + ASSERT_TRUE(bool(result)) << result.GetError()->GetReport(); + const auto &parsed = result.Inspect(); + EXPECT_EQ(0u, parsed.fHeaderObjId); + EXPECT_EQ(0u, parsed.fNBytesHeader); + EXPECT_EQ(0u, parsed.fLenHeader); + EXPECT_EQ(0u, parsed.fFooterObjId); + EXPECT_EQ(0u, parsed.fNBytesFooter); + EXPECT_EQ(0u, parsed.fLenFooter); +} + +// Backslashes inside the URL template must survive the JSON round trip. +TEST(RNTupleAnchorS3, BackslashInUrl) +{ + RNTupleAnchorS3 orig; + orig.fUrlTemplate = "C:\\Users\\data\\${objid}"; + orig.fHeaderObjId = 1; + orig.fNBytesHeader = 100; + orig.fLenHeader = 200; + orig.fFooterObjId = 2; + orig.fNBytesFooter = 50; + orig.fLenFooter = 100; + + auto json = orig.ToJSON(); + auto result = RNTupleAnchorS3::CreateFromJSON(json); + ASSERT_TRUE(bool(result)) << result.GetError()->GetReport(); + 
EXPECT_EQ("C:\\Users\\data\\${objid}", result.Inspect().fUrlTemplate); +} + +TEST(RNTupleAnchorS3, MissingAnchorVersion) +{ + std::string json = R"({ + "formatVersionEpoch": 1, + "formatVersionMajor": 0, + "formatVersionMinor": 0, + "formatVersionPatch": 0, + "urlTemplate": "test", + "headerObjId": 1, + "headerOffset": 0, + "nBytesHeader": 100, + "lenHeader": 200, + "footerObjId": 2, + "footerOffset": 0, + "nBytesFooter": 50, + "lenFooter": 100 + })"; + auto result = RNTupleAnchorS3::CreateFromJSON(json); + EXPECT_FALSE(bool(result)); +} + +TEST(RNTupleAnchorS3, Equality) +{ + RNTupleAnchorS3 a; + a.fUrlTemplate = "${baseurl}/${objid}"; + a.fHeaderObjId = 1; + a.fNBytesHeader = 100; + a.fLenHeader = 200; + a.fFooterObjId = 2; + a.fNBytesFooter = 50; + a.fLenFooter = 100; + + RNTupleAnchorS3 b = a; + EXPECT_EQ(a, b); + + b.fHeaderObjId = 99; + EXPECT_FALSE(a == b); +} + +TEST(RNTupleAnchorS3, ToJSONProducesValidJson) +{ + RNTupleAnchorS3 anchor; + anchor.fUrlTemplate = "${baseurl}/${objid}"; + anchor.fHeaderObjId = 5; + anchor.fNBytesHeader = 500; + anchor.fLenHeader = 1000; + anchor.fFooterObjId = 10; + anchor.fNBytesFooter = 300; + anchor.fLenFooter = 600; + + auto json = anchor.ToJSON(); + + // Basic structural checks only (outer braces and presence of expected keys); this does not run a JSON parser + EXPECT_EQ('{', json.front()); + EXPECT_EQ('}', json.back()); + EXPECT_NE(std::string::npos, json.find("\"anchorVersion\"")); + EXPECT_NE(std::string::npos, json.find("\"formatVersionEpoch\"")); + EXPECT_NE(std::string::npos, json.find("\"urlTemplate\"")); + EXPECT_NE(std::string::npos, json.find("\"headerObjId\"")); + EXPECT_NE(std::string::npos, json.find("\"footerObjId\"")); + EXPECT_NE(std::string::npos, json.find("\"nBytesHeader\"")); + EXPECT_NE(std::string::npos, json.find("\"lenHeader\"")); + EXPECT_NE(std::string::npos, json.find("\"nBytesFooter\"")); + EXPECT_NE(std::string::npos, json.find("\"lenFooter\"")); +} + +TEST(RNTupleAnchorS3, NewlinesAndTabsInUrl) +{ + RNTupleAnchorS3 orig; + orig.fUrlTemplate = 
"https://example.com/path\twith\ttabs\nand\nnewlines/${objid}"; + orig.fHeaderObjId = 1; + orig.fNBytesHeader = 100; + orig.fLenHeader = 200; + orig.fFooterObjId = 2; + orig.fNBytesFooter = 50; + orig.fLenFooter = 100; + + auto json = orig.ToJSON(); + // Extract the raw (still escaped) urlTemplate value from the JSON text and verify that tabs/newlines + // show up as the escape sequences \t and \n + auto urlPos = json.find("\"urlTemplate\""); + ASSERT_NE(std::string::npos, urlPos); + auto colonPos = json.find(':', urlPos); + auto openQuote = json.find('"', colonPos + 1); + auto closeQuote = openQuote + 1; + while (closeQuote < json.size() && json[closeQuote] != '"') { + if (json[closeQuote] == '\\') + ++closeQuote; // skip the character after a backslash so an escaped quote does not end the scan + ++closeQuote; + } + std::string rawUrlValue = json.substr(openQuote + 1, closeQuote - openQuote - 1); + // The escape sequences must be present (the absence of literal control chars is not asserted here) + EXPECT_NE(std::string::npos, rawUrlValue.find("\\t")); + EXPECT_NE(std::string::npos, rawUrlValue.find("\\n")); + + auto result = RNTupleAnchorS3::CreateFromJSON(json); + ASSERT_TRUE(bool(result)) << result.GetError()->GetReport(); + EXPECT_EQ(orig.fUrlTemplate, result.Inspect().fUrlTemplate); +} + +TEST(RNTupleAnchorS3, WrongFieldType) +{ + // anchorVersion is a string instead of an integer + std::string json = R"({ + "anchorVersion": "not_a_number", + "formatVersionEpoch": 1 + })"; + auto result = RNTupleAnchorS3::CreateFromJSON(json); + EXPECT_FALSE(bool(result)); +} + +// An empty URL template is still a valid string value and must round-trip. +TEST(RNTupleAnchorS3, EmptyUrlTemplate) +{ + RNTupleAnchorS3 orig; + orig.fUrlTemplate = ""; + orig.fHeaderObjId = 1; + orig.fNBytesHeader = 100; + orig.fLenHeader = 200; + orig.fFooterObjId = 2; + orig.fNBytesFooter = 50; + orig.fLenFooter = 100; + + auto json = orig.ToJSON(); + auto result = RNTupleAnchorS3::CreateFromJSON(json); + ASSERT_TRUE(bool(result)) << result.GetError()->GetReport(); + EXPECT_EQ("", result.Inspect().fUrlTemplate); +} + +TEST(RNTupleAnchorS3, JsonArray) +{ + // Valid JSON but wrong type (array, 
not object) + auto result = RNTupleAnchorS3::CreateFromJSON("[1, 2, 3]"); + EXPECT_FALSE(bool(result)); +} + +TEST(RNTupleAnchorS3, MaxUint64Values) +{ + // All eight uint64 fields are set to UINT64_MAX and must round-trip without loss + RNTupleAnchorS3 orig; + orig.fUrlTemplate = "${baseurl}/${objid}"; + orig.fHeaderObjId = UINT64_MAX; + orig.fHeaderOffset = UINT64_MAX; + orig.fNBytesHeader = UINT64_MAX; + orig.fLenHeader = UINT64_MAX; + orig.fFooterObjId = UINT64_MAX; + orig.fFooterOffset = UINT64_MAX; + orig.fNBytesFooter = UINT64_MAX; + orig.fLenFooter = UINT64_MAX; + + auto json = orig.ToJSON(); + auto result = RNTupleAnchorS3::CreateFromJSON(json); + ASSERT_TRUE(bool(result)) << result.GetError()->GetReport(); + const auto &parsed = result.Inspect(); + EXPECT_EQ(UINT64_MAX, parsed.fHeaderObjId); + EXPECT_EQ(UINT64_MAX, parsed.fHeaderOffset); + EXPECT_EQ(UINT64_MAX, parsed.fNBytesHeader); + EXPECT_EQ(UINT64_MAX, parsed.fLenHeader); + EXPECT_EQ(UINT64_MAX, parsed.fFooterObjId); + EXPECT_EQ(UINT64_MAX, parsed.fFooterOffset); + EXPECT_EQ(UINT64_MAX, parsed.fNBytesFooter); + EXPECT_EQ(UINT64_MAX, parsed.fLenFooter); +}