diff --git a/CHANGELOG.md b/CHANGELOG.md index 1434fd45e..9965b75a4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] +### Added + - Add OAI-PMH ## [[1.0.5]](https://github.com/thoth-pub/thoth/releases/tag/v1.0.5) - 2026-04-15 ### Added diff --git a/Cargo.lock b/Cargo.lock index 3d7313c38..51688dea1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -64,7 +64,7 @@ dependencies = [ "mime", "percent-encoding", "pin-project-lite", - "rand 0.9.2", + "rand 0.9.3", "sha1", "smallvec", "tokio", @@ -316,9 +316,9 @@ checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c" [[package]] name = "arc-swap" -version = "1.9.0" +version = "1.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a07d1f37ff60921c83bdfc7407723bdefe89b44b98a9b772f225c8f9d67141a6" +checksum = "6a3a1fd6f75306b68087b831f025c712524bcb19aad54e557b1129cfa0a2b207" dependencies = [ "rustversion", ] @@ -483,9 +483,9 @@ dependencies = [ [[package]] name = "aws-sdk-s3" -version = "1.128.0" +version = "1.129.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "99304b64672e0d81a3c100a589b93d9ef5e9c0ce12e21c848fd39e50f493c2a1" +checksum = "6d4e8410fadbc0ee453145dd77a4958227b18b05bf67c2795d0a8b8596c9aa0f" dependencies = [ "aws-credential-types", "aws-runtime", @@ -998,9 +998,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.2.58" +version = "1.2.60" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e1e928d4b69e3077709075a938a05ffbedfa53a84c8f766efbf8220bb1ff60e1" +checksum = "43c5703da9466b66a946814e1adf53ea2c90f10063b86290cc9eb67ce3478a20" dependencies = [ "find-msvc-tools", "jobserver", @@ -1017,6 +1017,12 @@ dependencies = [ "regex", ] +[[package]] +name = "cesu8" +version = "1.1.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d43a04d8753f35258c91f8ec639f792891f748a1edbd759cf1dcea3382ad83c" + [[package]] name = "cfg-if" version = "1.0.4" @@ -1331,12 +1337,12 @@ checksum = "4f8a51dd197fa6ba5b4dc98a990a43cc13693c23eb0089ebb0fcc1f04152bca6" [[package]] name = "darling" -version = "0.21.3" +version = "0.20.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9cdf337090841a411e2a7f3deb9187445851f91b309c0c0a29e05f74a00a48c0" +checksum = "fc7f46116c46ff9ab3eb1597a45688b6715c6e628b5c133e288e709a29bcb4ee" dependencies = [ - "darling_core 0.21.3", - "darling_macro 0.21.3", + "darling_core 0.20.11", + "darling_macro 0.20.11", ] [[package]] @@ -1351,9 +1357,9 @@ dependencies = [ [[package]] name = "darling_core" -version = "0.21.3" +version = "0.20.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1247195ecd7e3c85f83c8d2a366e4210d588e802133e1e355180a9870b517ea4" +checksum = "0d00b9596d185e565c2207a0b01f8bd1a135483d02d9b7b0a54b11da8d53412e" dependencies = [ "fnv", "ident_case", @@ -1378,11 +1384,11 @@ dependencies = [ [[package]] name = "darling_macro" -version = "0.21.3" +version = "0.20.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d38308df82d1080de0afee5d069fa14b0326a88c14f15c5ccda35b4a6c414c81" +checksum = "fc34b93ccb385b40dc71c6fceac4b2ad23662c7eeb248cf10d529b7e055b6ead" dependencies = [ - "darling_core 0.21.3", + "darling_core 0.20.11", "quote", "syn 2.0.117", ] @@ -1520,15 +1526,14 @@ dependencies = [ [[package]] name = "diesel" -version = "2.3.7" +version = "2.2.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f4ae09a41a4b89f94ec1e053623da8340d996bc32c6517d325a9daad9b239358" +checksum = "470eb10efc8646313634c99bb1593f402a6434cbd86e266770c6e39219adb86a" dependencies = [ "bitflags 2.11.0", "byteorder", "chrono", "diesel_derives", - "downcast-rs", "itoa", "pq-sys", "r2d2", @@ -1561,9 +1566,9 @@ 
dependencies = [ [[package]] name = "diesel_derives" -version = "2.3.7" +version = "2.2.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "47618bf0fac06bb670c036e48404c26a865e6a71af4114dfd97dfe89936e404e" +checksum = "1b96984c469425cb577bf6f17121ecb3e4fe1e81de5d8f780dd372802858d756" dependencies = [ "diesel_table_macro_syntax", "dsl_auto_type", @@ -1574,9 +1579,9 @@ dependencies = [ [[package]] name = "diesel_migrations" -version = "2.3.1" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "745fd255645f0f1135f9ec55c7b00e0882192af9683ab4731e4bba3da82b8f9c" +checksum = "8a73ce704bad4231f001bff3314d91dce4aba0770cee8b233991859abc15c1f6" dependencies = [ "diesel", "migrations_internals", @@ -1585,9 +1590,9 @@ dependencies = [ [[package]] name = "diesel_table_macro_syntax" -version = "0.3.0" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fe2444076b48641147115697648dc743c2c00b61adade0f01ce67133c7babe8c" +checksum = "209c735641a413bc68c4923a9d6ad4bcb3ca306b794edaa7eb0b3228a99ffb25" dependencies = [ "syn 2.0.117", ] @@ -1621,19 +1626,13 @@ version = "0.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "77c90badedccf4105eca100756a0b1289e191f6fcbdadd3cee1d2f614f97da8f" -[[package]] -name = "downcast-rs" -version = "2.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "117240f60069e65410b3ae1bb213295bd828f707b5bec6596a1afc8793ce0cbc" - [[package]] name = "dsl_auto_type" -version = "0.2.0" +version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd122633e4bef06db27737f21d3738fb89c8f6d5360d6d9d7635dda142a7757e" +checksum = "139ae9aca7527f85f26dd76483eb38533fd84bd571065da1739656ef71c5ff5b" dependencies = [ - "darling 0.21.3", + "darling 0.20.11", "either", "heck 0.5.0", "proc-macro2", @@ -1827,9 +1826,9 @@ dependencies = [ [[package]] name = "fastrand" 
-version = "2.3.0" +version = "2.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" +checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6" [[package]] name = "ff" @@ -2196,7 +2195,7 @@ dependencies = [ "futures-sink", "futures-util", "http 0.2.12", - "indexmap 2.13.0", + "indexmap 2.14.0", "slab", "tokio", "tokio-util", @@ -2215,7 +2214,7 @@ dependencies = [ "futures-core", "futures-sink", "http 1.4.0", - "indexmap 2.13.0", + "indexmap 2.14.0", "slab", "tokio", "tokio-util", @@ -2248,6 +2247,12 @@ dependencies = [ "foldhash 0.2.0", ] +[[package]] +name = "hashbrown" +version = "0.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4f467dd6dccf739c208452f8014c75c18bb8301b050ad1cfb27153803edb0f51" + [[package]] name = "heck" version = "0.4.1" @@ -2395,16 +2400,15 @@ dependencies = [ [[package]] name = "hyper-rustls" -version = "0.27.7" +version = "0.27.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3c93eb611681b207e1fe55d5a71ecf91572ec8a6705cdb6857f7d8d5242cf58" +checksum = "c2b52f86d1d4bc0d6b4e6826d960b1b333217e07d36b882dca570a5e1c48895b" dependencies = [ "http 1.4.0", "hyper", "hyper-util", "rustls", "rustls-native-certs", - "rustls-pki-types", "tokio", "tokio-rustls", "tower-service", @@ -2623,12 +2627,12 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.13.0" +version = "2.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7714e70437a7dc3ac8eb7e6f8df75fd8eb422675fc7678aff7364301092b1017" +checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9" dependencies = [ "equivalent", - "hashbrown 0.16.1", + "hashbrown 0.17.0", "serde", "serde_core", ] @@ -2726,6 +2730,50 @@ dependencies = [ "syn 2.0.117", ] +[[package]] +name = "jni" +version = "0.21.1" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "1a87aa2bb7d2af34197c04845522473242e1aa17c12f4935d5856491a7fb8c97" +dependencies = [ + "cesu8", + "cfg-if", + "combine", + "jni-sys 0.3.1", + "log", + "thiserror 1.0.69", + "walkdir", + "windows-sys 0.45.0", +] + +[[package]] +name = "jni-sys" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41a652e1f9b6e0275df1f15b32661cf0d4b78d4d87ddec5e0c3c20f097433258" +dependencies = [ + "jni-sys 0.4.1", +] + +[[package]] +name = "jni-sys" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c6377a88cb3910bee9b0fa88d4f42e1d2da8e79915598f65fb0c7ee14c878af2" +dependencies = [ + "jni-sys-macros", +] + +[[package]] +name = "jni-sys-macros" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38c0b942f458fe50cdac086d2f946512305e5631e720728f2a61aabcd47a6264" +dependencies = [ + "quote", + "syn 2.0.117", +] + [[package]] name = "jobserver" version = "0.1.34" @@ -2738,9 +2786,9 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.94" +version = "0.3.95" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e04e2ef80ce82e13552136fabeef8a5ed1f985a96805761cbb9a2c34e7664d9" +checksum = "2964e92d1d9dc3364cae4d718d93f227e3abb088e747d92e0395bfdedf1c12ca" dependencies = [ "cfg-if", "futures-util", @@ -2777,7 +2825,7 @@ dependencies = [ "fnv", "futures", "graphql-parser", - "indexmap 2.13.0", + "indexmap 2.14.0", "juniper_codegen", "serde", "smartstring", @@ -2821,9 +2869,9 @@ checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2" [[package]] name = "libc" -version = "0.2.184" +version = "0.2.185" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "48f5d2a454e16a5ea0f4ced81bd44e4cfc7bd3a507b61887c99fd3538b28e4af" +checksum = "52ff2c0fe9bc6cb6b14a0592c2ff4fa9ceb83eea9db979b0487cd054946a2b8f" [[package]] name = "libm" @@ -2943,9 +2991,9 @@ checksum = 
"f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" [[package]] name = "migrations_internals" -version = "2.3.0" +version = "2.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "36c791ecdf977c99f45f23280405d7723727470f6689a5e6dbf513ac547ae10d" +checksum = "3bda1634d70d5bd53553cf15dca9842a396e8c799982a3ad22998dc44d961f24" dependencies = [ "serde", "toml", @@ -2953,9 +3001,9 @@ dependencies = [ [[package]] name = "migrations_macros" -version = "2.3.0" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "36fc5ac76be324cfd2d3f2cf0fdf5d5d3c4f14ed8aaebadb09e304ba42282703" +checksum = "ffb161cc72176cb37aa47f1fc520d3ef02263d67d661f44f05d05a079e1237fd" dependencies = [ "migrations_internals", "proc-macro2", @@ -3101,6 +3149,23 @@ dependencies = [ "libc", ] +[[package]] +name = "oai-pmh-rs" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34323bda5080698bb8595cf2019117a955ba8fdcb7fe8cf349d2ed4982b2957e" +dependencies = [ + "actix-web", + "async-trait", + "base64 0.22.1", + "chrono", + "flate2", + "quick-xml 0.39.2", + "serde", + "serde_json", + "url", +] + [[package]] name = "oauth2" version = "5.0.0" @@ -3112,7 +3177,7 @@ dependencies = [ "getrandom 0.2.17", "http 1.4.0", "rand 0.8.5", - "reqwest", + "reqwest 0.12.28", "serde", "serde_json", "serde_path_to_error", @@ -3139,7 +3204,7 @@ version = "2.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f21aa89c0b45d63c9a4976b0de5dcf4e041defc2cd9720820f0012f0046a0bc" dependencies = [ - "indexmap 2.13.0", + "indexmap 2.14.0", "serde", "serde_json", ] @@ -3177,9 +3242,9 @@ dependencies = [ [[package]] name = "openssl" -version = "0.10.76" +version = "0.10.77" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "951c002c75e16ea2c65b8c7e4d3d51d5530d8dfa7d060b4776828c88cfb18ecf" +checksum = 
"bfe4646e360ec77dff7dde40ed3d6c5fee52d156ef4a62f53973d38294dad87f" dependencies = [ "bitflags 2.11.0", "cfg-if", @@ -3209,9 +3274,9 @@ checksum = "7c87def4c32ab89d880effc9e097653c8da5d6ef28e6b539d313baaacfbafcbe" [[package]] name = "openssl-sys" -version = "0.9.112" +version = "0.9.113" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57d55af3b3e226502be1526dfdba67ab0e9c96fc293004e79576b2b9edb0dbdb" +checksum = "ad2f2c0eba47118757e4c6d2bff2838f3e0523380021356e7875e858372ce644" dependencies = [ "cc", "libc", @@ -3464,7 +3529,7 @@ checksum = "8701b58ea97060d5e5b155d383a69952a60943f0e6dfe30b04c287beb0b27455" dependencies = [ "fixedbitset", "hashbrown 0.15.5", - "indexmap 2.13.0", + "indexmap 2.14.0", ] [[package]] @@ -3622,9 +3687,9 @@ dependencies = [ [[package]] name = "pkg-config" -version = "0.3.32" +version = "0.3.33" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" +checksum = "19f132c84eca552bf34cab8ec81f1c1dcc229b811638f9d283dceabe58c5569e" [[package]] name = "portable-atomic" @@ -3748,7 +3813,7 @@ version = "0.14.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "343d3bd7056eda839b03204e68deff7d1b13aba7af2b2fd16890697274262ee7" dependencies = [ - "heck 0.4.1", + "heck 0.5.0", "itertools 0.14.0", "log", "multimap", @@ -3811,6 +3876,15 @@ dependencies = [ "memchr", ] +[[package]] +name = "quick-xml" +version = "0.39.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "958f21e8e7ceb5a1aa7fa87fab28e7c75976e0bfe7e23ff069e0a260f894067d" +dependencies = [ + "memchr", +] + [[package]] name = "quinn" version = "0.11.9" @@ -3837,10 +3911,11 @@ version = "0.11.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "434b42fec591c96ef50e21e886936e66d3cc3f737104fdb9b737c40ffb94c098" dependencies = [ + "aws-lc-rs", "bytes", "getrandom 0.3.4", "lru-slab", - "rand 0.9.2", 
+ "rand 0.9.3", "ring", "rustc-hash", "rustls", @@ -3911,9 +3986,9 @@ dependencies = [ [[package]] name = "rand" -version = "0.9.2" +version = "0.9.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1" +checksum = "7ec095654a25171c2124e9e3393a930bddbffdc939556c914957a4c3e0a87166" dependencies = [ "rand_chacha 0.9.0", "rand_core 0.9.5", @@ -4095,6 +4170,46 @@ dependencies = [ "webpki-roots", ] +[[package]] +name = "reqwest" +version = "0.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab3f43e3283ab1488b624b44b0e988d0acea0b3214e694730a055cb6b2efa801" +dependencies = [ + "base64 0.22.1", + "bytes", + "encoding_rs", + "futures-core", + "h2 0.4.13", + "http 1.4.0", + "http-body 1.0.1", + "http-body-util", + "hyper", + "hyper-rustls", + "hyper-util", + "js-sys", + "log", + "mime", + "percent-encoding", + "pin-project-lite", + "quinn", + "rustls", + "rustls-pki-types", + "rustls-platform-verifier", + "serde", + "serde_json", + "sync_wrapper", + "tokio", + "tokio-rustls", + "tower", + "tower-http", + "tower-service", + "url", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", +] + [[package]] name = "reqwest-middleware" version = "0.4.2" @@ -4104,7 +4219,7 @@ dependencies = [ "anyhow", "async-trait", "http 1.4.0", - "reqwest", + "reqwest 0.12.28", "serde", "thiserror 1.0.69", "tower-service", @@ -4123,7 +4238,7 @@ dependencies = [ "http 1.4.0", "hyper", "parking_lot 0.11.2", - "reqwest", + "reqwest 0.12.28", "reqwest-middleware", "retry-policies", "thiserror 1.0.69", @@ -4235,9 +4350,9 @@ dependencies = [ [[package]] name = "rustls" -version = "0.23.37" +version = "0.23.38" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "758025cb5fccfd3bc2fd74708fd4682be41d99e5dff73c377c0646c6012c73a4" +checksum = "69f9466fb2c14ea04357e91413efb882e2a6d4a406e625449bc0a5d360d53a21" dependencies = [ "aws-lc-rs", "log", @@ 
-4271,11 +4386,38 @@ dependencies = [ "zeroize", ] +[[package]] +name = "rustls-platform-verifier" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d99feebc72bae7ab76ba994bb5e121b8d83d910ca40b36e0921f53becc41784" +dependencies = [ + "core-foundation 0.10.1", + "core-foundation-sys", + "jni", + "log", + "once_cell", + "rustls", + "rustls-native-certs", + "rustls-platform-verifier-android", + "rustls-webpki", + "security-framework", + "security-framework-sys", + "webpki-root-certs", + "windows-sys 0.61.2", +] + +[[package]] +name = "rustls-platform-verifier-android" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f87165f0995f63a9fbeea62b64d10b4d9d8e78ec6d7d51fb2125fda7bb36788f" + [[package]] name = "rustls-webpki" -version = "0.103.10" +version = "0.103.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df33b2b81ac578cabaf06b89b0631153a3f416b0a886e8a7a1707fb51abbd1ef" +checksum = "20a6af516fea4b20eccceaf166e8aa666ac996208e8a644ce3ef5aa783bc7cd4" dependencies = [ "aws-lc-rs", "ring", @@ -4295,6 +4437,15 @@ version = "1.0.23" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f" +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + [[package]] name = "schannel" version = "0.1.29" @@ -4431,9 +4582,9 @@ dependencies = [ [[package]] name = "semver" -version = "1.0.27" +version = "1.0.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d767eb0aabc880b29956c35734170f26ed551a859dbd361d140cdbeca61ab1e2" +checksum = "8a7852d02fc848982e0c167ef163aaff9cd91dc640ba85e263cb1ce46fae51cd" [[package]] name = "serde" @@ -4539,7 +4690,7 @@ dependencies = [ 
"chrono", "hex", "indexmap 1.9.3", - "indexmap 2.13.0", + "indexmap 2.14.0", "schemars 0.9.0", "schemars 1.2.1", "serde_core", @@ -4566,7 +4717,7 @@ version = "0.9.34+deprecated" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6a8b1a1a2ebf674015cc02edccce75287f1a0130d394307b36743c2f5d504b47" dependencies = [ - "indexmap 2.13.0", + "indexmap 2.14.0", "itoa", "ryu", "serde", @@ -4980,6 +5131,7 @@ dependencies = [ "thoth-api-server", "thoth-errors", "thoth-export-server", + "thoth-oai-server", "tokio", "zitadel", ] @@ -5010,8 +5162,8 @@ dependencies = [ "lazy_static", "log", "pulldown-cmark", - "quick-xml", - "rand 0.9.2", + "quick-xml 0.36.2", + "rand 0.9.3", "regex", "scraper", "serde", @@ -5046,7 +5198,7 @@ version = "1.0.5" dependencies = [ "chrono", "graphql_client", - "reqwest", + "reqwest 0.12.28", "reqwest-middleware", "reqwest-retry", "serde", @@ -5069,7 +5221,7 @@ dependencies = [ "juniper", "marc", "phf 0.11.3", - "reqwest", + "reqwest 0.12.28", "reqwest-middleware", "serde", "serde_json", @@ -5095,7 +5247,7 @@ dependencies = [ "log", "marc", "paperclip", - "quick-xml", + "quick-xml 0.36.2", "regex", "serde", "serde_json", @@ -5106,6 +5258,27 @@ dependencies = [ "xml-rs", ] +[[package]] +name = "thoth-oai-server" +version = "1.0.5" +dependencies = [ + "actix-cors", + "actix-web", + "async-trait", + "base64 0.22.1", + "chrono", + "env_logger", + "flate2", + "oai-pmh-rs", + "quick-xml 0.39.2", + "reqwest 0.13.2", + "serde_json", + "thoth-api", + "thoth-client", + "thoth-errors", + "uuid", +] + [[package]] name = "time" version = "0.3.47" @@ -5164,9 +5337,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.50.0" +version = "1.51.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "27ad5e34374e03cfffefc301becb44e9dc3c17584f414349ebe29ed26661822d" +checksum = "f66bf9585cda4b724d3e78ab34b73fb2bbaba9011b9bfdf69dc836382ea13b8c" 
dependencies = [ "bytes", "libc", @@ -5181,9 +5354,9 @@ dependencies = [ [[package]] name = "tokio-macros" -version = "2.6.1" +version = "2.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c55a2eff8b69ce66c84f85e1da1c233edc36ceb85a2058d11b0d6a3c7e7569c" +checksum = "385a6cb71ab9ab790c5fe8d67f1645e6c450a7ce006a33de03daa956cf70a496" dependencies = [ "proc-macro2", "quote", @@ -5240,10 +5413,12 @@ version = "0.9.12+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cf92845e79fc2e2def6a5d828f0801e29a2f8acc037becc5ab08595c7d5e9863" dependencies = [ + "indexmap 2.14.0", "serde_core", "serde_spanned", "toml_datetime", "toml_parser", + "toml_writer", "winnow 0.7.15", ] @@ -5265,6 +5440,12 @@ dependencies = [ "winnow 1.0.1", ] +[[package]] +name = "toml_writer" +version = "1.1.1+spec-1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "756daf9b1013ebe47a8776667b466417e2d4c5679d441c26230efd9ef78692db" + [[package]] name = "tonic" version = "0.14.5" @@ -5315,7 +5496,7 @@ checksum = "ebe5ef63511595f1344e2d5cfa636d973292adc0eec1f0ad45fae9f0851ab1d4" dependencies = [ "futures-core", "futures-util", - "indexmap 2.13.0", + "indexmap 2.14.0", "pin-project-lite", "slab", "sync_wrapper", @@ -5521,6 +5702,16 @@ version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5c3082ca00d5a5ef149bb8b555a72ae84c9c59f7250f013ac822ac2e49b19c64" +[[package]] +name = "walkdir" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" +dependencies = [ + "same-file", + "winapi-util", +] + [[package]] name = "want" version = "0.3.1" @@ -5556,9 +5747,9 @@ dependencies = [ [[package]] name = "wasm-bindgen" -version = "0.2.117" +version = "0.2.118" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"0551fc1bb415591e3372d0bc4780db7e587d84e2a7e79da121051c5c4b89d0b0" +checksum = "0bf938a0bacb0469e83c1e148908bd7d5a6010354cf4fb73279b7447422e3a89" dependencies = [ "cfg-if", "once_cell", @@ -5569,9 +5760,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-futures" -version = "0.4.67" +version = "0.4.68" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03623de6905b7206edd0a75f69f747f134b7f0a2323392d664448bf2d3c5d87e" +checksum = "f371d383f2fb139252e0bfac3b81b265689bf45b6874af544ffa4c975ac1ebf8" dependencies = [ "js-sys", "wasm-bindgen", @@ -5579,9 +5770,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.117" +version = "0.2.118" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fbdf9a35adf44786aecd5ff89b4563a90325f9da0923236f6104e603c7e86be" +checksum = "eeff24f84126c0ec2db7a449f0c2ec963c6a49efe0698c4242929da037ca28ed" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -5589,9 +5780,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.117" +version = "0.2.118" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dca9693ef2bab6d4e6707234500350d8dad079eb508dca05530c85dc3a529ff2" +checksum = "9d08065faf983b2b80a79fd87d8254c409281cf7de75fc4b773019824196c904" dependencies = [ "bumpalo", "proc-macro2", @@ -5602,9 +5793,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.117" +version = "0.2.118" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "39129a682a6d2d841b6c429d0c51e5cb0ed1a03829d8b3d1e69a011e62cb3d3b" +checksum = "5fd04d9e306f1907bd13c6361b5c6bfc7b3b3c095ed3f8a9246390f8dbdee129" dependencies = [ "unicode-ident", ] @@ -5626,7 +5817,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bb0e353e6a2fbdc176932bbaab493762eb1255a7900fe0fea1a2f96c296cc909" dependencies = [ "anyhow", - "indexmap 2.13.0", + "indexmap 2.14.0", "wasm-encoder", 
"wasmparser", ] @@ -5654,15 +5845,15 @@ checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe" dependencies = [ "bitflags 2.11.0", "hashbrown 0.15.5", - "indexmap 2.13.0", + "indexmap 2.14.0", "semver", ] [[package]] name = "web-sys" -version = "0.3.94" +version = "0.3.95" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd70027e39b12f0849461e08ffc50b9cd7688d942c1c8e3c7b22273236b4dd0a" +checksum = "4f2dfbb17949fa2088e5d39408c48368947b86f7834484e87b73de55bc14d97d" dependencies = [ "js-sys", "wasm-bindgen", @@ -5678,6 +5869,15 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "webpki-root-certs" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "804f18a4ac2676ffb4e8b5b5fa9ae38af06df08162314f96a68d2a363e21a8ca" +dependencies = [ + "rustls-pki-types", +] + [[package]] name = "webpki-roots" version = "1.0.6" @@ -5703,6 +5903,15 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" +[[package]] +name = "winapi-util" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" +dependencies = [ + "windows-sys 0.61.2", +] + [[package]] name = "winapi-x86_64-pc-windows-gnu" version = "0.4.0" @@ -5779,6 +5988,15 @@ dependencies = [ "windows-link", ] +[[package]] +name = "windows-sys" +version = "0.45.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0" +dependencies = [ + "windows-targets 0.42.2", +] + [[package]] name = "windows-sys" version = "0.52.0" @@ -5815,6 +6033,21 @@ dependencies = [ "windows-link", ] +[[package]] +name = "windows-targets" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"8e5180c00cd44c9b1c88adb3693291f1cd93605ded80c250a75d472756b4d071" +dependencies = [ + "windows_aarch64_gnullvm 0.42.2", + "windows_aarch64_msvc 0.42.2", + "windows_i686_gnu 0.42.2", + "windows_i686_msvc 0.42.2", + "windows_x86_64_gnu 0.42.2", + "windows_x86_64_gnullvm 0.42.2", + "windows_x86_64_msvc 0.42.2", +] + [[package]] name = "windows-targets" version = "0.52.6" @@ -5848,6 +6081,12 @@ dependencies = [ "windows_x86_64_msvc 0.53.1", ] +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "597a5118570b68bc08d8d59125332c54f1ba9d9adeedeef5b99b02ba2b0698f8" + [[package]] name = "windows_aarch64_gnullvm" version = "0.52.6" @@ -5860,6 +6099,12 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a9d8416fa8b42f5c947f8482c43e7d89e73a173cead56d044f6a56104a6d1b53" +[[package]] +name = "windows_aarch64_msvc" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e08e8864a60f06ef0d0ff4ba04124db8b0fb3be5776a5cd47641e942e58c4d43" + [[package]] name = "windows_aarch64_msvc" version = "0.52.6" @@ -5872,6 +6117,12 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b9d782e804c2f632e395708e99a94275910eb9100b2114651e04744e9b125006" +[[package]] +name = "windows_i686_gnu" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c61d927d8da41da96a81f029489353e68739737d3beca43145c8afec9a31a84f" + [[package]] name = "windows_i686_gnu" version = "0.52.6" @@ -5896,6 +6147,12 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fa7359d10048f68ab8b09fa71c3daccfb0e9b559aed648a8f95469c27057180c" +[[package]] +name = "windows_i686_msvc" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"44d840b6ec649f480a41c8d80f9c65108b92d89345dd94027bfe06ac444d1060" + [[package]] name = "windows_i686_msvc" version = "0.52.6" @@ -5908,6 +6165,12 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e7ac75179f18232fe9c285163565a57ef8d3c89254a30685b57d83a38d326c2" +[[package]] +name = "windows_x86_64_gnu" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8de912b8b8feb55c064867cf047dda097f92d51efad5b491dfb98f6bbb70cb36" + [[package]] name = "windows_x86_64_gnu" version = "0.52.6" @@ -5920,6 +6183,12 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c3842cdd74a865a8066ab39c8a7a473c0778a3f29370b5fd6b4b9aa7df4a499" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26d41b46a36d453748aedef1486d5c7a85db22e56aff34643984ea85514e94a3" + [[package]] name = "windows_x86_64_gnullvm" version = "0.52.6" @@ -5932,6 +6201,12 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ffa179e2d07eee8ad8f57493436566c7cc30ac536a3379fdf008f47f6bb7ae1" +[[package]] +name = "windows_x86_64_msvc" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9aec5da331524158c6d1a4ac0ab1541149c0b9505fde06423b02f5ef0106b9f0" + [[package]] name = "windows_x86_64_msvc" version = "0.52.6" @@ -5984,7 +6259,7 @@ checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21" dependencies = [ "anyhow", "heck 0.5.0", - "indexmap 2.13.0", + "indexmap 2.14.0", "prettyplease", "syn 2.0.117", "wasm-metadata", @@ -6015,7 +6290,7 @@ checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2" dependencies = [ "anyhow", "bitflags 2.11.0", - "indexmap 2.13.0", + "indexmap 2.14.0", "log", "serde", "serde_derive", @@ -6034,7 +6309,7 @@ checksum = 
"ecc8ac4bc1dc3381b7f59c34f00b67e18f910c2c0f50015669dde7def656a736" dependencies = [ "anyhow", "id-arena", - "indexmap 2.13.0", + "indexmap 2.14.0", "log", "semver", "serde", @@ -6046,9 +6321,9 @@ dependencies = [ [[package]] name = "writeable" -version = "0.6.2" +version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9" +checksum = "1ffae5123b2d3fc086436f8834ae3ab053a283cfac8fe0a0b8eaae044768a4c4" [[package]] name = "xml-rs" @@ -6179,7 +6454,7 @@ dependencies = [ "pbjson-types", "prost", "prost-types", - "reqwest", + "reqwest 0.12.28", "serde", "serde_json", "serde_urlencoded", diff --git a/Cargo.toml b/Cargo.toml index 962e1c128..4fb29b08b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,13 +12,21 @@ readme = "README.md" maintenance = { status = "actively-developed" } [workspace] -members = ["thoth-api", "thoth-api-server", "thoth-client", "thoth-errors", "thoth-export-server"] +members = [ + "thoth-api", + "thoth-api-server", + "thoth-client", + "thoth-errors", + "thoth-export-server", + "thoth-oai-server", +] [dependencies] thoth-api = { version = "=1.0.5", path = "thoth-api", features = ["backend"] } thoth-api-server = { version = "=1.0.5", path = "thoth-api-server" } thoth-errors = { version = "=1.0.5", path = "thoth-errors" } thoth-export-server = { version = "=1.0.5", path = "thoth-export-server" } +thoth-oai-server = { version = "=1.0.5", path = "thoth-oai-server" } base64 = "0.22.1" clap = { version = "4.5.32", features = ["cargo", "env"] } dialoguer = { version = "0.11.0", features = ["password"] } diff --git a/Makefile b/Makefile index a073d2a30..94bec8ca6 100644 --- a/Makefile +++ b/Makefile @@ -6,6 +6,7 @@ run-zitadel \ run-graphql-api \ run-export-api \ + run-oai-api \ build \ test \ check \ @@ -31,6 +32,7 @@ help: @echo " run-zitadel Start Zitadel (docker)" @echo " run-graphql-api Run GraphQL API (cargo)" @echo " run-export-api Run export API 
(cargo)" + @echo " run-oai-api Run OAI-PMH API (cargo)" @echo " build Build the workspace" @echo " test Run tests" @echo " coverage Run test coverage (cargo llvm-cov)" @@ -59,6 +61,9 @@ run-graphql-api: build run-export-api: build RUST_BACKTRACE=1 cargo run start export-api +run-oai-api: build + RUST_BACKTRACE=1 cargo run start oai-api + build: cargo build -vv @@ -89,4 +94,3 @@ migration: mkdir -p $$dir; \ touch $$dir/up.sql; \ touch $$dir/down.sql; - diff --git a/src/bin/arguments/mod.rs b/src/bin/arguments/mod.rs index 236948101..64df0eed2 100644 --- a/src/bin/arguments/mod.rs +++ b/src/bin/arguments/mod.rs @@ -85,6 +85,28 @@ pub fn export_url() -> Arg { .num_args(1) } +pub fn oai_url() -> Arg { + Arg::new("oai-url") + .short('o') + .long("oai-url") + .value_name("THOTH_OAI_API") + .env("THOTH_OAI_API") + .default_value("http://localhost:8383") + .help("Thoth OAI-PMH API's, public facing, root URL.") + .num_args(1) +} + +pub fn oai_retry_after_seconds() -> Arg { + Arg::new("oai-retry-after-seconds") + .long("oai-retry-after-seconds") + .value_name("OAI_API_RETRY_AFTER_SECONDS") + .env("OAI_API_RETRY_AFTER_SECONDS") + .default_value("30") + .help("Retry-After value in seconds used for transient upstream OAI failures") + .num_args(1) + .value_parser(value_parser!(u64)) +} + pub fn zitadel_url() -> Arg { Arg::new("zitadel-url") .short('z') diff --git a/src/bin/commands/start.rs b/src/bin/commands/start.rs index c557c857d..73d1a9dc4 100644 --- a/src/bin/commands/start.rs +++ b/src/bin/commands/start.rs @@ -1,7 +1,7 @@ use crate::arguments; use clap::{ArgMatches, Command}; use lazy_static::lazy_static; -use thoth::{api_server, errors::ThothResult, export_server}; +use thoth::{api_server, errors::ThothResult, export_server, oai_server}; lazy_static! { pub(crate) static ref COMMAND: Command = Command::new("start") @@ -33,6 +33,18 @@ lazy_static! 
{ .arg(arguments::keep_alive("EXPORT_API_KEEP_ALIVE")) .arg(arguments::export_url()) .arg(arguments::gql_endpoint()), + ) + .subcommand( + Command::new("oai-api") + .about("Start the thoth OAI-PMH API") + .arg(arguments::host("OAI_API_HOST")) + .arg(arguments::port("8383", "OAI_API_PORT")) + .arg(arguments::threads("OAI_API_THREADS")) + .arg(arguments::keep_alive("OAI_API_KEEP_ALIVE")) + .arg(arguments::oai_url()) + .arg(arguments::gql_endpoint()) + .arg(arguments::oai_retry_after_seconds()) + .arg(arguments::export_url()), ); } @@ -98,3 +110,32 @@ pub fn export_api(arguments: &ArgMatches) -> ThothResult<()> { ) .map_err(|e| e.into()) } + +pub fn oai_api(arguments: &ArgMatches) -> ThothResult<()> { + let host = arguments.get_one::("host").unwrap().to_owned(); + let port = arguments.get_one::("port").unwrap().to_owned(); + let threads = *arguments.get_one::("threads").unwrap(); + let keep_alive = *arguments.get_one::("keep-alive").unwrap(); + let public_url = arguments.get_one::("oai-url").unwrap().to_owned(); + let gql_endpoint = arguments + .get_one::("gql-endpoint") + .unwrap() + .to_owned(); + let retry_after_seconds = *arguments.get_one::("oai-retry-after-seconds").unwrap(); + let export_url = arguments + .get_one::("export-url") + .unwrap() + .to_owned(); + + oai_server( + host, + port, + threads, + keep_alive, + public_url, + gql_endpoint, + export_url, + retry_after_seconds, + ) + .map_err(|e| e.into()) +} diff --git a/src/bin/thoth.rs b/src/bin/thoth.rs index 6ee60f80c..4db5fe212 100644 --- a/src/bin/thoth.rs +++ b/src/bin/thoth.rs @@ -23,6 +23,7 @@ fn main() -> thoth::errors::ThothResult<()> { Some(("start", start_arguments)) => match start_arguments.subcommand() { Some(("graphql-api", arguments)) => commands::start::graphql_api(arguments), Some(("export-api", arguments)) => commands::start::export_api(arguments), + Some(("oai-api", arguments)) => commands::start::oai_api(arguments), _ => unreachable!(), }, Some(("migrate", arguments)) => 
commands::migrate(arguments), diff --git a/src/lib.rs b/src/lib.rs index 10b035ce2..5ef4da6b4 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -2,3 +2,4 @@ pub use thoth_api as api; pub use thoth_api_server::start_server as api_server; pub use thoth_errors as errors; pub use thoth_export_server::{start_server as export_server, ALL_SPECIFICATIONS}; +pub use thoth_oai_server::start_server as oai_server; diff --git a/thoth-api/Cargo.toml b/thoth-api/Cargo.toml index dd564117e..a4a76afe5 100644 --- a/thoth-api/Cargo.toml +++ b/thoth-api/Cargo.toml @@ -36,10 +36,10 @@ actix-web = { version = "4.10", optional = true } isbn = "0.6.0" chrono = { version = "0.4.40", features = ["serde"] } deadpool-redis = { version = "0.20.0", optional = true } -diesel = { version = "2.2.8", features = ["postgres", "uuid", "chrono", "r2d2", "64-column-tables", "serde_json"], optional = true } -diesel-derive-enum = { version = "2.1.0", features = ["postgres"], optional = true } -diesel-derive-newtype = "2.1.2" -diesel_migrations = { version = "2.2.0", features = ["postgres"], optional = true } +diesel = { version = "=2.2.8", features = ["postgres", "uuid", "chrono", "r2d2", "64-column-tables", "serde_json"], optional = true } +diesel-derive-enum = { version = "=2.1.0", features = ["postgres"], optional = true } +diesel-derive-newtype = "=2.1.2" +diesel_migrations = { version = "=2.2.0", features = ["postgres"], optional = true } dotenv = "0.15.0" futures = { version = "0.3.31", optional = true } jsonwebtoken = { version = "10.3.0", optional = true } diff --git a/thoth-api/src/model/work_relation/crud.rs b/thoth-api/src/model/work_relation/crud.rs index 6fbe565dd..27c45b099 100644 --- a/thoth-api/src/model/work_relation/crud.rs +++ b/thoth-api/src/model/work_relation/crud.rs @@ -5,8 +5,8 @@ use super::{ use crate::model::{Crud, DbInsert, HistoryEntry, PublisherId, Reorder}; use crate::schema::{work_relation, work_relation_history}; use diesel::{ - dsl::max, sql_query, sql_types::Text, 
BoolExpressionMethods, Connection, ExpressionMethods, - QueryDsl, RunQueryDsl, + sql_query, sql_types::Text, BoolExpressionMethods, Connection, ExpressionMethods, QueryDsl, + RunQueryDsl, }; use thoth_errors::{ThothError, ThothResult}; use uuid::Uuid; @@ -142,7 +142,7 @@ impl Crud for WorkRelation { // This will return `None` if no records with this work and type already exist. let max_inverse_ordinal = work_relation::table - .select(max(work_relation::relation_ordinal)) + .select(diesel::dsl::max(work_relation::relation_ordinal)) .filter(work_relation::relator_work_id.eq(data.related_work_id).and( work_relation::relation_type.eq(data.relation_type.convert_to_inverse()), )) diff --git a/thoth-client/Cargo.toml b/thoth-client/Cargo.toml index 5974cfcee..1575883d3 100644 --- a/thoth-client/Cargo.toml +++ b/thoth-client/Cargo.toml @@ -10,7 +10,7 @@ readme = "README.md" build = "build.rs" [dependencies] -thoth-api = {version = "=1.0.5", path = "../thoth-api" } +thoth-api = {version = "=1.0.5", path = "../thoth-api" , features = ["backend"] } thoth-errors = {version = "=1.0.5", path = "../thoth-errors" } graphql_client = "0.14.0" chrono = { version = "0.4.40", features = ["serde"] } diff --git a/thoth-client/assets/queries.graphql b/thoth-client/assets/queries.graphql index 68d8b6bbd..a858935c6 100644 --- a/thoth-client/assets/queries.graphql +++ b/thoth-client/assets/queries.graphql @@ -34,8 +34,14 @@ fragment Funding on Funding { } } +fragment PublisherFields on Publisher { + publisherId + publisherName +} + fragment Work on Work { workId + updatedAtWithRelations workStatus workType reference @@ -334,3 +340,91 @@ query WorksLastUpdatedQuery( updatedAtWithRelations } } + +query OaiWorksQuery( + $limit: Int!, + $offset: Int!, + $publishers: [Uuid!], + $abstractsLimit: Int!, + $issuesLimit: Int!, + $languagesLimit: Int!, + $publicationsLimit: Int!, + $subjectsLimit: Int!, + $titlesLimit: Int!, + $fundingsLimit: Int!, + $relationsLimit: Int!, + $referencesLimit: Int! 
+) { + works( + limit: $limit, + offset: $offset, + publishers: $publishers, + workStatuses: [ACTIVE], + order: {field: UPDATED_AT_WITH_RELATIONS, direction: DESC} + ) { + ...Work + } +} + +query OaiBooksQuery( + $limit: Int!, + $offset: Int!, + $publishers: [Uuid!], + $abstractsLimit: Int!, + $issuesLimit: Int!, + $languagesLimit: Int!, + $publicationsLimit: Int!, + $subjectsLimit: Int!, + $titlesLimit: Int!, + $fundingsLimit: Int!, + $relationsLimit: Int!, + $referencesLimit: Int! +) { + books( + limit: $limit, + offset: $offset, + publishers: $publishers, + workStatuses: [ACTIVE], + order: {field: UPDATED_AT_WITH_RELATIONS, direction: DESC} + ) { + ...Work + } +} + +query OaiWorkCountQuery( + $publishers: [Uuid!] +) { + workCount(publishers: $publishers, workStatuses: [ACTIVE]) +} + +query OaiBookCountQuery( + $publishers: [Uuid!] +) { + bookCount(publishers: $publishers, workStatuses: [ACTIVE]) +} + +query OaiLatestWorksUpdatedQuery { + works( + workStatuses: [ACTIVE], + limit: 1, + order: {field: UPDATED_AT_WITH_RELATIONS, direction: DESC} + ) { + updatedAtWithRelations + } +} + +query OaiEarliestWorksUpdatedQuery { + works( + workStatuses: [ACTIVE], + limit: 1, + order: {field: UPDATED_AT_WITH_RELATIONS, direction: ASC} + ) { + updatedAtWithRelations + } +} + +query PublishersQuery { + publishers(limit: 10000) { + ...PublisherFields + } +} diff --git a/thoth-client/src/lib.rs b/thoth-client/src/lib.rs index 6ac9ce99e..5e4386940 100644 --- a/thoth-client/src/lib.rs +++ b/thoth-client/src/lib.rs @@ -7,8 +7,12 @@ pub use crate::parameters::QueryParameters; use crate::parameters::{WorkQueryVariables, WorksQueryVariables}; pub use crate::queries::work_query::*; use crate::queries::{ + oai_book_count_query, oai_books_query, oai_earliest_works_updated_query, + oai_latest_works_updated_query, oai_work_count_query, oai_works_query, publishers_query, work_count_query, work_last_updated_query, work_query, works_last_updated_query, works_query, - WorkCountQuery, 
WorkLastUpdatedQuery, WorkQuery, WorksLastUpdatedQuery, WorksQuery, + OaiBookCountQuery, OaiBooksQuery, OaiEarliestWorksUpdatedQuery, OaiLatestWorksUpdatedQuery, + OaiWorkCountQuery, OaiWorksQuery, PublishersQuery, WorkCountQuery, WorkLastUpdatedQuery, + WorkQuery, WorksLastUpdatedQuery, WorksQuery, }; pub use chrono::NaiveDate; use graphql_client::GraphQLQuery; @@ -23,6 +27,12 @@ use thoth_api::model::Timestamp; use thoth_errors::{ThothError, ThothResult}; use uuid::Uuid; +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct Publisher { + pub publisher_id: Uuid, + pub publisher_name: String, +} + /// Maximum number of allowed request retries attempts. const MAX_REQUEST_RETRIES: u32 = 5; @@ -268,4 +278,117 @@ impl ThothClient { None => Err(ThothError::EntityNotFound), } } + + pub async fn get_oai_works( + &self, + publishers: Option>, + limit: i64, + offset: i64, + parameters: QueryParameters, + ) -> ThothResult> { + let variables: oai_works_query::Variables = + WorksQueryVariables::new(publishers, limit, offset, parameters).into(); + let request_body = OaiWorksQuery::build_query(variables); + let res = self.post_request(&request_body).await.await?; + let response_body: Response = + self.parse_graphql_response(res).await?; + match response_body.data { + Some(data) => Ok(data.works.into_iter().map(Into::into).collect()), + None => Err(ThothError::EntityNotFound), + } + } + + pub async fn get_oai_books( + &self, + publishers: Option>, + limit: i64, + offset: i64, + parameters: QueryParameters, + ) -> ThothResult> { + let variables: oai_books_query::Variables = + WorksQueryVariables::new(publishers, limit, offset, parameters).into(); + let request_body = OaiBooksQuery::build_query(variables); + let res = self.post_request(&request_body).await.await?; + let response_body: Response = + self.parse_graphql_response(res).await?; + match response_body.data { + Some(data) => Ok(data.books.into_iter().map(Into::into).collect()), + None => Err(ThothError::EntityNotFound), + } + 
} + + pub async fn get_oai_work_count(&self, publishers: Option>) -> ThothResult { + let variables = oai_work_count_query::Variables { publishers }; + let request_body = OaiWorkCountQuery::build_query(variables); + let res = self.post_request(&request_body).await.await?; + let response_body: Response = + self.parse_graphql_response(res).await?; + match response_body.data { + Some(data) => Ok(data.work_count), + None => Err(ThothError::EntityNotFound), + } + } + + pub async fn get_oai_book_count(&self, publishers: Option>) -> ThothResult { + let variables = oai_book_count_query::Variables { publishers }; + let request_body = OaiBookCountQuery::build_query(variables); + let res = self.post_request(&request_body).await.await?; + let response_body: Response = + self.parse_graphql_response(res).await?; + match response_body.data { + Some(data) => Ok(data.book_count), + None => Err(ThothError::EntityNotFound), + } + } + + pub async fn get_oai_latest_works_updated(&self) -> ThothResult { + let request_body = + OaiLatestWorksUpdatedQuery::build_query(oai_latest_works_updated_query::Variables {}); + let res = self.post_request(&request_body).await.await?; + let response_body: Response = + self.parse_graphql_response(res).await?; + match response_body.data { + Some(data) => data + .works + .first() + .map(|work| work.updated_at_with_relations) + .ok_or(ThothError::EntityNotFound), + None => Err(ThothError::EntityNotFound), + } + } + + pub async fn get_oai_earliest_works_updated(&self) -> ThothResult { + let request_body = OaiEarliestWorksUpdatedQuery::build_query( + oai_earliest_works_updated_query::Variables {}, + ); + let res = self.post_request(&request_body).await.await?; + let response_body: Response = + self.parse_graphql_response(res).await?; + match response_body.data { + Some(data) => data + .works + .first() + .map(|work| work.updated_at_with_relations) + .ok_or(ThothError::EntityNotFound), + None => Err(ThothError::EntityNotFound), + } + } + + pub async fn 
get_publishers(&self) -> ThothResult> { + let request_body = PublishersQuery::build_query(publishers_query::Variables {}); + let res = self.post_request(&request_body).await.await?; + let response_body: Response = + self.parse_graphql_response(res).await?; + match response_body.data { + Some(data) => Ok(data + .publishers + .into_iter() + .map(|publisher| Publisher { + publisher_id: publisher.publisher_id, + publisher_name: publisher.publisher_name, + }) + .collect()), + None => Err(ThothError::EntityNotFound), + } + } } diff --git a/thoth-client/src/parameters.rs b/thoth-client/src/parameters.rs index cdc4dfb1a..b802d05ac 100644 --- a/thoth-client/src/parameters.rs +++ b/thoth-client/src/parameters.rs @@ -1,4 +1,4 @@ -use crate::queries::{work_query, works_query}; +use crate::queries::{oai_books_query, oai_works_query, work_query, works_query}; use uuid::Uuid; /// A set of booleans to toggle directives in the GraphQL queries @@ -288,10 +288,120 @@ impl From for works_query::Variables { } } +impl From for oai_works_query::Variables { + fn from(v: WorksQueryVariables) -> Self { + oai_works_query::Variables { + publishers: v.publishers, + limit: v.limit, + offset: v.offset, + abstracts_limit: if v.parameters.with_abstracts { + FILTER_INCLUDE_ALL + } else { + FILTER_INCLUDE_CANONICAL + }, + issues_limit: if v.parameters.with_issues { + FILTER_INCLUDE_ALL + } else { + FILTER_INCLUDE_NONE + }, + languages_limit: if v.parameters.with_languages { + FILTER_INCLUDE_ALL + } else { + FILTER_INCLUDE_NONE + }, + publications_limit: if v.parameters.with_publications { + FILTER_INCLUDE_ALL + } else { + FILTER_INCLUDE_NONE + }, + subjects_limit: if v.parameters.with_subjects { + FILTER_INCLUDE_ALL + } else { + FILTER_INCLUDE_NONE + }, + fundings_limit: if v.parameters.with_fundings { + FILTER_INCLUDE_ALL + } else { + FILTER_INCLUDE_NONE + }, + relations_limit: if v.parameters.with_relations { + FILTER_INCLUDE_ALL + } else { + FILTER_INCLUDE_NONE + }, + references_limit: if 
v.parameters.with_references { + FILTER_INCLUDE_ALL + } else { + FILTER_INCLUDE_NONE + }, + titles_limit: if v.parameters.with_titles { + FILTER_INCLUDE_ALL + } else { + FILTER_INCLUDE_CANONICAL + }, + } + } +} + +impl From for oai_books_query::Variables { + fn from(v: WorksQueryVariables) -> Self { + oai_books_query::Variables { + publishers: v.publishers, + limit: v.limit, + offset: v.offset, + abstracts_limit: if v.parameters.with_abstracts { + FILTER_INCLUDE_ALL + } else { + FILTER_INCLUDE_CANONICAL + }, + issues_limit: if v.parameters.with_issues { + FILTER_INCLUDE_ALL + } else { + FILTER_INCLUDE_NONE + }, + languages_limit: if v.parameters.with_languages { + FILTER_INCLUDE_ALL + } else { + FILTER_INCLUDE_NONE + }, + publications_limit: if v.parameters.with_publications { + FILTER_INCLUDE_ALL + } else { + FILTER_INCLUDE_NONE + }, + subjects_limit: if v.parameters.with_subjects { + FILTER_INCLUDE_ALL + } else { + FILTER_INCLUDE_NONE + }, + fundings_limit: if v.parameters.with_fundings { + FILTER_INCLUDE_ALL + } else { + FILTER_INCLUDE_NONE + }, + relations_limit: if v.parameters.with_relations { + FILTER_INCLUDE_ALL + } else { + FILTER_INCLUDE_NONE + }, + references_limit: if v.parameters.with_references { + FILTER_INCLUDE_ALL + } else { + FILTER_INCLUDE_NONE + }, + titles_limit: if v.parameters.with_titles { + FILTER_INCLUDE_ALL + } else { + FILTER_INCLUDE_CANONICAL + }, + } + } +} + #[cfg(test)] mod tests { use super::*; - use crate::queries::{work_query, works_query}; + use crate::queries::{oai_books_query, oai_works_query, work_query, works_query}; #[test] fn test_default_query_parameters() { @@ -496,4 +606,63 @@ mod tests { } ); } + + #[test] + fn test_convert_parameters_to_oai_works_query_variables() { + let publisher_id: Uuid = Uuid::parse_str("00000000-0000-0000-AAAA-000000000001").unwrap(); + let publishers = Some(vec![publisher_id]); + let parameters = QueryParameters::new() + .with_issues() + .with_languages() + .with_publications(); + + let 
variables: oai_works_query::Variables = + WorksQueryVariables::new(publishers.clone(), 50, 25, parameters).into(); + + assert_eq!( + variables, + oai_works_query::Variables { + publishers: publishers.clone(), + limit: 50, + offset: 25, + abstracts_limit: FILTER_INCLUDE_CANONICAL, + issues_limit: FILTER_INCLUDE_ALL, + languages_limit: FILTER_INCLUDE_ALL, + publications_limit: FILTER_INCLUDE_ALL, + subjects_limit: FILTER_INCLUDE_NONE, + fundings_limit: FILTER_INCLUDE_NONE, + relations_limit: FILTER_INCLUDE_NONE, + references_limit: FILTER_INCLUDE_NONE, + titles_limit: FILTER_INCLUDE_CANONICAL, + } + ); + } + + #[test] + fn test_convert_parameters_to_oai_books_query_variables() { + let publisher_id: Uuid = Uuid::parse_str("00000000-0000-0000-AAAA-000000000001").unwrap(); + let publishers = Some(vec![publisher_id]); + let parameters = QueryParameters::new().with_all(); + + let variables: oai_books_query::Variables = + WorksQueryVariables::new(publishers.clone(), 10, 5, parameters).into(); + + assert_eq!( + variables, + oai_books_query::Variables { + publishers, + limit: 10, + offset: 5, + abstracts_limit: FILTER_INCLUDE_ALL, + issues_limit: FILTER_INCLUDE_ALL, + languages_limit: FILTER_INCLUDE_ALL, + publications_limit: FILTER_INCLUDE_ALL, + subjects_limit: FILTER_INCLUDE_ALL, + fundings_limit: FILTER_INCLUDE_ALL, + relations_limit: FILTER_INCLUDE_ALL, + references_limit: FILTER_INCLUDE_ALL, + titles_limit: FILTER_INCLUDE_ALL, + } + ); + } } diff --git a/thoth-client/src/queries.rs b/thoth-client/src/queries.rs index 6f25c9f18..993dea387 100644 --- a/thoth-client/src/queries.rs +++ b/thoth-client/src/queries.rs @@ -98,6 +98,69 @@ pub struct WorkLastUpdatedQuery; )] pub struct WorksLastUpdatedQuery; +#[derive(GraphQLQuery)] +#[graphql( + schema_path = "assets/schema.graphql", + query_path = "assets/queries.graphql", + response_derives = "Debug,Clone,Deserialize,Serialize,PartialEq", + variables_derives = "Debug,PartialEq" +)] +pub struct OaiWorksQuery; + 
+#[derive(GraphQLQuery)] +#[graphql( + schema_path = "assets/schema.graphql", + query_path = "assets/queries.graphql", + response_derives = "Debug,Clone,Deserialize,Serialize,PartialEq", + variables_derives = "Debug,PartialEq" +)] +pub struct OaiBooksQuery; + +#[derive(GraphQLQuery)] +#[graphql( + schema_path = "assets/schema.graphql", + query_path = "assets/queries.graphql", + response_derives = "Debug,Clone,Deserialize,Serialize,PartialEq", + variables_derives = "Debug,PartialEq" +)] +pub struct OaiWorkCountQuery; + +#[derive(GraphQLQuery)] +#[graphql( + schema_path = "assets/schema.graphql", + query_path = "assets/queries.graphql", + response_derives = "Debug,Clone,Deserialize,Serialize,PartialEq", + variables_derives = "Debug,PartialEq" +)] +pub struct OaiBookCountQuery; + +#[derive(GraphQLQuery)] +#[graphql( + schema_path = "assets/schema.graphql", + query_path = "assets/queries.graphql", + response_derives = "Debug,Clone,Deserialize,Serialize,PartialEq", + variables_derives = "Debug,PartialEq" +)] +pub struct OaiLatestWorksUpdatedQuery; + +#[derive(GraphQLQuery)] +#[graphql( + schema_path = "assets/schema.graphql", + query_path = "assets/queries.graphql", + response_derives = "Debug,Clone,Deserialize,Serialize,PartialEq", + variables_derives = "Debug,PartialEq" +)] +pub struct OaiEarliestWorksUpdatedQuery; + +#[derive(GraphQLQuery)] +#[graphql( + schema_path = "assets/schema.graphql", + query_path = "assets/queries.graphql", + response_derives = "Debug,Clone,Deserialize,Serialize,PartialEq", + variables_derives = "Debug,PartialEq" +)] +pub struct PublishersQuery; + // Needed to set work_query::Work as the canonical struct for the shared fragment in the two queries // until https://github.com/graphql-rust/graphql-client/issues/312 gets fixed impl From for work_query::Work { @@ -107,6 +170,20 @@ impl From for work_query::Work { } } +impl From for work_query::Work { + fn from(w: oai_works_query::Work) -> Self { + let se = serde_json::to_string(&w).unwrap(); + 
serde_json::from_str(&se).unwrap() + } +} + +impl From for work_query::Work { + fn from(w: oai_books_query::Work) -> Self { + let se = serde_json::to_string(&w).unwrap(); + serde_json::from_str(&se).unwrap() + } +} + // As above: enables shared processing of parent Works and child RelatedWorks in doideposit format impl From for work_query::WorkRelationsRelatedWork { fn from(w: work_query::Work) -> Self { diff --git a/thoth-export-server/src/bibtex/bibtex_thoth.rs b/thoth-export-server/src/bibtex/bibtex_thoth.rs index 8f871de00..69e805229 100644 --- a/thoth-export-server/src/bibtex/bibtex_thoth.rs +++ b/thoth-export-server/src/bibtex/bibtex_thoth.rs @@ -309,6 +309,7 @@ mod tests { }, ], work_id: Uuid::from_str("00000000-0000-0000-AAAA-000000000001").unwrap(), + updated_at_with_relations: thoth_api::model::Timestamp::parse_from_rfc3339("2024-01-01T00:00:00Z").unwrap(), work_status: WorkStatus::ACTIVE, titles: vec![thoth_client::WorkTitles { title_id: Uuid::from_str("00000000-0000-0000-CCCC-000000000001").unwrap(), diff --git a/thoth-export-server/src/csv/csv_thoth.rs b/thoth-export-server/src/csv/csv_thoth.rs index 61c65b9eb..6f9790acb 100644 --- a/thoth-export-server/src/csv/csv_thoth.rs +++ b/thoth-export-server/src/csv/csv_thoth.rs @@ -535,6 +535,7 @@ mod tests { lazy_static! 
{ static ref TEST_WORK: Work = Work { work_id: Uuid::from_str("00000000-0000-0000-AAAA-000000000001").unwrap(), + updated_at_with_relations: thoth_api::model::Timestamp::parse_from_rfc3339("2024-01-01T00:00:00Z").unwrap(), work_status: WorkStatus::ACTIVE, titles: vec![thoth_client::WorkTitles { title_id: Uuid::from_str("00000000-0000-0000-CCCC-000000000001").unwrap(), diff --git a/thoth-export-server/src/csv/kbart_oclc.rs b/thoth-export-server/src/csv/kbart_oclc.rs index cd378f723..9e0fb1de9 100644 --- a/thoth-export-server/src/csv/kbart_oclc.rs +++ b/thoth-export-server/src/csv/kbart_oclc.rs @@ -247,6 +247,10 @@ mod tests { fn test_kbart_oclc() { let mut test_work: Work = Work { work_id: Uuid::from_str("00000000-0000-0000-AAAA-000000000001").unwrap(), + updated_at_with_relations: thoth_api::model::Timestamp::parse_from_rfc3339( + "2024-01-01T00:00:00Z", + ) + .unwrap(), work_status: WorkStatus::ACTIVE, // We must manually set full_title within this test framework, but // Thoth UI compiles it automatically from title + (optional) subtitle diff --git a/thoth-export-server/src/data.rs b/thoth-export-server/src/data.rs index 6db320e3c..4d48cea1d 100644 --- a/thoth-export-server/src/data.rs +++ b/thoth-export-server/src/data.rs @@ -79,6 +79,18 @@ lazy_static! { format: concat!(env!("THOTH_EXPORT_API"), "/formats/json"), accepted_by: vec![concat!(env!("THOTH_EXPORT_API"), "/platforms/thoth"),], }, + Specification { + id: "dublin_core::thoth", + name: "Thoth Dublin Core", + format: concat!(env!("THOTH_EXPORT_API"), "/formats/dublin_core"), + accepted_by: vec![concat!(env!("THOTH_EXPORT_API"), "/platforms/thoth"),], + }, + Specification { + id: "openaire::thoth", + name: "Thoth OpenAIRE", + format: concat!(env!("THOTH_EXPORT_API"), "/formats/openaire"), + accepted_by: vec![concat!(env!("THOTH_EXPORT_API"), "/platforms/thoth"),], + }, Specification { id: "kbart::oclc", name: "OCLC KBART", @@ -138,6 +150,11 @@ lazy_static! 
{ accepts: vec![ concat!(env!("THOTH_EXPORT_API"), "/specifications/csv::thoth"), concat!(env!("THOTH_EXPORT_API"), "/specifications/json::thoth"), + concat!( + env!("THOTH_EXPORT_API"), + "/specifications/dublin_core::thoth" + ), + concat!(env!("THOTH_EXPORT_API"), "/specifications/openaire::thoth"), concat!( env!("THOTH_EXPORT_API"), "/specifications/marc21record::thoth" @@ -356,6 +373,24 @@ lazy_static! { "/specifications/json::thoth" ),], }, + Format { + id: "dublin_core", + name: "Dublin Core", + version: None, + specifications: vec![concat!( + env!("THOTH_EXPORT_API"), + "/specifications/dublin_core::thoth" + ),], + }, + Format { + id: "openaire", + name: "OpenAIRE", + version: None, + specifications: vec![concat!( + env!("THOTH_EXPORT_API"), + "/specifications/openaire::thoth" + ),], + }, Format { id: "kbart", name: "KBART", @@ -522,9 +557,10 @@ mod tests { #[test] fn test_format_id_derives_from_name_and_version() { for f in ALL_FORMATS.iter() { + let base = f.name.to_lowercase().replace(' ', "_"); let id_should_be = match f.version { - Some(version) => format!("{}_{}", f.name.to_lowercase(), version), - None => f.name.to_lowercase().to_string(), + Some(version) => format!("{}_{}", base, version), + None => base, }; assert_eq!(String::from(f.id), id_should_be) } diff --git a/thoth-export-server/src/json/json_thoth.rs b/thoth-export-server/src/json/json_thoth.rs index 6e8192a7e..06cddad47 100644 --- a/thoth-export-server/src/json/json_thoth.rs +++ b/thoth-export-server/src/json/json_thoth.rs @@ -77,6 +77,7 @@ mod tests { lazy_static! 
{ static ref TEST_WORK: Work = Work { work_id: Uuid::from_str("00000000-0000-0000-aaaa-000000000001").unwrap(), + updated_at_with_relations: thoth_api::model::Timestamp::parse_from_rfc3339("2024-01-01T00:00:00Z").unwrap(), work_status: WorkStatus::ACTIVE, titles: vec![thoth_client::WorkTitles { title_id: Uuid::from_str("00000000-0000-0000-cccc-000000000001").unwrap(), @@ -517,6 +518,7 @@ mod tests { const TEST_RESULT: &str = r#" "workId": "00000000-0000-0000-aaaa-000000000001", + "updatedAtWithRelations": "2024-01-01T00:00:00Z", "workStatus": "ACTIVE", "workType": "MONOGRAPH", "reference": null, diff --git a/thoth-export-server/src/lib.rs b/thoth-export-server/src/lib.rs index 175229051..9e0615a80 100644 --- a/thoth-export-server/src/lib.rs +++ b/thoth-export-server/src/lib.rs @@ -1,3 +1,5 @@ +#![recursion_limit = "1024"] + use std::io; use std::time::Duration; diff --git a/thoth-export-server/src/marc21/marc21record_thoth.rs b/thoth-export-server/src/marc21/marc21record_thoth.rs index efc9bfea5..db3580725 100644 --- a/thoth-export-server/src/marc21/marc21record_thoth.rs +++ b/thoth-export-server/src/marc21/marc21record_thoth.rs @@ -807,6 +807,10 @@ pub(crate) mod tests { pub(crate) fn test_work() -> Work { Work { work_id: Uuid::from_str("00000000-0000-0000-AAAA-000000000001").unwrap(), + updated_at_with_relations: thoth_api::model::Timestamp::parse_from_rfc3339( + "2024-01-01T00:00:00Z", + ) + .unwrap(), work_status: WorkStatus::ACTIVE, titles: vec![thoth_client::WorkTitles { title_id: Uuid::from_str("00000000-0000-0000-CCCC-000000000001").unwrap(), diff --git a/thoth-export-server/src/record.rs b/thoth-export-server/src/record.rs index 7b9384704..ca80550c6 100644 --- a/thoth-export-server/src/record.rs +++ b/thoth-export-server/src/record.rs @@ -18,9 +18,9 @@ use crate::json::{JsonSpecification, JsonThoth}; use crate::marc21::{Marc21MarkupThoth, Marc21RecordThoth, Marc21Specification}; use crate::specification_query::SpecificationQuery; use crate::xml::{ - 
DoiDepositCrossref, Marc21XmlThoth, Onix21EbscoHost, Onix21ProquestEbrary, Onix31Thoth, - Onix3GoogleBooks, Onix3Jstor, Onix3Oapen, Onix3Overdrive, Onix3ProjectMuse, Onix3Thoth, - XmlSpecification, + DoiDepositCrossref, DublinCoreThoth, Marc21XmlThoth, Onix21EbscoHost, Onix21ProquestEbrary, + Onix31Thoth, Onix3GoogleBooks, Onix3Jstor, Onix3Oapen, Onix3Overdrive, Onix3ProjectMuse, + Onix3Thoth, OpenaireThoth, XmlSpecification, }; pub const DELIMITER_COMMA: u8 = b','; @@ -47,6 +47,8 @@ pub(crate) enum MetadataSpecification { Marc21RecordThoth(Marc21RecordThoth), Marc21MarkupThoth(Marc21MarkupThoth), Marc21XmlThoth(Marc21XmlThoth), + DublinCoreThoth(DublinCoreThoth), + OpenaireThoth(OpenaireThoth), } pub(crate) struct MetadataRecord { @@ -103,6 +105,8 @@ impl MetadataRecord { MetadataSpecification::Marc21RecordThoth(_) => Self::MARC_MIME_TYPE, MetadataSpecification::Marc21MarkupThoth(_) => Self::TXT_MIME_TYPE, MetadataSpecification::Marc21XmlThoth(_) => Self::XML_MIME_TYPE, + MetadataSpecification::DublinCoreThoth(_) => Self::XML_MIME_TYPE, + MetadataSpecification::OpenaireThoth(_) => Self::XML_MIME_TYPE, } } @@ -125,6 +129,8 @@ impl MetadataRecord { MetadataSpecification::Marc21RecordThoth(_) => self.marc_record_file_name(), MetadataSpecification::Marc21MarkupThoth(_) => self.marc_markup_file_name(), MetadataSpecification::Marc21XmlThoth(_) => self.xml_file_name(), + MetadataSpecification::DublinCoreThoth(_) => self.xml_file_name(), + MetadataSpecification::OpenaireThoth(_) => self.xml_file_name(), } } @@ -273,6 +279,12 @@ impl MetadataRecord { MetadataSpecification::Marc21XmlThoth(marc21xml_thoth) => { marc21xml_thoth.generate(&data) } + MetadataSpecification::DublinCoreThoth(dublin_core_thoth) => { + dublin_core_thoth.generate(&data, None) + } + MetadataSpecification::OpenaireThoth(openaire_thoth) => { + openaire_thoth.generate(&data, None) + } } } } @@ -343,6 +355,8 @@ impl FromStr for MetadataSpecification { Marc21MarkupThoth {}, )), "marc21xml::thoth" => 
Ok(MetadataSpecification::Marc21XmlThoth(Marc21XmlThoth {})), + "dublin_core::thoth" => Ok(MetadataSpecification::DublinCoreThoth(DublinCoreThoth {})), + "openaire::thoth" => Ok(MetadataSpecification::OpenaireThoth(OpenaireThoth {})), _ => Err(ThothError::InvalidMetadataSpecification(input.to_string())), } } @@ -368,6 +382,8 @@ impl Display for MetadataSpecification { MetadataSpecification::Marc21RecordThoth(_) => "marc21record::thoth", MetadataSpecification::Marc21MarkupThoth(_) => "marc21markup::thoth", MetadataSpecification::Marc21XmlThoth(_) => "marc21xml::thoth", + MetadataSpecification::DublinCoreThoth(_) => "dublin_core::thoth", + MetadataSpecification::OpenaireThoth(_) => "openaire::thoth", }; write!(f, "{str}") } @@ -536,5 +552,23 @@ mod tests { to_test.file_name(), "marc21xml__thoth__some_id.xml".to_string() ); + let to_test = MetadataRecord::new( + "some_id".to_string(), + MetadataSpecification::DublinCoreThoth(DublinCoreThoth {}), + timestamp, + ); + assert_eq!( + to_test.file_name(), + "dublin_core__thoth__some_id.xml".to_string() + ); + let to_test = MetadataRecord::new( + "some_id".to_string(), + MetadataSpecification::OpenaireThoth(OpenaireThoth {}), + timestamp, + ); + assert_eq!( + to_test.file_name(), + "openaire__thoth__some_id.xml".to_string() + ); } } diff --git a/thoth-export-server/src/specification_query.rs b/thoth-export-server/src/specification_query.rs index fb61189af..0bbdb26dd 100644 --- a/thoth-export-server/src/specification_query.rs +++ b/thoth-export-server/src/specification_query.rs @@ -269,6 +269,71 @@ impl TryFrom for QueryParameters { .with_subjects() .with_languages() .with_fundings()), + MetadataSpecification::DublinCoreThoth(_) => match q.request { + SpecificationRequest::ByWork => Ok(QueryParameters::new() + .with_all_abstracts() + .with_all_titles() + .with_issues() + .with_languages() + .with_publications() + .with_subjects() + .with_fundings() + .with_relations() + .with_references()), + SpecificationRequest::ByPublisher 
=> Err(ThothError::IncompleteMetadataRecord( + "dublin_core::thoth".to_string(), + "Output can only be generated for one work at a time".to_string(), + )), + }, + MetadataSpecification::OpenaireThoth(_) => match q.request { + SpecificationRequest::ByWork => Ok(QueryParameters::new() + .with_all_abstracts() + .with_all_titles() + .with_issues() + .with_languages() + .with_publications() + .with_subjects() + .with_fundings() + .with_relations() + .with_references()), + SpecificationRequest::ByPublisher => Err(ThothError::IncompleteMetadataRecord( + "openaire::thoth".to_string(), + "Output can only be generated for one work at a time".to_string(), + )), + }, } } } + +#[cfg(test)] +mod tests { + use super::*; + use crate::record::MetadataSpecification; + use crate::xml::{DublinCoreThoth, OpenaireThoth}; + + #[test] + fn dublin_core_by_publisher_is_unsupported() { + let result = QueryParameters::try_from(QueryConfiguration::by_publisher( + MetadataSpecification::DublinCoreThoth(DublinCoreThoth {}), + )); + assert!(matches!( + result, + Err(ThothError::IncompleteMetadataRecord(spec, message)) + if spec == "dublin_core::thoth" + && message == "Output can only be generated for one work at a time" + )); + } + + #[test] + fn openaire_by_publisher_is_unsupported() { + let result = QueryParameters::try_from(QueryConfiguration::by_publisher( + MetadataSpecification::OpenaireThoth(OpenaireThoth {}), + )); + assert!(matches!( + result, + Err(ThothError::IncompleteMetadataRecord(spec, message)) + if spec == "openaire::thoth" + && message == "Output can only be generated for one work at a time" + )); + } +} diff --git a/thoth-export-server/src/xml/doideposit_crossref.rs b/thoth-export-server/src/xml/doideposit_crossref.rs index 97447e541..c2062e51f 100644 --- a/thoth-export-server/src/xml/doideposit_crossref.rs +++ b/thoth-export-server/src/xml/doideposit_crossref.rs @@ -1626,6 +1626,7 @@ mod tests { fn test_doideposit_crossref_works() { let mut test_work = Work { work_id: 
Uuid::from_str("00000000-0000-0000-AAAA-000000000001").unwrap(), + updated_at_with_relations: thoth_api::model::Timestamp::parse_from_rfc3339("2024-01-01T00:00:00Z").unwrap(), work_status: WorkStatus::ACTIVE, titles: vec![thoth_client::WorkTitles { title_id: Uuid::from_str("00000000-0000-0000-CCCC-000000000001").unwrap(), @@ -2570,6 +2571,7 @@ mod tests { fn test_doideposit_crossref_isbns_workaround() { let mut test_work = Work { work_id: Uuid::from_str("00000000-0000-0000-AAAA-000000000001").unwrap(), + updated_at_with_relations: thoth_api::model::Timestamp::parse_from_rfc3339("2024-01-01T00:00:00Z").unwrap(), work_status: WorkStatus::ACTIVE, titles: vec![thoth_client::WorkTitles { title_id: Uuid::from_str("00000000-0000-0000-CCCC-000000000001").unwrap(), diff --git a/thoth-export-server/src/xml/dublincore_thoth.rs b/thoth-export-server/src/xml/dublincore_thoth.rs new file mode 100644 index 000000000..3c57ba89e --- /dev/null +++ b/thoth-export-server/src/xml/dublincore_thoth.rs @@ -0,0 +1,956 @@ +use super::{write_element_block, XmlElementBlock, XmlSpecification}; +use std::collections::HashSet; +use std::io::Write; + +use thoth_api::markup::{convert_from_jats, ConversionLimit, MarkupFormat}; +use thoth_client::{ + AbstractType, ContributionType, LanguageRelation, PublicationType, SubjectType, Work, + WorkAbstracts, WorkContributions, WorkLanguages, WorkTitles, +}; +use thoth_errors::{ThothError, ThothResult}; +use xml::writer::events::StartElementBuilder; +#[cfg(test)] +use xml::writer::EmitterConfig; +use xml::writer::{EventWriter, XmlEvent}; + +const DUBLIN_CORE_ERROR: &str = "dublin_core::thoth"; +const BY_WORK_ONLY_MESSAGE: &str = "Output can only be generated for one work at a time"; +const DUBLIN_CORE_NS: &[(&str, &str)] = &[ + ("xmlns:oai_dc", "http://www.openarchives.org/OAI/2.0/oai_dc/"), + ("xmlns:dc", "http://purl.org/dc/elements/1.1/"), + ("xmlns:xsi", "http://www.w3.org/2001/XMLSchema-instance"), + ( + "xsi:schemaLocation", + 
"http://www.openarchives.org/OAI/2.0/oai_dc/ http://www.openarchives.org/OAI/2.0/oai_dc.xsd", + ), +]; + +#[derive(Copy, Clone)] +pub(crate) struct DublinCoreThoth; + +impl XmlSpecification for DublinCoreThoth { + fn handle_event(w: &mut EventWriter, works: &[Work]) -> ThothResult<()> { + match works { + [] => Err(ThothError::IncompleteMetadataRecord( + DUBLIN_CORE_ERROR.to_string(), + "Not enough data".to_string(), + )), + [work] => XmlElementBlock::::xml_element(work, w), + _ => Err(ThothError::IncompleteMetadataRecord( + DUBLIN_CORE_ERROR.to_string(), + BY_WORK_ONLY_MESSAGE.to_string(), + )), + } + } +} + +impl XmlElementBlock for Work { + fn xml_element(&self, w: &mut EventWriter) -> ThothResult<()> { + write_dublin_core(self, w) + } +} + +fn push_text_element( + xml: &mut EventWriter, + name: &str, + text: &str, +) -> ThothResult<()> { + write_element_block(name, xml, |xml| { + xml.write(XmlEvent::Characters(text)).map_err(|e| e.into()) + }) +} + +fn push_open_tag( + xml: &mut EventWriter, + name: &str, + attrs: &[(&str, &str)], +) -> ThothResult<()> { + let mut event_builder: StartElementBuilder = XmlEvent::start_element(name); + for &(key, value) in attrs { + event_builder = event_builder.attr(key, value); + } + let event: XmlEvent = event_builder.into(); + xml.write(event).map_err(|e| e.into()) +} + +fn push_close_tag(xml: &mut EventWriter, _name: &str) -> ThothResult<()> { + let event: XmlEvent = XmlEvent::end_element().into(); + xml.write(event).map_err(|e| e.into()) +} + +fn normalize_value(value: &str) -> Option { + let value = value.trim(); + (!value.is_empty()).then(|| value.to_string()) +} + +fn push_unique(values: &mut Vec, seen: &mut HashSet, value: impl Into) { + if let Some(value) = normalize_value(&value.into()) { + if seen.insert(value.clone()) { + values.push(value); + } + } +} + +fn doi_url(doi: &thoth_api::model::Doi) -> String { + format!("https://doi.org/{doi}") +} + +fn work_url(work: &Work) -> String { + 
format!("https://thoth.pub/books/{}", work.work_id) +} + +fn ordered_titles(work: &Work) -> Vec<&WorkTitles> { + let mut titles = work.titles.iter().collect::>(); + titles.sort_by(|left, right| { + right + .canonical + .cmp(&left.canonical) + .then_with(|| { + left.locale_code + .to_string() + .cmp(&right.locale_code.to_string()) + }) + .then_with(|| left.full_title.cmp(&right.full_title)) + }); + titles +} + +fn ordered_abstracts(work: &Work) -> Vec<&WorkAbstracts> { + fn priority(abstract_type: &AbstractType) -> u8 { + match abstract_type { + AbstractType::SHORT => 0, + AbstractType::LONG => 1, + _ => 2, + } + } + + let mut abstracts = work.abstracts.iter().collect::>(); + abstracts.sort_by(|left, right| { + priority(&left.abstract_type) + .cmp(&priority(&right.abstract_type)) + .then_with(|| right.canonical.cmp(&left.canonical)) + .then_with(|| { + left.locale_code + .to_string() + .cmp(&right.locale_code.to_string()) + }) + .then_with(|| left.content.cmp(&right.content)) + }); + abstracts +} + +fn ordered_languages(work: &Work) -> Vec<&WorkLanguages> { + fn priority(language_relation: &LanguageRelation) -> u8 { + match language_relation { + LanguageRelation::ORIGINAL => 0, + LanguageRelation::TRANSLATED_FROM => 1, + LanguageRelation::TRANSLATED_INTO => 2, + _ => 3, + } + } + + let mut languages = work.languages.iter().collect::>(); + languages.sort_by(|left, right| { + priority(&left.language_relation) + .cmp(&priority(&right.language_relation)) + .then_with(|| { + left.language_code + .to_string() + .cmp(&right.language_code.to_string()) + }) + }); + languages +} + +fn convert_abstract_to_text(abstract_record: &WorkAbstracts) -> ThothResult { + convert_from_jats( + &abstract_record.content, + MarkupFormat::PlainText, + ConversionLimit::Abstract, + ) +} + +fn creators(work: &Work) -> impl Iterator { + work.contributions + .iter() + .filter(|contribution| contribution.contribution_type == ContributionType::AUTHOR) +} + +fn contributors(work: &Work) -> impl 
Iterator { + work.contributions + .iter() + .filter(|contribution| contribution.contribution_type != ContributionType::AUTHOR) +} + +fn publication_type_value(publication_type: &PublicationType) -> &'static str { + match publication_type { + PublicationType::HARDBACK => "hardback", + PublicationType::PAPERBACK => "paperback", + PublicationType::PDF => "application/pdf", + PublicationType::EPUB => "application/epub+zip", + PublicationType::XML => "text/xml", + PublicationType::HTML => "text/html", + PublicationType::DOCX => { + "application/vnd.openxmlformats-officedocument.wordprocessingml.document" + } + PublicationType::MP3 => "audio/mpeg", + PublicationType::WAV => "audio/wav", + PublicationType::MOBI => "application/x-mobipocket-ebook", + PublicationType::AZW3 => "application/vnd.amazon.ebook", + PublicationType::FICTION_BOOK => "application/x-fictionbook+xml", + PublicationType::Other(_) => "application/octet-stream", + } +} + +fn dc_type(work: &Work) -> &'static str { + match work.work_type { + thoth_client::WorkType::JOURNAL_ISSUE => "issue", + thoth_client::WorkType::BOOK_CHAPTER => "chapter", + thoth_client::WorkType::Other(_) => "book", + _ => "book", + } +} + +fn normalized_license_name(license: &str) -> &str { + match license.trim_end_matches('/') { + "http://creativecommons.org/publicdomain/zero/1.0" => "CC0 1.0 Universal", + "http://creativecommons.org/licenses/by/4.0" => "CC BY 4.0", + "http://creativecommons.org/licenses/by-sa/4.0" => "CC BY-SA 4.0", + "http://creativecommons.org/licenses/by-nc/4.0" => "CC BY-NC 4.0", + "http://creativecommons.org/licenses/by-nc-sa/4.0" => "CC BY-NC-SA 4.0", + "http://creativecommons.org/licenses/by-nd/4.0" => "CC BY-ND 4.0", + "http://creativecommons.org/licenses/by-nc-nd/4.0" => "CC BY-NC-ND 4.0", + "http://creativecommons.org/licenses/by/3.0" => "CC BY 3.0", + "http://creativecommons.org/licenses/by-sa/3.0" => "CC BY-SA 3.0", + "http://creativecommons.org/licenses/by-nc/3.0" => "CC BY-NC 3.0", + 
"http://creativecommons.org/licenses/by-nc-sa/3.0" => "CC BY-NC-SA 3.0", + "http://creativecommons.org/licenses/by-nd/3.0" => "CC BY-ND 3.0", + "http://creativecommons.org/licenses/by-nc-nd/3.0" => "CC BY-NC-ND 3.0", + _ => license, + } +} + +fn write_dublin_core(work: &Work, xml: &mut EventWriter) -> ThothResult<()> { + push_open_tag(xml, "oai_dc:dc", DUBLIN_CORE_NS)?; + + let mut title_values = Vec::new(); + let mut title_seen = HashSet::new(); + for title in ordered_titles(work) { + push_unique(&mut title_values, &mut title_seen, title.full_title.clone()); + } + for title in title_values { + push_text_element(xml, "dc:title", &title)?; + } + + for creator in creators(work) { + push_text_element(xml, "dc:creator", &creator.full_name)?; + } + + let mut subject_values = Vec::new(); + let mut subject_seen = HashSet::new(); + for subject in &work.subjects { + let value = match subject.subject_type { + SubjectType::KEYWORD | SubjectType::CUSTOM => subject.subject_code.clone(), + SubjectType::THEMA => format!("THEMA:{}", subject.subject_code), + _ => format!("{}:{}", subject.subject_type, subject.subject_code), + }; + push_unique(&mut subject_values, &mut subject_seen, value); + } + for subject in subject_values { + push_text_element(xml, "dc:subject", &subject)?; + } + + let mut description_values = Vec::new(); + let mut description_seen = HashSet::new(); + for abstract_record in ordered_abstracts(work) { + push_unique( + &mut description_values, + &mut description_seen, + convert_abstract_to_text(abstract_record)?, + ); + } + if let Some(toc) = work.toc.as_deref() { + push_unique( + &mut description_values, + &mut description_seen, + toc.to_string(), + ); + } + if let Some(general_note) = work.general_note.as_deref() { + push_unique( + &mut description_values, + &mut description_seen, + general_note.to_string(), + ); + } + if let Some(bibliography_note) = work.bibliography_note.as_deref() { + push_unique( + &mut description_values, + &mut description_seen, + 
bibliography_note.to_string(), + ); + } + if let Some(cover_caption) = work.cover_caption.as_deref() { + push_unique( + &mut description_values, + &mut description_seen, + cover_caption.to_string(), + ); + } + if let Some(page_breakdown) = work.page_breakdown.as_deref() { + push_unique( + &mut description_values, + &mut description_seen, + page_breakdown.to_string(), + ); + } + for description in description_values { + push_text_element(xml, "dc:description", &description)?; + } + + push_text_element(xml, "dc:publisher", &work.imprint.publisher.publisher_name)?; + + for contributor in contributors(work) { + push_text_element(xml, "dc:contributor", &contributor.full_name)?; + } + + if let Some(publication_date) = &work.publication_date { + push_text_element(xml, "dc:date", &publication_date.to_string())?; + } + + push_text_element(xml, "dc:type", dc_type(work))?; + + let mut format_values = Vec::new(); + let mut format_seen = HashSet::new(); + for publication in &work.publications { + push_unique( + &mut format_values, + &mut format_seen, + publication_type_value(&publication.publication_type).to_string(), + ); + } + for format_value in format_values { + push_text_element(xml, "dc:format", &format_value)?; + } + + let mut identifier_values = Vec::new(); + let mut identifier_seen = HashSet::new(); + push_unique(&mut identifier_values, &mut identifier_seen, work_url(work)); + if let Some(doi) = &work.doi { + push_unique(&mut identifier_values, &mut identifier_seen, doi_url(doi)); + } + if let Some(landing_page) = work.landing_page.as_deref() { + push_unique( + &mut identifier_values, + &mut identifier_seen, + landing_page.to_string(), + ); + } + for publication in &work.publications { + if let Some(isbn) = &publication.isbn { + push_unique( + &mut identifier_values, + &mut identifier_seen, + format!("urn:isbn:{isbn}"), + ); + } + } + if let Some(lccn) = work.lccn.as_deref() { + push_unique( + &mut identifier_values, + &mut identifier_seen, + 
format!("urn:lccn:{lccn}"), + ); + } + if let Some(oclc) = work.oclc.as_deref() { + push_unique( + &mut identifier_values, + &mut identifier_seen, + format!("urn:oclc:{oclc}"), + ); + } + for identifier in identifier_values { + push_text_element(xml, "dc:identifier", &identifier)?; + } + + let mut language_values = Vec::new(); + let mut language_seen = HashSet::new(); + for language in ordered_languages(work) { + push_unique( + &mut language_values, + &mut language_seen, + language.language_code.to_string().to_lowercase(), + ); + } + for language in language_values { + push_text_element(xml, "dc:language", &language)?; + } + + let mut relation_values = Vec::new(); + let mut relation_seen = HashSet::new(); + for relation in &work.relations { + if let Some(doi) = &relation.related_work.doi { + push_unique(&mut relation_values, &mut relation_seen, doi_url(doi)); + } + if let Some(landing_page) = relation.related_work.landing_page.as_deref() { + push_unique( + &mut relation_values, + &mut relation_seen, + landing_page.to_string(), + ); + } + for publication in &relation.related_work.publications { + if let Some(isbn) = &publication.isbn { + push_unique( + &mut relation_values, + &mut relation_seen, + format!("urn:isbn:{isbn}"), + ); + } + } + } + for relation in relation_values { + push_text_element(xml, "dc:relation", &relation)?; + } + + let mut rights_values = Vec::new(); + let mut rights_seen = HashSet::new(); + if let Some(license) = work.license.as_deref() { + push_unique(&mut rights_values, &mut rights_seen, license.to_string()); + push_unique( + &mut rights_values, + &mut rights_seen, + normalized_license_name(license).to_string(), + ); + } + if let Some(copyright_holder) = work.copyright_holder.as_deref() { + push_unique( + &mut rights_values, + &mut rights_seen, + format!("Copyright holder: {copyright_holder}"), + ); + } + for rights in rights_values { + push_text_element(xml, "dc:rights", &rights)?; + } + + push_close_tag(xml, "oai_dc:dc") +} + +#[cfg(test)] 
+fn map_dublin_core(work: &Work) -> ThothResult { + let mut buffer = Vec::new(); + let mut writer = EmitterConfig::new() + .perform_indent(true) + .create_writer(&mut buffer); + XmlElementBlock::::xml_element(work, &mut writer) + .map(|_| buffer) + .and_then(|xml| { + String::from_utf8(xml) + .map_err(|_| ThothError::InternalError("Could not parse XML".to_string())) + }) +} + +#[cfg(test)] +pub(crate) mod test_support { + use super::*; + use serde_json::json; + use std::{ + fs, + path::PathBuf, + process::Command, + time::{SystemTime, UNIX_EPOCH}, + }; + + pub(crate) fn fixture_work() -> Work { + let value = json!({ + "workId": "00000000-0000-0000-0000-000000000111", + "updatedAtWithRelations": "2024-12-31T12:00:00Z", + "workStatus": "ACTIVE", + "workType": "MONOGRAPH", + "reference": "BK-111", + "edition": 2, + "doi": "https://doi.org/10.12345/example.111", + "publicationDate": "2024-02-15", + "withdrawnDate": null, + "license": "http://creativecommons.org/licenses/by/4.0/", + "copyrightHolder": "Example Author", + "generalNote": "General availability note.", + "bibliographyNote": "Includes bibliographical references.", + "place": "London", + "pageCount": 210, + "pageBreakdown": "xii + 198 pages", + "firstPage": "1", + "lastPage": "198", + "pageInterval": null, + "imageCount": 12, + "tableCount": 3, + "audioCount": null, + "videoCount": null, + "landingPage": "https://example.org/books/111", + "toc": "Part I; Part II", + "lccn": "2023123456", + "oclc": "123456789", + "coverUrl": "https://example.org/cover.png", + "coverCaption": "Front cover image.", + "titles": [ + { + "titleId": "00000000-0000-0000-0000-000000000201", + "localeCode": "EN", + "fullTitle": "Canonical Title: A Story", + "title": "Canonical Title", + "subtitle": "A Story", + "canonical": true + }, + { + "titleId": "00000000-0000-0000-0000-000000000202", + "localeCode": "DE", + "fullTitle": "Alternativer Titel", + "title": "Alternativer Titel", + "subtitle": null, + "canonical": false + } + ], + 
"abstracts": [ + { + "abstractId": "00000000-0000-0000-0000-000000000301", + "workId": "00000000-0000-0000-0000-000000000111", + "content": "

Short abstract text.

", + "localeCode": "EN", + "abstractType": "SHORT", + "canonical": true + }, + { + "abstractId": "00000000-0000-0000-0000-000000000302", + "workId": "00000000-0000-0000-0000-000000000111", + "content": "

Long abstract text.

", + "localeCode": "EN", + "abstractType": "LONG", + "canonical": true + }, + { + "abstractId": "00000000-0000-0000-0000-000000000303", + "workId": "00000000-0000-0000-0000-000000000111", + "content": "

Ausführliche Zusammenfassung.

", + "localeCode": "DE", + "abstractType": "LONG", + "canonical": false + } + ], + "imprint": { + "imprintName": "Example Imprint", + "imprintUrl": null, + "crossmarkDoi": null, + "defaultCurrency": "EUR", + "defaultPlace": "London", + "defaultLocale": "EN", + "publisher": { + "publisherName": "Open Access Press", + "publisherShortname": "OAP", + "publisherUrl": "https://example.org/publisher", + "accessibilityStatement": null, + "contacts": [] + } + }, + "issues": [ + { + "issueOrdinal": 7, + "issueNumber": 7, + "series": { + "seriesId": "00000000-0000-0000-0000-000000000401", + "seriesType": "JOURNAL", + "seriesName": "Open Access Series", + "issnPrint": null, + "issnDigital": null, + "seriesUrl": null, + "seriesDescription": null, + "seriesCfpUrl": null + } + } + ], + "contributions": [ + { + "contributionType": "AUTHOR", + "firstName": "Ada", + "lastName": "Lovelace", + "fullName": "Ada Lovelace", + "mainContribution": true, + "biographies": [], + "contributionOrdinal": 1, + "contributor": { + "orcid": "https://orcid.org/0000-0002-0000-0001", + "website": null + }, + "affiliations": [ + { + "position": "Researcher", + "affiliationOrdinal": 1, + "institution": { + "institutionName": "Example University", + "institutionDoi": null, + "ror": "https://ror.org/02vxh6m30", + "countryCode": "GB" + } + } + ] + }, + { + "contributionType": "EDITOR", + "firstName": "Grace", + "lastName": "Hopper", + "fullName": "Grace Hopper", + "mainContribution": false, + "biographies": [], + "contributionOrdinal": 2, + "contributor": { + "orcid": null, + "website": null + }, + "affiliations": [] + } + ], + "languages": [ + { + "languageCode": "ENG", + "languageRelation": "ORIGINAL" + }, + { + "languageCode": "DEU", + "languageRelation": "TRANSLATED_INTO" + } + ], + "publications": [ + { + "publicationId": "00000000-0000-0000-0000-000000000501", + "publicationType": "PDF", + "isbn": "978-1-4028-9462-6", + "weightG": null, + "weightOz": null, + "widthMm": null, + "widthCm": null, + 
"widthIn": null, + "heightMm": null, + "heightCm": null, + "heightIn": null, + "depthMm": null, + "depthCm": null, + "depthIn": null, + "accessibilityStandard": null, + "accessibilityAdditionalStandard": null, + "accessibilityException": null, + "accessibilityReportUrl": null, + "prices": [], + "locations": [ + { + "landingPage": "https://example.org/books/111", + "fullTextUrl": "https://example.org/books/111.pdf", + "locationPlatform": "OTHER", + "canonical": true + } + ] + }, + { + "publicationId": "00000000-0000-0000-0000-000000000502", + "publicationType": "XML", + "isbn": "978-92-95055-02-5", + "weightG": null, + "weightOz": null, + "widthMm": null, + "widthCm": null, + "widthIn": null, + "heightMm": null, + "heightCm": null, + "heightIn": null, + "depthMm": null, + "depthCm": null, + "depthIn": null, + "accessibilityStandard": null, + "accessibilityAdditionalStandard": null, + "accessibilityException": null, + "accessibilityReportUrl": null, + "prices": [], + "locations": [] + } + ], + "subjects": [ + { + "subjectCode": "Open Access", + "subjectType": "KEYWORD", + "subjectOrdinal": 1 + }, + { + "subjectCode": "Scholarly Publishing", + "subjectType": "CUSTOM", + "subjectOrdinal": 2 + }, + { + "subjectCode": "LAN025000", + "subjectType": "BISAC", + "subjectOrdinal": 3 + }, + { + "subjectCode": "QRM", + "subjectType": "THEMA", + "subjectOrdinal": 4 + } + ], + "fundings": [ + { + "program": "Open Science", + "projectName": "Metadata Futures", + "projectShortname": "META-FUT", + "grantNumber": "GA-2024-0001", + "institution": { + "institutionName": "Research Council", + "institutionDoi": null, + "ror": "https://ror.org/03yrm5c26", + "countryCode": "GB" + } + } + ], + "relations": [ + { + "relationType": "HAS_PART", + "relationOrdinal": 1, + "relatedWork": { + "edition": 1, + "doi": "https://doi.org/10.12345/example.related", + "publicationDate": "2023-11-01", + "withdrawnDate": null, + "workStatus": "ACTIVE", + "license": 
"http://creativecommons.org/licenses/by/4.0/", + "copyrightHolder": "Related Author", + "generalNote": null, + "place": "Leiden", + "firstPage": null, + "lastPage": null, + "pageCount": 120, + "pageInterval": null, + "landingPage": "https://example.org/books/related", + "titles": [ + { + "titleId": "00000000-0000-0000-0000-000000000601", + "localeCode": "EN", + "fullTitle": "Related Work", + "title": "Related Work", + "subtitle": null, + "canonical": true + } + ], + "abstracts": [], + "imprint": { + "crossmarkDoi": null, + "publisher": { + "publisherName": "Open Access Press" + } + }, + "contributions": [], + "languages": [ + { + "languageCode": "ENG", + "languageRelation": "ORIGINAL" + } + ], + "publications": [ + { + "publicationType": "PDF", + "isbn": "978-1-4028-9462-7", + "locations": [] + } + ], + "fundings": [], + "references": [] + } + } + ], + "references": [ + { + "referenceOrdinal": 1, + "doi": null, + "unstructuredCitation": "Doe, J. (2020). The Open Knowledge Handbook.", + "issn": null, + "isbn": null, + "journalTitle": null, + "articleTitle": null, + "seriesTitle": null, + "volumeTitle": null, + "edition": null, + "author": null, + "volume": null, + "issue": null, + "firstPage": null, + "componentNumber": null, + "standardDesignator": null, + "standardsBodyName": null, + "standardsBodyAcronym": null, + "url": null, + "publicationDate": null, + "retrievalDate": null + }, + { + "referenceOrdinal": 2, + "doi": "https://doi.org/10.55555/structured.2", + "unstructuredCitation": null, + "issn": "1234-5678", + "isbn": "978-0-12-345678-9", + "journalTitle": "Metadata Quarterly", + "articleTitle": "Structured Citation Patterns", + "seriesTitle": null, + "volumeTitle": null, + "edition": 1, + "author": "Smith, Jane", + "volume": "12", + "issue": "3", + "firstPage": "45", + "componentNumber": "2", + "standardDesignator": null, + "standardsBodyName": null, + "standardsBodyAcronym": null, + "url": "https://example.org/citation/2", + "publicationDate": 
"2022-06-01", + "retrievalDate": null + } + ] + }); + serde_json::from_value(value).expect("valid Work fixture") + } + + fn write_temp_file(prefix: &str, extension: &str, content: &str) -> PathBuf { + let timestamp = SystemTime::now() + .duration_since(UNIX_EPOCH) + .expect("system time") + .as_nanos(); + let path = std::env::temp_dir().join(format!("{prefix}-{timestamp}.{extension}")); + fs::write(&path, content).expect("write temp file"); + path + } + + fn xmllint_available() -> bool { + Command::new("xmllint") + .arg("--version") + .output() + .is_ok_and(|output| output.status.success()) + } + + pub(crate) fn assert_valid_against_schema(xml: &str, schema_file_name: &str) { + if !xmllint_available() { + return; + } + let local_schema = PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("tests") + .join("fixtures") + .join("xsd") + .join(schema_file_name); + let schema = if local_schema.exists() { + local_schema + } else { + PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("../thoth-oai-server/tests/fixtures/xsd") + .join(schema_file_name) + }; + let xml_path = write_temp_file("oai-metadata", "xml", xml); + let output = Command::new("xmllint") + .arg("--noout") + .arg("--schema") + .arg(&schema) + .arg(&xml_path) + .output() + .expect("run xmllint"); + let _ = fs::remove_file(&xml_path); + assert!( + output.status.success(), + "schema validation failed: {}", + String::from_utf8_lossy(&output.stderr) + ); + } +} + +#[cfg(test)] +mod tests { + use super::test_support::{assert_valid_against_schema, fixture_work}; + use super::*; + use crate::record::XML_DECLARATION; + + fn assert_precedes(xml: &str, first: &str, second: &str) { + let first_pos = xml + .find(first) + .unwrap_or_else(|| panic!("Could not find `{first}` in XML output")); + let second_pos = xml + .find(second) + .unwrap_or_else(|| panic!("Could not find `{second}` in XML output")); + assert!( + first_pos < second_pos, + "Expected `{first}` to appear before `{second}`" + ); + } + + #[test] + fn 
xml_publication_type_maps_to_text_xml() { + assert_eq!(publication_type_value(&PublicationType::XML), "text/xml"); + } + + #[test] + fn dublin_core_mapping_is_exhaustive_for_titles_languages_descriptions_and_rights() { + let work = fixture_work(); + let xml = map_dublin_core(&work).expect("map dublin_core"); + + assert!(xml.contains("Canonical Title: A Story")); + assert!(xml.contains("Alternativer Titel")); + assert!(xml.contains("eng")); + assert_eq!(xml.matches("").count(), 2); + assert!(xml.contains("Short abstract text.")); + assert!(xml.contains("Long abstract text.")); + assert!(xml.contains("Part I; Part II")); + assert!(xml.contains("General availability note.")); + assert!( + xml.contains("Includes bibliographical references.") + ); + assert!(xml.contains("Front cover image.")); + assert!(xml.contains("xii + 198 pages")); + assert!(xml.contains("Open Access")); + assert!(xml.contains("BISAC:LAN025000")); + assert!(xml.contains("urn:lccn:2023123456")); + assert!(xml.contains("urn:oclc:123456789")); + assert!(xml.contains("CC BY 4.0")); + assert!(xml.contains("Copyright holder: Example Author")); + assert!(!xml.contains("")); + assert_precedes( + &xml, + "Canonical Title: A Story", + "Alternativer Titel", + ); + assert_precedes( + &xml, + "book", + "application/pdf", + ); + assert_precedes( + &xml, + "http://creativecommons.org/licenses/by/4.0/", + "CC BY 4.0", + ); + + assert_valid_against_schema(&xml, "oai_dc.xsd"); + } + + #[test] + fn generator_returns_single_work_xml_with_declaration() { + let xml = DublinCoreThoth {} + .generate(&[fixture_work()], None) + .expect("single dublin core"); + assert!(xml.starts_with(XML_DECLARATION)); + assert!(!xml.starts_with(&format!("{XML_DECLARATION}\n"))); + assert!(xml.contains("(w: &mut EventWriter, works: &[Work]) -> ThothResult<()> { + match works { + [] => Err(ThothError::IncompleteMetadataRecord( + OPENAIRE_ERROR.to_string(), + "Not enough data".to_string(), + )), + [work] => 
XmlElementBlock::::xml_element(work, w), + _ => Err(ThothError::IncompleteMetadataRecord( + OPENAIRE_ERROR.to_string(), + BY_WORK_ONLY_MESSAGE.to_string(), + )), + } + } +} + +impl XmlElementBlock for Work { + fn xml_element(&self, w: &mut EventWriter) -> ThothResult<()> { + write_openaire(self, w) + } +} + +fn push_text_element( + xml: &mut EventWriter, + name: &str, + text: &str, +) -> ThothResult<()> { + write_element_block(name, xml, |xml| { + xml.write(XmlEvent::Characters(text)).map_err(|e| e.into()) + }) +} + +fn push_text_element_attrs( + xml: &mut EventWriter, + name: &str, + attrs: &[(&str, String)], + text: &str, +) -> ThothResult<()> { + let mut event_builder: StartElementBuilder = XmlEvent::start_element(name); + for (key, value) in attrs { + event_builder = event_builder.attr(*key, value.as_str()); + } + let event: XmlEvent = event_builder.into(); + xml.write(event)?; + xml.write(XmlEvent::Characters(text))?; + xml.write(XmlEvent::end_element()).map_err(|e| e.into()) +} + +fn push_open_tag( + xml: &mut EventWriter, + name: &str, + attrs: &[(&str, String)], +) -> ThothResult<()> { + let mut event_builder: StartElementBuilder = XmlEvent::start_element(name); + for (key, value) in attrs { + event_builder = event_builder.attr(*key, value.as_str()); + } + let event: XmlEvent = event_builder.into(); + xml.write(event).map_err(|e| e.into()) +} + +fn push_close_tag(xml: &mut EventWriter, _name: &str) -> ThothResult<()> { + let event: XmlEvent = XmlEvent::end_element().into(); + xml.write(event).map_err(|e| e.into()) +} + +fn normalize_value(value: &str) -> Option { + let value = value.trim(); + (!value.is_empty()).then(|| value.to_string()) +} + +fn push_unique(values: &mut Vec, seen: &mut HashSet, value: impl Into) { + if let Some(value) = normalize_value(&value.into()) { + if seen.insert(value.clone()) { + values.push(value); + } + } +} + +fn doi_url(doi: &thoth_api::model::Doi) -> String { + format!("https://doi.org/{doi}") +} + +fn orcid_url(orcid: 
&thoth_api::model::Orcid) -> String { + format!("https://orcid.org/{orcid}") +} + +fn ror_url(ror: &thoth_api::model::Ror) -> String { + format!("https://ror.org/{ror}") +} + +fn work_url(work: &Work) -> String { + format!("https://thoth.pub/books/{}", work.work_id) +} + +fn oai_identifier(work_id: Uuid) -> String { + format!("{OAI_IDENTIFIER_PREFIX}:{work_id}") +} + +fn canonical_title(work: &Work) -> Option<&WorkTitles> { + work.titles + .iter() + .find(|title| title.canonical) + .or_else(|| work.titles.first()) +} + +fn ordered_titles(work: &Work) -> Vec<&WorkTitles> { + let mut titles = work.titles.iter().collect::>(); + titles.sort_by(|left, right| { + right + .canonical + .cmp(&left.canonical) + .then_with(|| { + left.locale_code + .to_string() + .cmp(&right.locale_code.to_string()) + }) + .then_with(|| left.full_title.cmp(&right.full_title)) + }); + titles +} + +fn ordered_abstracts(work: &Work) -> Vec<&WorkAbstracts> { + fn priority(abstract_type: &AbstractType) -> u8 { + match abstract_type { + AbstractType::SHORT => 0, + AbstractType::LONG => 1, + _ => 2, + } + } + + let mut abstracts = work.abstracts.iter().collect::>(); + abstracts.sort_by(|left, right| { + priority(&left.abstract_type) + .cmp(&priority(&right.abstract_type)) + .then_with(|| right.canonical.cmp(&left.canonical)) + .then_with(|| { + left.locale_code + .to_string() + .cmp(&right.locale_code.to_string()) + }) + .then_with(|| left.content.cmp(&right.content)) + }); + abstracts +} + +fn ordered_languages(work: &Work) -> Vec<&WorkLanguages> { + fn priority(language_relation: &LanguageRelation) -> u8 { + match language_relation { + LanguageRelation::ORIGINAL => 0, + LanguageRelation::TRANSLATED_FROM => 1, + LanguageRelation::TRANSLATED_INTO => 2, + _ => 3, + } + } + + let mut languages = work.languages.iter().collect::>(); + languages.sort_by(|left, right| { + priority(&left.language_relation) + .cmp(&priority(&right.language_relation)) + .then_with(|| { + left.language_code + .to_string() + 
.cmp(&right.language_code.to_string()) + }) + }); + languages +} + +fn convert_abstract_to_text(abstract_record: &WorkAbstracts) -> ThothResult { + convert_from_jats( + &abstract_record.content, + MarkupFormat::PlainText, + ConversionLimit::Abstract, + ) +} + +fn creators(work: &Work) -> impl Iterator { + work.contributions + .iter() + .filter(|contribution| contribution.contribution_type == ContributionType::AUTHOR) +} + +fn contributors(work: &Work) -> impl Iterator { + work.contributions + .iter() + .filter(|contribution| contribution.contribution_type != ContributionType::AUTHOR) +} + +fn personal_name(contribution: &WorkContributions) -> String { + match contribution.first_name.as_deref() { + Some(first_name) if !first_name.is_empty() && !contribution.last_name.is_empty() => { + format!("{}, {}", contribution.last_name, first_name) + } + _ if !contribution.full_name.is_empty() => contribution.full_name.clone(), + _ => contribution.last_name.clone(), + } +} + +fn publication_type_value(publication_type: &PublicationType) -> &'static str { + match publication_type { + PublicationType::HARDBACK => "hardback", + PublicationType::PAPERBACK => "paperback", + PublicationType::PDF => "application/pdf", + PublicationType::EPUB => "application/epub+zip", + PublicationType::XML => "text/xml", + PublicationType::HTML => "text/html", + PublicationType::DOCX => { + "application/vnd.openxmlformats-officedocument.wordprocessingml.document" + } + PublicationType::MP3 => "audio/mpeg", + PublicationType::WAV => "audio/wav", + PublicationType::MOBI => "application/x-mobipocket-ebook", + PublicationType::AZW3 => "application/vnd.amazon.ebook", + PublicationType::FICTION_BOOK => "application/x-fictionbook+xml", + PublicationType::Other(_) => "application/octet-stream", + } +} + +fn openaire_resource_type(work: &Work) -> Option<(&'static str, &'static str)> { + match work.work_type { + thoth_client::WorkType::JOURNAL_ISSUE => { + Some(("http://purl.org/coar/resource_type/c_0640", 
"journal")) + } + thoth_client::WorkType::BOOK_CHAPTER => { + Some(("http://purl.org/coar/resource_type/c_3248", "book part")) + } + thoth_client::WorkType::MONOGRAPH + | thoth_client::WorkType::TEXTBOOK + | thoth_client::WorkType::EDITED_BOOK + | thoth_client::WorkType::BOOK_SET => { + Some(("http://purl.org/coar/resource_type/c_2f33", "book")) + } + thoth_client::WorkType::Other(_) => None, + } +} + +fn normalized_license_name(license: &str) -> &str { + match license.trim_end_matches('/') { + "http://creativecommons.org/publicdomain/zero/1.0" => "CC0 1.0 Universal", + "http://creativecommons.org/licenses/by/4.0" => "CC BY 4.0", + "http://creativecommons.org/licenses/by-sa/4.0" => "CC BY-SA 4.0", + "http://creativecommons.org/licenses/by-nc/4.0" => "CC BY-NC 4.0", + "http://creativecommons.org/licenses/by-nc-sa/4.0" => "CC BY-NC-SA 4.0", + "http://creativecommons.org/licenses/by-nd/4.0" => "CC BY-ND 4.0", + "http://creativecommons.org/licenses/by-nc-nd/4.0" => "CC BY-NC-ND 4.0", + "http://creativecommons.org/licenses/by/3.0" => "CC BY 3.0", + "http://creativecommons.org/licenses/by-sa/3.0" => "CC BY-SA 3.0", + "http://creativecommons.org/licenses/by-nc/3.0" => "CC BY-NC 3.0", + "http://creativecommons.org/licenses/by-nc-sa/3.0" => "CC BY-NC-SA 3.0", + "http://creativecommons.org/licenses/by-nd/3.0" => "CC BY-ND 3.0", + "http://creativecommons.org/licenses/by-nc-nd/3.0" => "CC BY-NC-ND 3.0", + _ => license, + } +} + +fn parent_work(work: &Work) -> Option<&thoth_client::WorkRelationsRelatedWork> { + work.relations + .iter() + .find(|relation| relation.relation_type == RelationType::IS_CHILD_OF) + .map(|relation| &relation.related_work) +} + +fn timestamp_rfc3339(timestamp: thoth_api::model::Timestamp) -> String { + timestamp.to_rfc3339().replace("+00:00", "Z") +} + +fn reference_citation(reference: &thoth_client::WorkReferences) -> Option { + if let Some(unstructured) = reference.unstructured_citation.as_deref() { + return normalize_value(unstructured); + } + + let 
mut parts = Vec::new(); + if let Some(author) = reference.author.as_deref().and_then(normalize_value) { + parts.push(author); + } + if let Some(article_title) = reference.article_title.as_deref().and_then(normalize_value) { + parts.push(article_title); + } + if let Some(journal_title) = reference.journal_title.as_deref().and_then(normalize_value) { + parts.push(journal_title); + } + if let Some(series_title) = reference.series_title.as_deref().and_then(normalize_value) { + parts.push(series_title); + } + if let Some(volume_title) = reference.volume_title.as_deref().and_then(normalize_value) { + parts.push(volume_title); + } + if let Some(volume) = reference.volume.as_deref().and_then(normalize_value) { + parts.push(format!("vol. {volume}")); + } + if let Some(issue) = reference.issue.as_deref().and_then(normalize_value) { + parts.push(format!("issue {issue}")); + } + if let Some(first_page) = reference.first_page.as_deref().and_then(normalize_value) { + parts.push(format!("p. {first_page}")); + } + if let Some(component_number) = reference + .component_number + .as_deref() + .and_then(normalize_value) + { + parts.push(format!("component {component_number}")); + } + if let Some(edition) = reference.edition { + parts.push(format!("{edition} ed.")); + } + if let Some(publication_date) = reference.publication_date { + parts.push(publication_date.to_string()); + } + if let Some(doi) = &reference.doi { + parts.push(doi_url(doi)); + } + if let Some(isbn) = &reference.isbn { + parts.push(format!("ISBN {isbn}")); + } + if let Some(issn) = reference.issn.as_deref().and_then(normalize_value) { + parts.push(format!("ISSN {issn}")); + } + if let Some(standard_designator) = reference + .standard_designator + .as_deref() + .and_then(normalize_value) + { + parts.push(format!("std. 
{standard_designator}")); + } + if let Some(standards_body_name) = reference + .standards_body_name + .as_deref() + .and_then(normalize_value) + { + parts.push(standards_body_name); + } + if let Some(standards_body_acronym) = reference + .standards_body_acronym + .as_deref() + .and_then(normalize_value) + { + parts.push(standards_body_acronym); + } + if parts.is_empty() { + None + } else { + Some(parts.join(". ")) + } +} + +fn write_openaire(work: &Work, xml: &mut EventWriter) -> ThothResult<()> { + let root_attrs = OPENAIRE_NS + .iter() + .map(|(key, value)| (*key, (*value).to_string())) + .collect::>(); + push_open_tag(xml, "oaire:resource", &root_attrs)?; + + push_text_element_attrs( + xml, + "datacite:identifier", + &[("identifierType", "URL".to_string())], + &work_url(work), + )?; + + let mut title_entries = Vec::new(); + let mut title_seen = HashSet::new(); + for (index, title) in ordered_titles(work).iter().enumerate() { + if index == 0 { + let canonical_title = normalize_value(&title.title) + .or_else(|| normalize_value(&title.full_title)) + .unwrap_or_else(|| title.full_title.clone()); + if title_seen.insert(format!("canonical:{canonical_title}")) { + title_entries.push((Vec::new(), canonical_title)); + } + if let Some(subtitle) = title.subtitle.as_deref().and_then(normalize_value) { + if title_seen.insert(format!("subtitle:{subtitle}")) { + title_entries.push(( + vec![("titleType", "Subtitle".to_string())], + subtitle.to_string(), + )); + } + } + } else if let Some(full_title) = normalize_value(&title.full_title) { + if title_seen.insert(format!("alternative:{full_title}")) { + title_entries.push(( + vec![("titleType", "AlternativeTitle".to_string())], + full_title, + )); + } + } + } + if !title_entries.is_empty() { + push_open_tag(xml, "datacite:titles", &[])?; + for (attrs, value) in title_entries { + if attrs.is_empty() { + push_text_element(xml, "datacite:title", &value)?; + } else { + let attrs = attrs + .iter() + .map(|(key, value)| (*key, 
value.clone())) + .collect::>(); + push_text_element_attrs(xml, "datacite:title", &attrs, &value)?; + } + } + push_close_tag(xml, "datacite:titles")?; + } + + let creators = creators(work).collect::>(); + if !creators.is_empty() { + push_open_tag(xml, "datacite:creators", &[])?; + for creator in creators { + push_open_tag(xml, "datacite:creator", &[])?; + push_text_element_attrs( + xml, + "datacite:creatorName", + &[("nameType", "Personal".to_string())], + &personal_name(creator), + )?; + if let Some(first_name) = creator.first_name.as_deref() { + if !first_name.is_empty() { + push_text_element(xml, "datacite:givenName", first_name)?; + } + } + if !creator.last_name.is_empty() { + push_text_element(xml, "datacite:familyName", &creator.last_name)?; + } + if let Some(orcid) = &creator.contributor.orcid { + push_text_element_attrs( + xml, + "datacite:nameIdentifier", + &[ + ("nameIdentifierScheme", "ORCID".to_string()), + ("schemeURI", "https://orcid.org/".to_string()), + ], + &orcid_url(orcid), + )?; + } + for affiliation in &creator.affiliations { + if let Some(ror) = &affiliation.institution.ror { + push_text_element_attrs( + xml, + "datacite:affiliation", + &[("affiliationIdentifier", ror_url(ror))], + &affiliation.institution.institution_name, + )?; + } else { + push_text_element( + xml, + "datacite:affiliation", + &affiliation.institution.institution_name, + )?; + } + } + push_close_tag(xml, "datacite:creator")?; + } + push_close_tag(xml, "datacite:creators")?; + } + + let contributors = contributors(work).collect::>(); + if !contributors.is_empty() { + push_open_tag(xml, "datacite:contributors", &[])?; + for contributor in contributors { + let contributor_type = if contributor.contribution_type == ContributionType::EDITOR { + "Editor" + } else { + "Other" + }; + push_open_tag( + xml, + "datacite:contributor", + &[("contributorType", contributor_type.to_string())], + )?; + push_text_element_attrs( + xml, + "datacite:contributorName", + &[("nameType", 
"Personal".to_string())], + &personal_name(contributor), + )?; + if let Some(first_name) = contributor.first_name.as_deref() { + if !first_name.is_empty() { + push_text_element(xml, "datacite:givenName", first_name)?; + } + } + if !contributor.last_name.is_empty() { + push_text_element(xml, "datacite:familyName", &contributor.last_name)?; + } + if let Some(orcid) = &contributor.contributor.orcid { + push_text_element_attrs( + xml, + "datacite:nameIdentifier", + &[ + ("nameIdentifierScheme", "ORCID".to_string()), + ("schemeURI", "https://orcid.org/".to_string()), + ], + &orcid_url(orcid), + )?; + } + for affiliation in &contributor.affiliations { + if let Some(ror) = &affiliation.institution.ror { + push_text_element_attrs( + xml, + "datacite:affiliation", + &[("affiliationIdentifier", ror_url(ror))], + &affiliation.institution.institution_name, + )?; + } else { + push_text_element( + xml, + "datacite:affiliation", + &affiliation.institution.institution_name, + )?; + } + } + push_close_tag(xml, "datacite:contributor")?; + } + push_close_tag(xml, "datacite:contributors")?; + } + + if !work.fundings.is_empty() { + push_open_tag(xml, "oaire:fundingReferences", &[])?; + for funding in &work.fundings { + push_open_tag(xml, "oaire:fundingReference", &[])?; + push_text_element( + xml, + "oaire:funderName", + &funding.institution.institution_name, + )?; + if let Some(ror) = &funding.institution.ror { + push_text_element_attrs( + xml, + "oaire:funderIdentifier", + &[("funderIdentifierType", "ROR".to_string())], + &ror_url(ror), + )?; + } + if let Some(grant_number) = &funding.grant_number { + push_text_element(xml, "oaire:awardNumber", grant_number)?; + } + if let Some(project_name) = &funding.project_name { + push_text_element(xml, "oaire:awardTitle", project_name)?; + } + push_close_tag(xml, "oaire:fundingReference")?; + } + push_close_tag(xml, "oaire:fundingReferences")?; + } + + let mut alternate_identifiers = Vec::new(); + let mut alternate_identifier_seen = 
HashSet::new(); + let mut push_alternate_identifier = |identifier_type: &str, value: String| { + if let Some(value) = normalize_value(&value) { + let key = (identifier_type.to_string(), value.clone()); + if alternate_identifier_seen.insert(key.clone()) { + alternate_identifiers.push(key); + } + } + }; + if let Some(doi) = &work.doi { + push_alternate_identifier("DOI", doi_url(doi)); + } + if let Some(landing_page) = work.landing_page.as_deref() { + push_alternate_identifier("URL", landing_page.to_string()); + } + push_alternate_identifier("OAI", oai_identifier(work.work_id)); + if let Some(lccn) = work.lccn.as_deref() { + push_alternate_identifier("LCCN", lccn.to_string()); + } + if let Some(oclc) = work.oclc.as_deref() { + push_alternate_identifier("OCLC", oclc.to_string()); + } + for publication in &work.publications { + if let Some(isbn) = &publication.isbn { + push_alternate_identifier("ISBN", isbn.to_string()); + } + } + if !alternate_identifiers.is_empty() { + push_open_tag(xml, "datacite:alternateIdentifiers", &[])?; + for (identifier_type, value) in alternate_identifiers { + push_text_element_attrs( + xml, + "datacite:alternateIdentifier", + &[("alternateIdentifierType", identifier_type)], + &value, + )?; + } + push_close_tag(xml, "datacite:alternateIdentifiers")?; + } + + let mut related_identifiers = Vec::new(); + let mut related_identifier_seen = HashSet::new(); + for relation in &work.relations { + let relation_type = if matches!( + relation.relation_type, + RelationType::HAS_CHILD | RelationType::HAS_PART + ) { + "HasPart" + } else { + "IsPartOf" + }; + + if let Some(doi) = &relation.related_work.doi { + let value = doi_url(doi); + if let Some(value) = normalize_value(&value) { + let key = ("DOI".to_string(), relation_type.to_string(), value.clone()); + if related_identifier_seen.insert(key.clone()) { + related_identifiers.push(key); + } + } + } + if let Some(landing_page) = relation.related_work.landing_page.as_deref() { + if let Some(value) = 
normalize_value(landing_page) { + let key = ("URL".to_string(), relation_type.to_string(), value.clone()); + if related_identifier_seen.insert(key.clone()) { + related_identifiers.push(key); + } + } + } + for publication in &relation.related_work.publications { + if let Some(isbn) = &publication.isbn { + let value = isbn.to_string(); + if let Some(value) = normalize_value(&value) { + let key = ("ISBN".to_string(), relation_type.to_string(), value.clone()); + if related_identifier_seen.insert(key.clone()) { + related_identifiers.push(key); + } + } + } + } + } + if !related_identifiers.is_empty() { + push_open_tag(xml, "datacite:relatedIdentifiers", &[])?; + for (identifier_type, relation_type, value) in related_identifiers { + push_text_element_attrs( + xml, + "datacite:relatedIdentifier", + &[ + ("relatedIdentifierType", identifier_type), + ("relationType", relation_type), + ], + &value, + )?; + } + push_close_tag(xml, "datacite:relatedIdentifiers")?; + } + + let mut language_values = Vec::new(); + let mut language_seen = HashSet::new(); + for language in ordered_languages(work) { + push_unique( + &mut language_values, + &mut language_seen, + language.language_code.to_string().to_lowercase(), + ); + } + for language in language_values { + push_text_element(xml, "dc:language", &language)?; + } + + push_text_element(xml, "dc:publisher", &work.imprint.publisher.publisher_name)?; + + if let Some(publication_date) = &work.publication_date { + push_text_element_attrs( + xml, + "datacite:date", + &[("dateType", "Issued".to_string())], + &publication_date.to_string(), + )?; + } + push_text_element( + xml, + "dcterms:modified", + ×tamp_rfc3339(work.updated_at_with_relations), + )?; + + if let Some((uri, value)) = openaire_resource_type(work) { + push_text_element_attrs( + xml, + "oaire:resourceType", + &[ + ("resourceTypeGeneral", "literature".to_string()), + ("uri", uri.to_string()), + ], + value, + )?; + } + + let mut description_values = Vec::new(); + let mut 
description_seen = HashSet::new(); + for abstract_record in ordered_abstracts(work) { + push_unique( + &mut description_values, + &mut description_seen, + convert_abstract_to_text(abstract_record)?, + ); + } + if let Some(toc) = work.toc.as_deref() { + push_unique( + &mut description_values, + &mut description_seen, + toc.to_string(), + ); + } + if let Some(general_note) = work.general_note.as_deref() { + push_unique( + &mut description_values, + &mut description_seen, + general_note.to_string(), + ); + } + if let Some(bibliography_note) = work.bibliography_note.as_deref() { + push_unique( + &mut description_values, + &mut description_seen, + bibliography_note.to_string(), + ); + } + if let Some(cover_caption) = work.cover_caption.as_deref() { + push_unique( + &mut description_values, + &mut description_seen, + cover_caption.to_string(), + ); + } + if let Some(page_breakdown) = work.page_breakdown.as_deref() { + push_unique( + &mut description_values, + &mut description_seen, + page_breakdown.to_string(), + ); + } + for description in description_values { + push_text_element(xml, "dc:description", &description)?; + } + + let mut format_values = Vec::new(); + let mut format_seen = HashSet::new(); + for publication in &work.publications { + push_unique( + &mut format_values, + &mut format_seen, + publication_type_value(&publication.publication_type).to_string(), + ); + } + for format_value in format_values { + push_text_element(xml, "dc:format", &format_value)?; + } + + if let Some(license) = &work.license { + push_text_element_attrs( + xml, + "datacite:rights", + &[( + "rightsURI", + "http://purl.org/coar/access_right/c_abf2".to_string(), + )], + "open access", + )?; + push_text_element_attrs( + xml, + "oaire:licenseCondition", + &[("uri", license.clone())], + normalized_license_name(license), + )?; + } else { + push_text_element_attrs( + xml, + "datacite:rights", + &[( + "rightsURI", + "http://purl.org/coar/access_right/c_16ec".to_string(), + )], + "restricted 
access", + )?; + } + if let Some(copyright_holder) = work.copyright_holder.as_deref() { + push_text_element( + xml, + "datacite:rights", + &format!("Copyright holder: {copyright_holder}"), + )?; + } + + let mut subject_entries = Vec::new(); + let mut subject_seen = HashSet::new(); + for subject in &work.subjects { + let entry = match subject.subject_type { + SubjectType::KEYWORD | SubjectType::CUSTOM => { + (Vec::new(), subject.subject_code.to_string()) + } + SubjectType::THEMA => ( + vec![("subjectScheme", "Thema".to_string())], + subject.subject_code.to_string(), + ), + _ => ( + vec![("subjectScheme", subject.subject_type.to_string())], + subject.subject_code.to_string(), + ), + }; + let signature = format!( + "{}|{}", + entry + .0 + .iter() + .map(|(key, value)| format!("{key}={value}")) + .collect::>() + .join("&"), + entry.1 + ); + if subject_seen.insert(signature) { + subject_entries.push(entry); + } + } + for (attrs, value) in subject_entries { + if attrs.is_empty() { + push_text_element(xml, "datacite:subject", &value)?; + } else { + let attrs = attrs + .iter() + .map(|(key, value)| (*key, value.clone())) + .collect::>(); + push_text_element_attrs(xml, "datacite:subject", &attrs, &value)?; + } + } + + let mut sizes = Vec::new(); + if let Some(page_count) = work.page_count { + sizes.push(format!("{page_count} pages")); + } + if let Some(image_count) = work.image_count { + sizes.push(format!("{image_count} images")); + } + if let Some(table_count) = work.table_count { + sizes.push(format!("{table_count} tables")); + } + if let Some(audio_count) = work.audio_count { + sizes.push(format!("{audio_count} audios")); + } + if let Some(video_count) = work.video_count { + sizes.push(format!("{video_count} videos")); + } + if !sizes.is_empty() { + push_open_tag(xml, "datacite:sizes", &[])?; + for size in sizes { + push_text_element(xml, "datacite:size", &size)?; + } + push_close_tag(xml, "datacite:sizes")?; + } + + for publication in &work.publications { + for 
location in &publication.locations { + if let Some(full_text_url) = &location.full_text_url { + push_text_element_attrs( + xml, + "oaire:file", + &[ + ( + "mimeType", + publication_type_value(&publication.publication_type).to_string(), + ), + ("objectType", "fulltext".to_string()), + ], + full_text_url, + )?; + } + } + } + + let issue = work.issues.first(); + if work.work_type == thoth_client::WorkType::BOOK_CHAPTER { + if let Some(parent_work) = parent_work(work) { + if let Some(parent_title) = parent_work + .titles + .iter() + .find(|title| title.canonical) + .or_else(|| parent_work.titles.first()) + { + push_text_element(xml, "oaire:citationTitle", &parent_title.full_title)?; + } else if let Some(title) = canonical_title(work) { + push_text_element(xml, "oaire:citationTitle", &title.full_title)?; + } + if let Some(edition) = parent_work.edition.or(work.edition) { + push_text_element(xml, "oaire:citationEdition", &edition.to_string())?; + } + } else if let Some(title) = canonical_title(work) { + push_text_element(xml, "oaire:citationTitle", &title.full_title)?; + } + } else if let Some(issue) = issue { + push_text_element(xml, "oaire:citationTitle", &issue.series.series_name)?; + let citation_issue = issue + .issue_number + .map(|value| value.to_string()) + .and_then(|value| normalize_value(&value)) + .unwrap_or_else(|| issue.issue_ordinal.to_string()); + push_text_element(xml, "oaire:citationIssue", &citation_issue)?; + } else if let Some(title) = canonical_title(work) { + push_text_element(xml, "oaire:citationTitle", &title.full_title)?; + } + + if let Some(first_page) = &work.first_page { + push_text_element(xml, "oaire:citationStartPage", first_page)?; + } + if let Some(last_page) = &work.last_page { + push_text_element(xml, "oaire:citationEndPage", last_page)?; + } + + let mut citation_values = Vec::new(); + let mut citation_seen = HashSet::new(); + for reference in &work.references { + if let Some(citation) = reference_citation(reference) { + 
push_unique(&mut citation_values, &mut citation_seen, citation); + } + } + for citation in citation_values { + push_text_element(xml, "dcterms:bibliographicCitation", &citation)?; + } + + push_close_tag(xml, "oaire:resource") +} + +#[cfg(test)] +fn map_openaire(work: &Work) -> ThothResult { + let mut buffer = Vec::new(); + let mut writer = EmitterConfig::new() + .perform_indent(true) + .create_writer(&mut buffer); + XmlElementBlock::::xml_element(work, &mut writer) + .map(|_| buffer) + .and_then(|xml| { + String::from_utf8(xml) + .map_err(|_| ThothError::InternalError("Could not parse XML".to_string())) + }) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::record::XML_DECLARATION; + use crate::xml::dublincore_thoth::test_support::{assert_valid_against_schema, fixture_work}; + + fn assert_precedes(xml: &str, first: &str, second: &str) { + let first_pos = xml + .find(first) + .unwrap_or_else(|| panic!("Could not find `{first}` in XML output")); + let second_pos = xml + .find(second) + .unwrap_or_else(|| panic!("Could not find `{second}` in XML output")); + assert!( + first_pos < second_pos, + "Expected `{first}` to appear before `{second}`" + ); + } + + #[test] + fn openaire_mapping_is_exhaustive_and_schema_clean() { + let work = fixture_work(); + let xml = map_openaire(&work).expect("map openaire"); + + assert!(xml.contains("Canonical Title")); + assert!(xml.contains( + "Alternativer Titel" + )); + assert!(xml.contains("Hopper, Grace")); + assert!(!xml.contains( + "Hopper, Grace" + )); + assert!(xml.contains("oai:thoth.pub:00000000-0000-0000-0000-000000000111")); + assert!(xml.contains("2023123456")); + assert!(xml.contains("123456789")); + assert!(xml.contains("2024-12-31T12:00:00Z")); + assert!(xml.contains("Doe, J. (2020). The Open Knowledge Handbook.")); + assert!(xml.contains("Smith, Jane. Structured Citation Patterns. 
Metadata Quarterly.")); + assert!(xml.contains("Ausführliche Zusammenfassung.")); + assert!(xml.contains("https://example.org/books/111.pdf")); + assert!(xml.contains("CC BY 4.0")); + assert!(!xml.contains("")); + assert_precedes( + &xml, + "", + "", + ); + assert_precedes(&xml, "", ""); + assert_precedes( + &xml, + "Open Access Series", + "7", + ); + + assert_valid_against_schema(&xml, "oai_openaire.xsd"); + } + + #[test] + fn generator_returns_single_work_xml_with_declaration() { + let xml = OpenaireThoth {} + .generate(&[fixture_work()], None) + .expect("single openaire"); + assert!(xml.starts_with(XML_DECLARATION)); + assert!(!xml.starts_with(&format!("{XML_DECLARATION}\n"))); + assert!(xml.contains("", "Ross Higman "] +edition = "2021" +license = "Apache-2.0" +description = "Actix instance serving Thoth's OAI-PMH endpoints" +repository = "https://github.com/thoth-pub/thoth" +readme = "README.md" + +[dependencies] +thoth-api = { version = "=1.0.5", path = "../thoth-api", features = ["backend"] } +thoth-errors = { version = "=1.0.5", path = "../thoth-errors" } +thoth-client = { version = "=1.0.5", path = "../thoth-client" } +actix-cors = "0.7.1" +actix-web = "4.10" +async-trait = "0.1.89" +env_logger = "0.11.7" +oai-pmh = { package = "oai-pmh-rs", version = "0.2.0", features = ["actix"] } +quick-xml = "0.39" +reqwest = { version = "0.13", features = ["json"] } +uuid = { version = "1.16.0", features = ["serde"] } + +[dev-dependencies] +base64 = "0.22.1" +chrono = { version = "0.4.40", features = ["serde"] } +flate2 = "1.1.1" +serde_json = "1.0" diff --git a/thoth-oai-server/README.md b/thoth-oai-server/README.md new file mode 100644 index 000000000..f8fd3ab50 --- /dev/null +++ b/thoth-oai-server/README.md @@ -0,0 +1,17 @@ +
+ + +

Thoth OAI-PMH

+ +

+ Web server for the OAI-PMH interface of Thoth, a metadata management and dissemination system +

+ +

+ GitHub Workflow + Thoth Releases + Crate Info + License Info +

+
+ diff --git a/thoth-oai-server/assets/oai2.xsl b/thoth-oai-server/assets/oai2.xsl new file mode 100644 index 000000000..0d519fb03 --- /dev/null +++ b/thoth-oai-server/assets/oai2.xsl @@ -0,0 +1,695 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + Thoth OAI-PMH + + + +
+
+ +
+

Thoth OAI-PMH

+

Human-readable view of OAI-PMH responses.

+ +
+
+ +
+

Rendered by Thoth's OAI stylesheet. Use your browser's "View Source" to inspect raw XML.

+
+
+ + +
+ + + + + + +
+

Response Overview

+ + + + + + + +
Response Date
Request URL
Verb + + + + + unknown + +
+
+ + + + + + + + + + + + + +
+ + +
+

OAI Error:

+

+
+
+ + +
+

Identify

+ + + + + + + + + + + + + +
Repository Name
Base URL
Protocol Version
Earliest Datestamp
Deleted Record Policy
Granularity
Compression
Admin Email
+ +
+
+ + +

OAI Identifier

+ + + + + +
Scheme
Repository Identifier
Delimiter
Sample Identifier
+
+ + +

Thoth Repository Metadata

+ + + + + + + +
Latest Datestamp
Rights Management
Rights URL + + + + + Not provided + +
+
+ + +

Description (Additional Metadata)

+
+ +
+
+ + +
+

GetRecord

+ +
+
+ + +
+

ListRecords

+ + +
+
+ + +
+

ListIdentifiers

+ + +
+
+ + +
+

ListSets

+ + +
+
+ + +
+

Set:

+
+ + + +
Set Spec
Set Name
+ +
+
+
+ + +

Set Description

+
+ +
+
+ + +
+

ListMetadataFormats

+ +

+ Formats for identifier: + +

+
+ +
+
+ + +
+

Metadata Format:

+
+ + + + + + + +
metadataPrefix + + ListRecords +
metadataNamespace
schema
+ +

+ + View this record in + +

+
+
+
+
+ + +
+

Record:

+
+ + + +
+
+
+ + +

Header

+ + + + + + + + + + + + + + + +
Identifier + + Formats +
Datestamp
Set Spec + + + Identifiers + Records + +
Statusdeleted
+
+ + +

Metadata

+ + + + + +

No metadata payload for this record.

+
+
+
+ + +

About

+ + +
+ +
+
+ +

No additional about metadata.

+
+
+
+ + +
+

Dublin Core (oai_dc)

+
+ + +
+
+
+
+ + + + + + + + + + + + + + + + + +
+

OpenAIRE (oai_openaire)

+
+
+ +
+
+
+
+ + +
+

MARCXML (marcxml)

+
+
+ +
+
+
+
+ + +
+

Metadata (Unsupported Format)

+
+
+ +
+
+
+
+ + +
+

Resumption Token

+
+ + +

More results are available.

+ + + + + + + + + + + + + + + +
expirationDate
completeListSize
cursor
token
resume + Resume Listing +
+
+ +

End of list. This empty token marks a terminal page.

+
+
+
+
+
+ + +
+ <> + + </> +
+
+ + + + ="" + + + + + + + + +
diff --git a/thoth-oai-server/src/lib.rs b/thoth-oai-server/src/lib.rs new file mode 100644 index 000000000..a4af1336e --- /dev/null +++ b/thoth-oai-server/src/lib.rs @@ -0,0 +1,2262 @@ +#![recursion_limit = "512"] + +mod service; + +use std::{io, time::Duration}; + +use actix_cors::Cors; +use actix_web::{middleware::Logger, web, App, HttpRequest, HttpResponse, HttpServer}; +use async_trait::async_trait; +use oai_pmh::{ + core::{ + IdentifyData, InternalErrorClass, MetadataPrefix, OaiProvider, + ProtocolError as CoreProtocolError, ProviderError, RecordHeader, SetRecord, + }, + transport::actix as oai_actix, +}; +use quick_xml::escape::escape; +use service::{OaiService, ADMIN_EMAIL, RECORD_PREFIX, REPOSITORY_NAME, SAMPLE_ID}; +use thoth_client::Work; +use thoth_errors::ThothError; +use uuid::Uuid; + +const LOG_FORMAT: &str = r#"%{r}a %a "%r" %s %b "%{Referer}i" "%{User-Agent}i" %T"#; +const XSL_STYLESHEET: &str = include_str!("../assets/oai2.xsl"); +const METADATA_RIGHTS_STATEMENT: &str = "Metadata is licensed under the terms of Creative Commons CC0 1.0 Universal: https://creativecommons.org/publicdomain/zero/1.0/."; +const METADATA_RIGHTS_URI: &str = "https://creativecommons.org/publicdomain/zero/1.0/"; +#[cfg(test)] +const DEFAULT_RETRY_AFTER_SECONDS: u64 = 30; + +#[derive(Clone)] +struct AppState { + service: OaiService, + retry_after_seconds: u64, +} + +async fn stylesheet() -> HttpResponse { + HttpResponse::Ok() + .content_type("text/xsl; charset=utf-8") + .body(XSL_STYLESHEET) +} + +async fn oai_get(request: HttpRequest, state: web::Data) -> HttpResponse { + let state = web::Data::new(oai_actix::AppState { + provider: state.service.clone(), + retry_after_seconds: state.retry_after_seconds, + }); + oai_actix::oai_get(request, state).await +} + +async fn oai_post( + request: HttpRequest, + body: web::Bytes, + state: web::Data, +) -> HttpResponse { + let state = web::Data::new(oai_actix::AppState { + provider: state.service.clone(), + retry_after_seconds: 
state.retry_after_seconds, + }); + oai_actix::oai_post(request, body, state).await +} + +fn is_transient_upstream_error(error: &ThothError) -> bool { + let message = match error { + ThothError::RequestError(message) | ThothError::GraphqlError(message) => { + message.to_ascii_lowercase() + } + _ => return false, + }; + + let has_transient_status = [500, 502, 503, 504, 429].iter().any(|status| { + message.contains(&format!("graphql {status}")) + || message.contains(&format!("export {status}")) + || message.contains(&status.to_string()) + }); + let has_network_failure = [ + "timed out", + "timeout", + "connection refused", + "connection reset", + "error sending request", + "temporary failure", + "dns error", + "failed to lookup address", + ] + .iter() + .any(|needle| message.contains(needle)); + + has_transient_status || has_network_failure +} + +fn parse_identifier_for_lookup_core(identifier: &str) -> Result { + OaiService::parse_oai_identifier(identifier).map_err(|_| CoreProtocolError { + code: "idDoesNotExist", + message: "The requested identifier does not exist".to_string(), + }) +} + +fn map_get_record_provider_error( + metadata_prefix: MetadataPrefix, + error: ThothError, +) -> ProviderError { + match error { + ThothError::EntityNotFound => ProviderError::Protocol(CoreProtocolError { + code: "idDoesNotExist", + message: "The requested identifier does not exist".to_string(), + }), + ThothError::IncompleteMetadataRecord(_, _) + | ThothError::InvalidMetadataSpecification(_) => { + ProviderError::Protocol(CoreProtocolError { + code: "cannotDisseminateFormat", + message: format!( + "Record cannot be disseminated as {}", + metadata_prefix.as_str() + ), + }) + } + other => ProviderError::Internal(other), + } +} + +fn map_export_metadata_provider_error( + metadata_prefix: MetadataPrefix, + error: ThothError, +) -> ProviderError { + match error { + ThothError::RequestError(_) if is_transient_upstream_error(&error) => { + ProviderError::Internal(error) + } + _ => 
ProviderError::Protocol(CoreProtocolError { + code: "cannotDisseminateFormat", + message: format!( + "Record cannot be disseminated as {}", + metadata_prefix.as_str() + ), + }), + } +} + +fn map_list_source_provider_error(error: ThothError) -> ProviderError { + match error { + ThothError::EntityNotFound => ProviderError::Protocol(CoreProtocolError { + code: "noRecordsMatch", + message: "The request matched no records".to_string(), + }), + other => ProviderError::Internal(other), + } +} + +fn header_from_work(work: &Work) -> RecordHeader { + let set_spec = OaiService::set_spec(&work.imprint.publisher.publisher_name); + RecordHeader { + identifier: OaiService::oai_identifier(work.work_id), + datestamp: OaiService::timestamp_xml(work.updated_at_with_relations), + set_spec, + about_xml: Vec::new(), + } +} + +#[async_trait] +impl OaiProvider for OaiService { + type Error = ThothError; + type ListEntry = Work; + + fn repository_url(&self) -> String { + self.repository_url() + } + + async fn identify(&self) -> Result> { + let earliest = self.earliest().await.map_err(ProviderError::Internal)?; + let latest = self.latest().await.map_err(ProviderError::Internal)?; + Ok(IdentifyData { + repository_name: REPOSITORY_NAME.to_string(), + base_url: self.repository_url(), + admin_email: ADMIN_EMAIL.to_string(), + earliest_datestamp: OaiService::timestamp_xml(earliest), + deleted_record: "no".to_string(), + granularity: "YYYY-MM-DDThh:mm:ssZ".to_string(), + compressions: vec!["gzip".to_string()], + identifier_scheme: "oai".to_string(), + repository_identifier: "thoth.pub".to_string(), + identifier_delimiter: ":".to_string(), + sample_identifier: format!("{RECORD_PREFIX}:{SAMPLE_ID}"), + descriptions: vec![format!( + "{}{}{}", + escape(OaiService::timestamp_xml(latest)), + escape(METADATA_RIGHTS_STATEMENT), + escape(METADATA_RIGHTS_URI), + )], + }) + } + + async fn list_metadata_formats( + &self, + identifier: Option<&str>, + ) -> Result, ProviderError> { + let mut prefixes = vec![ + 
MetadataPrefix::OaiDc, + MetadataPrefix::OaiOpenaire, + MetadataPrefix::MarcXml, + ]; + if let Some(identifier) = identifier { + let work_id = parse_identifier_for_lookup_core(identifier)?; + self.get_record(work_id, MetadataPrefix::OaiDc) + .await + .map_err(|error| map_get_record_provider_error(MetadataPrefix::OaiDc, error))?; + prefixes.clear(); + for prefix in [ + MetadataPrefix::OaiDc, + MetadataPrefix::OaiOpenaire, + MetadataPrefix::MarcXml, + ] { + match self.has_metadata_dissemination(work_id, prefix).await { + Ok(true) => prefixes.push(prefix), + Ok(false) => {} + Err(error) if is_transient_upstream_error(&error) => { + return Err(ProviderError::Internal(error)); + } + Err(_) => {} + } + } + } + Ok(prefixes) + } + + async fn list_sets(&self) -> Result, ProviderError> { + OaiService::list_sets(self) + .await + .map_err(ProviderError::Internal) + .map(|sets| { + sets.into_iter() + .map(|set| SetRecord { + spec: set.spec, + name: set.name, + description_xml: None, + }) + .collect() + }) + } + + async fn get_record_header( + &self, + identifier: &str, + metadata_prefix: MetadataPrefix, + ) -> Result> { + let work_id = parse_identifier_for_lookup_core(identifier)?; + let work = self + .get_record(work_id, metadata_prefix) + .await + .map_err(|error| map_get_record_provider_error(metadata_prefix, error))?; + Ok(header_from_work(&work)) + } + + async fn get_record_metadata( + &self, + identifier: &str, + metadata_prefix: MetadataPrefix, + ) -> Result> { + let work_id = parse_identifier_for_lookup_core(identifier)?; + let metadata = match metadata_prefix { + MetadataPrefix::OaiDc => self.get_oai_dc_record(work_id).await, + MetadataPrefix::OaiOpenaire => self.get_oai_openaire_record(work_id).await, + MetadataPrefix::MarcXml => self.get_marcxml_record(work_id).await, + }; + metadata.map_err(|error| map_export_metadata_provider_error(metadata_prefix, error)) + } + + async fn list_source_count( + &self, + metadata_prefix: MetadataPrefix, + set: Option<&str>, + ) -> 
Result> { + self.list_source_count(metadata_prefix, set) + .await + .map_err(map_list_source_provider_error) + } + + async fn list_source_batch( + &self, + metadata_prefix: MetadataPrefix, + set: Option<&str>, + offset: i64, + limit: i64, + ) -> Result, ProviderError> { + self.list_source_batch(metadata_prefix, set, offset, limit) + .await + .map_err(map_list_source_provider_error) + } + + fn list_entry_header(&self, entry: &Self::ListEntry) -> RecordHeader { + header_from_work(entry) + } + + async fn list_entry_disseminatable( + &self, + entry: &Self::ListEntry, + metadata_prefix: MetadataPrefix, + ) -> Result> { + self.has_metadata_dissemination(entry.work_id, metadata_prefix) + .await + .map_err(ProviderError::Internal) + } + + fn include_complete_list_size( + &self, + metadata_prefix: MetadataPrefix, + date_filter_active: bool, + ) -> bool { + metadata_prefix != MetadataPrefix::MarcXml && !date_filter_active + } + + fn classify_internal_error(&self, error: &Self::Error) -> InternalErrorClass { + if is_transient_upstream_error(error) { + InternalErrorClass::Transient + } else { + InternalErrorClass::Fatal + } + } +} + +async fn not_found() -> HttpResponse { + HttpResponse::NotFound() + .content_type("text/html; charset=utf-8") + .body( + r##" + + + 404 - Page Not Found + + + + + + + + + + + + + + + + + + + + + +

404 - Page Not Found

+

The requested page was not found.

+

OAI-PMH Interface

+ +"##, + ) +} + +#[actix_web::main] +#[allow(clippy::too_many_arguments)] +pub async fn start_server( + host: String, + port: String, + threads: usize, + keep_alive: u64, + public_url: String, + gql_endpoint: String, + export_url: String, + retry_after_seconds: u64, +) -> io::Result<()> { + env_logger::init_from_env(env_logger::Env::new().default_filter_or("info")); + let state = AppState { + service: OaiService::new(public_url, gql_endpoint, export_url), + retry_after_seconds, + }; + + HttpServer::new(move || { + App::new() + .wrap(Logger::new(LOG_FORMAT)) + .wrap(Cors::default().allowed_methods(vec!["GET", "POST", "OPTIONS"])) + .app_data(web::Data::new(state.clone())) + .service( + web::resource("/") + .route(web::get().to(oai_get)) + .route(web::post().to(oai_post)), + ) + .service(web::resource("/oai2.xsl").route(web::get().to(stylesheet))) + .default_service(web::route().to(not_found)) + }) + .workers(threads) + .keep_alive(Duration::from_secs(keep_alive)) + .bind(format!("{host}:{port}"))? 
+ .run() + .await +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::service::PAGE_LIMIT; + use actix_web::{dev::ServerHandle, http::header, test, App, HttpResponse, HttpServer}; + use base64::{engine::general_purpose::URL_SAFE_NO_PAD, Engine as _}; + use chrono::{Duration, NaiveDate}; + use flate2::read::GzDecoder; + use oai_pmh::core::{ + decode_resumption_token, encode_resumption_token, DatestampGranularity, ResumptionToken, + }; + use serde_json::{json, Value}; + use std::{collections::HashSet, io::Read, net::TcpListener}; + + const PUBLISHER_ID: &str = "00000000-0000-0000-1111-000000000001"; + const PUBLISHER_NAME: &str = "Open Access Press"; + + #[derive(Clone)] + struct MockGraphqlState { + works: Vec, + publishers: Vec, + latest: String, + earliest: String, + } + + #[derive(Clone, Default)] + struct MockExportState { + failing_work_ids: HashSet, + non_disseminatable_work_ids: HashSet, + marc_non_disseminatable_work_ids: HashSet, + malformed_work_ids: HashSet, + } + + struct RunningMockServer { + base_url: String, + handle: ServerHandle, + } + + impl RunningMockServer { + async fn stop(self) { + self.handle.stop(true).await; + } + } + + async fn spawn_graphql_server(state: MockGraphqlState) -> RunningMockServer { + let listener = TcpListener::bind("127.0.0.1:0").expect("bind graphql mock server"); + let address = listener.local_addr().expect("graphql local address"); + let state = web::Data::new(state); + + let server = HttpServer::new(move || { + App::new() + .app_data(state.clone()) + .route("/graphql", web::post().to(graphql_mock_handler)) + }) + .listen(listener) + .expect("listen graphql mock server") + .run(); + let handle = server.handle(); + actix_web::rt::spawn(server); + + RunningMockServer { + base_url: format!("http://{address}"), + handle, + } + } + + async fn spawn_graphql_error_server(status: actix_web::http::StatusCode) -> RunningMockServer { + let listener = TcpListener::bind("127.0.0.1:0").expect("bind graphql error server"); + 
let address = listener.local_addr().expect("graphql error local address"); + + let server = HttpServer::new(move || { + App::new().route( + "/graphql", + web::post().to(move || async move { + HttpResponse::build(status) + .content_type("application/json; charset=utf-8") + .body(r#"{"errors":[{"message":"upstream failure"}]}"#) + }), + ) + }) + .listen(listener) + .expect("listen graphql error server") + .run(); + let handle = server.handle(); + actix_web::rt::spawn(server); + + RunningMockServer { + base_url: format!("http://{address}"), + handle, + } + } + + async fn spawn_export_server(state: MockExportState) -> RunningMockServer { + let listener = TcpListener::bind("127.0.0.1:0").expect("bind export mock server"); + let address = listener.local_addr().expect("export local address"); + let state = web::Data::new(state); + + let server = HttpServer::new(move || { + App::new().app_data(state.clone()).route( + "/specifications/{specification_id}/work/{work_id}", + web::get().to(export_mock_handler), + ) + }) + .listen(listener) + .expect("listen export mock server") + .run(); + let handle = server.handle(); + actix_web::rt::spawn(server); + + RunningMockServer { + base_url: format!("http://{address}"), + handle, + } + } + + async fn graphql_mock_handler( + state: web::Data, + payload: web::Json, + ) -> HttpResponse { + let payload = payload.into_inner(); + let variables = payload + .get("variables") + .cloned() + .unwrap_or_else(|| json!({})); + let operation_name = request_operation_name(&payload); + + let response = match operation_name.as_deref() { + Some("OaiLatestWorksUpdatedQuery") => { + json!({ "data": { "works": [{ "updatedAtWithRelations": state.latest.clone() }] } }) + } + Some("OaiEarliestWorksUpdatedQuery") => { + json!({ "data": { "works": [{ "updatedAtWithRelations": state.earliest.clone() }] } }) + } + Some("PublishersQuery") => { + json!({ "data": { "publishers": state.publishers.clone() } }) + } + Some("WorkQuery") => { + let work_id = variables + 
.get("workId") + .and_then(Value::as_str) + .map(ToOwned::to_owned); + match work_id.and_then(|work_id| find_work_by_id(&state, &work_id)) { + Some(work) => json!({ "data": { "work": work } }), + None => json!({ "errors": [{ "message": "work not found" }] }), + } + } + Some("OaiWorkCountQuery") => { + let works = filter_works_by_publishers(&state, &variables); + json!({ "data": { "workCount": works.len() as i64 } }) + } + Some("OaiBookCountQuery") => { + let works = filter_works_by_publishers(&state, &variables); + json!({ "data": { "bookCount": works.len() as i64 } }) + } + Some("OaiWorksQuery") => { + let works = + paginate_works(filter_works_by_publishers(&state, &variables), &variables); + json!({ "data": { "works": works } }) + } + Some("OaiBooksQuery") => { + let works = + paginate_works(filter_works_by_publishers(&state, &variables), &variables); + json!({ "data": { "books": works } }) + } + _ => json!({ "errors": [{ "message": "unsupported operation" }] }), + }; + + HttpResponse::Ok().json(response) + } + + async fn export_mock_handler( + state: web::Data, + path: web::Path<(String, Uuid)>, + ) -> HttpResponse { + let (specification_id, work_id) = path.into_inner(); + let is_marc = specification_id == "marc21xml::thoth"; + if state.failing_work_ids.contains(&work_id) { + return HttpResponse::InternalServerError() + .content_type("text/plain; charset=utf-8") + .body("export failed"); + } + if state.non_disseminatable_work_ids.contains(&work_id) + || (is_marc && state.marc_non_disseminatable_work_ids.contains(&work_id)) + { + return HttpResponse::NotFound() + .content_type("text/plain; charset=utf-8") + .body("record not available"); + } + if is_marc && state.malformed_work_ids.contains(&work_id) { + return HttpResponse::Ok() + .content_type("application/xml; charset=utf-8") + .body(""); + } + if specification_id == "dublin_core::thoth" { + return HttpResponse::Ok() + .content_type("application/xml; charset=utf-8") + .body(format!( + r#"{work_id}Export 
delegated OAI DC"# + )); + } + if specification_id == "openaire::thoth" { + return HttpResponse::Ok() + .content_type("application/xml; charset=utf-8") + .body(format!( + r#"{work_id}Export delegated OpenAIRE"# + )); + } + HttpResponse::Ok() + .content_type("application/xml; charset=utf-8") + .body(format!( + r#" + + + 00000nam a2200000 i 4500 + {work_id} + +"# + )) + } + + fn request_operation_name(payload: &Value) -> Option { + payload + .get("operationName") + .and_then(Value::as_str) + .map(ToOwned::to_owned) + .or_else(|| { + let query = payload.get("query").and_then(Value::as_str)?; + [ + "OaiLatestWorksUpdatedQuery", + "OaiEarliestWorksUpdatedQuery", + "PublishersQuery", + "WorkQuery", + "OaiWorkCountQuery", + "OaiBookCountQuery", + "OaiWorksQuery", + "OaiBooksQuery", + ] + .iter() + .find(|name| query.contains(**name)) + .map(|name| (*name).to_string()) + }) + } + + fn find_work_by_id(state: &MockGraphqlState, work_id: &str) -> Option { + state + .works + .iter() + .find(|work| work.get("workId").and_then(Value::as_str) == Some(work_id)) + .cloned() + } + + fn filter_works_by_publishers(state: &MockGraphqlState, variables: &Value) -> Vec { + let Some(publishers) = variables.get("publishers") else { + return state.works.clone(); + }; + if publishers.is_null() { + return state.works.clone(); + } + let Some(ids) = publishers.as_array() else { + return state.works.clone(); + }; + if ids.is_empty() { + return Vec::new(); + } + let allowed_names = ids + .iter() + .filter_map(Value::as_str) + .filter_map(|publisher_id| { + state + .publishers + .iter() + .find(|publisher| { + publisher.get("publisherId").and_then(Value::as_str) == Some(publisher_id) + }) + .and_then(|publisher| publisher.get("publisherName").and_then(Value::as_str)) + .map(ToOwned::to_owned) + }) + .collect::>(); + state + .works + .iter() + .filter(|work| { + work.get("imprint") + .and_then(|imprint| imprint.get("publisher")) + .and_then(|publisher| publisher.get("publisherName")) + 
.and_then(Value::as_str) + .is_some_and(|publisher_name| allowed_names.contains(publisher_name)) + }) + .cloned() + .collect() + } + + fn paginate_works(works: Vec, variables: &Value) -> Vec { + let offset = variables.get("offset").and_then(Value::as_i64).unwrap_or(0); + let limit = variables + .get("limit") + .and_then(Value::as_i64) + .unwrap_or(PAGE_LIMIT); + works + .into_iter() + .skip(offset.max(0) as usize) + .take(limit.max(0) as usize) + .collect() + } + + fn mock_graphql_state(mut works: Vec) -> MockGraphqlState { + works.sort_by(|left, right| { + let left = left + .get("updatedAtWithRelations") + .and_then(Value::as_str) + .unwrap_or_default(); + let right = right + .get("updatedAtWithRelations") + .and_then(Value::as_str) + .unwrap_or_default(); + right.cmp(left) + }); + let latest = works + .first() + .and_then(|work| work.get("updatedAtWithRelations")) + .and_then(Value::as_str) + .unwrap_or("2024-12-31T00:00:00Z") + .to_string(); + let earliest = works + .last() + .and_then(|work| work.get("updatedAtWithRelations")) + .and_then(Value::as_str) + .unwrap_or("2024-01-01T00:00:00Z") + .to_string(); + + MockGraphqlState { + works, + publishers: vec![json!({ + "publisherId": PUBLISHER_ID, + "publisherName": PUBLISHER_NAME, + })], + latest, + earliest, + } + } + + fn make_work( + work_id: Uuid, + updated_at_with_relations: &str, + publisher_name: &str, + marc_eligible: bool, + include_xml_publication: bool, + ) -> Value { + let contributions = if marc_eligible { + vec![json!({ + "contributionType": "AUTHOR", + "firstName": "Ada", + "lastName": "Lovelace", + "fullName": "Ada Lovelace", + "mainContribution": true, + "biographies": [], + "contributionOrdinal": 1, + "contributor": { + "orcid": "https://orcid.org/0000-0002-0000-0001", + "website": null + }, + "affiliations": [] + })] + } else { + Vec::new() + }; + let languages = if marc_eligible { + vec![json!({ + "languageCode": "ENG", + "languageRelation": "ORIGINAL" + })] + } else { + Vec::new() + }; + let 
mut publications = vec![json!({ + "publicationId": Uuid::from_u128(work_id.as_u128() + 10), + "publicationType": "PDF", + "isbn": if marc_eligible { Value::String("978-1-4028-9462-6".to_string()) } else { Value::Null }, + "weightG": null, + "weightOz": null, + "widthMm": null, + "widthCm": null, + "widthIn": null, + "heightMm": null, + "heightCm": null, + "heightIn": null, + "depthMm": null, + "depthCm": null, + "depthIn": null, + "accessibilityStandard": null, + "accessibilityAdditionalStandard": null, + "accessibilityException": null, + "accessibilityReportUrl": null, + "prices": [], + "locations": [ + { + "landingPage": "https://example.org/book", + "fullTextUrl": "https://example.org/book.pdf", + "locationPlatform": "OTHER", + "canonical": true + } + ] + })]; + if include_xml_publication { + publications.push(json!({ + "publicationId": Uuid::from_u128(work_id.as_u128() + 11), + "publicationType": "XML", + "isbn": "978-92-95055-02-5", + "weightG": null, + "weightOz": null, + "widthMm": null, + "widthCm": null, + "widthIn": null, + "heightMm": null, + "heightCm": null, + "heightIn": null, + "depthMm": null, + "depthCm": null, + "depthIn": null, + "accessibilityStandard": null, + "accessibilityAdditionalStandard": null, + "accessibilityException": null, + "accessibilityReportUrl": null, + "prices": [], + "locations": [] + })); + } + + let mut work = serde_json::Map::new(); + work.insert("workId".to_string(), json!(work_id)); + work.insert( + "updatedAtWithRelations".to_string(), + json!(updated_at_with_relations), + ); + work.insert("workStatus".to_string(), json!("ACTIVE")); + work.insert("workType".to_string(), json!("MONOGRAPH")); + work.insert("reference".to_string(), Value::Null); + work.insert("edition".to_string(), json!(1)); + work.insert( + "doi".to_string(), + json!(format!("https://doi.org/10.00001/{work_id}")), + ); + work.insert("publicationDate".to_string(), json!("2024-01-01")); + work.insert("withdrawnDate".to_string(), Value::Null); + work.insert( 
+ "license".to_string(), + json!("http://creativecommons.org/licenses/by/4.0/"), + ); + work.insert("copyrightHolder".to_string(), json!("Author")); + work.insert("generalNote".to_string(), Value::Null); + work.insert("bibliographyNote".to_string(), Value::Null); + work.insert("place".to_string(), json!("London")); + work.insert("pageCount".to_string(), json!(100)); + work.insert("pageBreakdown".to_string(), Value::Null); + work.insert("firstPage".to_string(), Value::Null); + work.insert("lastPage".to_string(), Value::Null); + work.insert("pageInterval".to_string(), Value::Null); + work.insert("imageCount".to_string(), Value::Null); + work.insert("tableCount".to_string(), Value::Null); + work.insert("audioCount".to_string(), Value::Null); + work.insert("videoCount".to_string(), Value::Null); + work.insert("landingPage".to_string(), json!("https://example.org/book")); + work.insert("toc".to_string(), Value::Null); + work.insert("lccn".to_string(), Value::Null); + work.insert("oclc".to_string(), Value::Null); + work.insert("coverUrl".to_string(), Value::Null); + work.insert("coverCaption".to_string(), Value::Null); + work.insert( + "titles".to_string(), + json!([{ + "titleId": Uuid::from_u128(work_id.as_u128() + 1), + "localeCode": "EN", + "fullTitle": "Sample Title", + "title": "Sample Title", + "subtitle": null, + "canonical": true + }]), + ); + work.insert("abstracts".to_string(), json!([])); + work.insert( + "imprint".to_string(), + json!({ + "imprintName": "Imprint", + "imprintUrl": null, + "crossmarkDoi": null, + "defaultCurrency": "EUR", + "defaultPlace": "London", + "defaultLocale": "EN", + "publisher": { + "publisherName": publisher_name, + "publisherShortname": "OAP", + "publisherUrl": null, + "accessibilityStatement": null, + "contacts": [] + } + }), + ); + work.insert("issues".to_string(), json!([])); + work.insert("contributions".to_string(), json!(contributions)); + work.insert("languages".to_string(), json!(languages)); + 
work.insert("publications".to_string(), json!(publications)); + work.insert("subjects".to_string(), json!([])); + work.insert("fundings".to_string(), json!([])); + work.insert("relations".to_string(), json!([])); + work.insert("references".to_string(), json!([])); + Value::Object(work) + } + + fn make_descending_work_series(count: usize) -> Vec { + let base_date = NaiveDate::from_ymd_opt(2024, 12, 31).expect("valid base date"); + (0..count) + .map(|index| { + let updated_at = (base_date - Duration::days(index as i64)) + .format("%Y-%m-%dT12:00:00Z") + .to_string(); + let work_id = + Uuid::from_u128(0x1000_0000_0000_0000_0000_0000_0000_0000 + index as u128); + make_work(work_id, &updated_at, PUBLISHER_NAME, true, true) + }) + .collect() + } + + fn normalize_response_date(xml: &str) -> String { + let open = ""; + let close = ""; + let Some(start) = xml.find(open) else { + return xml.to_string(); + }; + let value_start = start + open.len(); + let Some(value_end_rel) = xml[value_start..].find(close) else { + return xml.to_string(); + }; + let value_end = value_start + value_end_rel; + let mut normalized = String::new(); + normalized.push_str(&xml[..value_start]); + normalized.push_str("RESPONSE_DATE"); + normalized.push_str(&xml[value_end..]); + normalized + } + + fn request_opening_tag(xml: &str) -> String { + let start = xml.find("') + .map(|offset| start + offset) + .expect("request closing bracket"); + xml[start..=end].to_string() + } + + fn extract_resumption_token(xml: &str) -> Option { + let token_start = xml.find("')? 
+ 1; + let content_end = content_start + xml[content_start..].find("")?; + let value = xml[content_start..content_end].trim(); + if value.is_empty() { + None + } else { + Some(value.to_string()) + } + } + + fn count_occurrences(haystack: &str, needle: &str) -> usize { + haystack.matches(needle).count() + } + + #[actix_web::test] + async fn get_and_post_are_equivalent_for_all_oai_verbs_on_root_endpoint() { + let work_id = Uuid::from_u128(1); + let works = vec![make_work( + work_id, + "2024-12-30T12:00:00Z", + PUBLISHER_NAME, + true, + true, + )]; + + let graphql_server = spawn_graphql_server(mock_graphql_state(works)).await; + let export_server = spawn_export_server(MockExportState::default()).await; + + let app = test::init_service( + App::new() + .app_data(web::Data::new(AppState { + service: OaiService::new( + "https://example.org".to_string(), + format!("{}/graphql", graphql_server.base_url), + export_server.base_url.clone(), + ), + retry_after_seconds: DEFAULT_RETRY_AFTER_SECONDS, + })) + .service( + web::resource("/") + .route(web::get().to(oai_get)) + .route(web::post().to(oai_post)), + ), + ) + .await; + + let identifier = OaiService::oai_identifier(work_id); + let cases = vec![ + "verb=Identify".to_string(), + "verb=ListMetadataFormats".to_string(), + "verb=ListSets".to_string(), + format!("verb=GetRecord&identifier={identifier}&metadataPrefix=oai_dc"), + "verb=ListIdentifiers&metadataPrefix=oai_dc".to_string(), + "verb=ListRecords&metadataPrefix=oai_dc".to_string(), + ]; + + for case in cases { + let get_req = test::TestRequest::get() + .uri(&format!("/?{case}")) + .to_request(); + let get_response = test::call_service(&app, get_req).await; + assert_eq!(get_response.status(), actix_web::http::StatusCode::OK); + assert_eq!( + get_response + .headers() + .get(header::CONTENT_TYPE) + .and_then(|value| value.to_str().ok()) + .expect("GET content type"), + "text/xml; charset=utf-8" + ); + assert!(get_response + .headers() + .get(header::CONTENT_ENCODING) + 
.is_none()); + let get_body = String::from_utf8(test::read_body(get_response).await.to_vec()) + .expect("GET body UTF-8"); + + let post_req = test::TestRequest::post() + .uri("/") + .insert_header((header::CONTENT_TYPE, "application/x-www-form-urlencoded")) + .set_payload(case.clone()) + .to_request(); + let post_response = test::call_service(&app, post_req).await; + assert_eq!(post_response.status(), actix_web::http::StatusCode::OK); + assert_eq!( + post_response + .headers() + .get(header::CONTENT_TYPE) + .and_then(|value| value.to_str().ok()) + .expect("POST content type"), + "text/xml; charset=utf-8" + ); + assert!(post_response + .headers() + .get(header::CONTENT_ENCODING) + .is_none()); + let post_body = String::from_utf8(test::read_body(post_response).await.to_vec()) + .expect("POST body UTF-8"); + + if case == "verb=Identify" { + assert!(get_body.contains("gzip")); + assert!(post_body.contains("gzip")); + assert!(get_body.contains( + "Metadata is licensed under the terms of Creative Commons CC0 1.0 Universal: https://creativecommons.org/publicdomain/zero/1.0/." 
+ )); + assert!(post_body.contains( + "https://creativecommons.org/publicdomain/zero/1.0/" + )); + } + + assert_eq!( + normalize_response_date(&get_body), + normalize_response_date(&post_body) + ); + } + + export_server.stop().await; + graphql_server.stop().await; + } + + #[actix_web::test] + async fn stylesheet_contains_branding_and_oai_rendering_support() { + let app = test::init_service( + App::new().service(web::resource("/oai2.xsl").route(web::get().to(stylesheet))), + ) + .await; + + let req = test::TestRequest::get().uri("/oai2.xsl").to_request(); + let response = test::call_service(&app, req).await; + assert_eq!(response.status(), actix_web::http::StatusCode::OK); + assert_eq!( + response + .headers() + .get(header::CONTENT_TYPE) + .and_then(|value| value.to_str().ok()), + Some("text/xsl; charset=utf-8") + ); + + let body = String::from_utf8(test::read_body(response).await.to_vec()) + .expect("stylesheet body UTF-8"); + assert!(body.contains("https://cdn.thoth.pub/THOTH_ColourPos.png")); + assert!(body.contains( + "https://cdn.thoth.pub/favicons/thoth-head-20260331/transparent/favicon.ico" + )); + assert!(body.contains( + "https://cdn.thoth.pub/favicons/thoth-head-20260331/transparent/manifest.json" + )); + assert!(body.contains("Rights Management")); + assert!(body.contains("match=\"oai:setDescription\"")); + assert!(body.contains("match=\"oai:about\"")); + assert!(body.contains("End of list. 
This empty token marks a terminal page.")); + } + + #[actix_web::test] + async fn not_found_page_contains_favicon_and_oai_link() { + let app = test::init_service(App::new().default_service(web::route().to(not_found))).await; + + let req = test::TestRequest::get().uri("/missing").to_request(); + let response = test::call_service(&app, req).await; + assert_eq!(response.status(), actix_web::http::StatusCode::NOT_FOUND); + assert_eq!( + response + .headers() + .get(header::CONTENT_TYPE) + .and_then(|value| value.to_str().ok()), + Some("text/html; charset=utf-8") + ); + + let body = + String::from_utf8(test::read_body(response).await.to_vec()).expect("404 body UTF-8"); + assert!(body.contains( + "https://cdn.thoth.pub/favicons/thoth-head-20260331/transparent/favicon.ico" + )); + assert!(body.contains( + "https://cdn.thoth.pub/favicons/thoth-head-20260331/transparent/manifest.json" + )); + assert!(body.contains("OAI-PMH Interface")); + } + + #[actix_web::test] + async fn repeated_arguments_return_bad_argument() { + let graphql_server = + spawn_graphql_server(mock_graphql_state(make_descending_work_series(1))).await; + let export_server = spawn_export_server(MockExportState::default()).await; + + let app = test::init_service( + App::new() + .app_data(web::Data::new(AppState { + service: OaiService::new( + "https://example.org".to_string(), + format!("{}/graphql", graphql_server.base_url), + export_server.base_url.clone(), + ), + retry_after_seconds: DEFAULT_RETRY_AFTER_SECONDS, + })) + .service( + web::resource("/oai") + .route(web::get().to(oai_get)) + .route(web::post().to(oai_post)), + ), + ) + .await; + + let get_req = test::TestRequest::get() + .uri("/oai?verb=Identify&verb=ListSets") + .to_request(); + let get_response = test::call_service(&app, get_req).await; + let get_body = String::from_utf8(test::read_body(get_response).await.to_vec()) + .expect("GET body UTF-8"); + assert!(get_body.contains("")); + + let post_req = test::TestRequest::post() + 
.uri("/oai?verb=Identify") + .insert_header((header::CONTENT_TYPE, "application/x-www-form-urlencoded")) + .set_payload("verb=ListSets") + .to_request(); + let post_response = test::call_service(&app, post_req).await; + let post_body = String::from_utf8(test::read_body(post_response).await.to_vec()) + .expect("POST body UTF-8"); + assert!(post_body.contains("")); + + export_server.stop().await; + graphql_server.stop().await; + } + + #[actix_web::test] + async fn request_attributes_are_omitted_for_bad_verb_and_bad_argument() { + let graphql_server = + spawn_graphql_server(mock_graphql_state(make_descending_work_series(1))).await; + let export_server = spawn_export_server(MockExportState::default()).await; + + let app = test::init_service( + App::new() + .app_data(web::Data::new(AppState { + service: OaiService::new( + "https://example.org".to_string(), + format!("{}/graphql", graphql_server.base_url), + export_server.base_url.clone(), + ), + retry_after_seconds: DEFAULT_RETRY_AFTER_SECONDS, + })) + .service(web::resource("/oai").route(web::get().to(oai_get))), + ) + .await; + + let bad_verb_req = test::TestRequest::get() + .uri("/oai?verb=UnknownVerb") + .to_request(); + let bad_verb_response = test::call_service(&app, bad_verb_req).await; + let bad_verb_body = String::from_utf8(test::read_body(bad_verb_response).await.to_vec()) + .expect("badVerb body UTF-8"); + assert!(bad_verb_body.contains("")); + assert_eq!(request_opening_tag(&bad_verb_body), ""); + + let bad_argument_req = test::TestRequest::get() + .uri("/oai?verb=Identify&foo=bar") + .to_request(); + let bad_argument_response = test::call_service(&app, bad_argument_req).await; + let bad_argument_body = + String::from_utf8(test::read_body(bad_argument_response).await.to_vec()) + .expect("badArgument body UTF-8"); + assert!(bad_argument_body.contains("")); + assert_eq!(request_opening_tag(&bad_argument_body), ""); + + export_server.stop().await; + graphql_server.stop().await; + } + + #[actix_web::test] + 
async fn list_sets_rejects_resumption_tokens() { + let graphql_server = + spawn_graphql_server(mock_graphql_state(make_descending_work_series(1))).await; + let export_server = spawn_export_server(MockExportState::default()).await; + + let app = test::init_service( + App::new() + .app_data(web::Data::new(AppState { + service: OaiService::new( + "https://example.org".to_string(), + format!("{}/graphql", graphql_server.base_url), + export_server.base_url.clone(), + ), + retry_after_seconds: DEFAULT_RETRY_AFTER_SECONDS, + })) + .service(web::resource("/oai").route(web::get().to(oai_get))), + ) + .await; + + let req = test::TestRequest::get() + .uri("/oai?verb=ListSets&resumptionToken=abc") + .to_request(); + let response = test::call_service(&app, req).await; + let body = String::from_utf8(test::read_body(response).await.to_vec()).expect("body UTF-8"); + + assert!(body.contains("")); + + export_server.stop().await; + graphql_server.stop().await; + } + + #[actix_web::test] + async fn list_metadata_formats_is_identifier_aware() { + let marc_eligible_id = Uuid::from_u128(10); + let marc_ineligible_id = Uuid::from_u128(11); + let works = vec![ + make_work( + marc_eligible_id, + "2024-12-31T12:00:00Z", + PUBLISHER_NAME, + true, + true, + ), + make_work( + marc_ineligible_id, + "2024-12-30T12:00:00Z", + PUBLISHER_NAME, + false, + true, + ), + ]; + let graphql_server = spawn_graphql_server(mock_graphql_state(works)).await; + let mut export_state = MockExportState::default(); + export_state + .marc_non_disseminatable_work_ids + .insert(marc_ineligible_id); + let export_server = spawn_export_server(export_state).await; + + let app = test::init_service( + App::new() + .app_data(web::Data::new(AppState { + service: OaiService::new( + "https://example.org".to_string(), + format!("{}/graphql", graphql_server.base_url), + export_server.base_url.clone(), + ), + retry_after_seconds: DEFAULT_RETRY_AFTER_SECONDS, + })) + .service(web::resource("/oai").route(web::get().to(oai_get))), + ) 
+ .await; + + let eligible_req = test::TestRequest::get() + .uri(&format!( + "/oai?verb=ListMetadataFormats&identifier={}", + OaiService::oai_identifier(marc_eligible_id) + )) + .to_request(); + let eligible_response = test::call_service(&app, eligible_req).await; + let eligible_body = String::from_utf8(test::read_body(eligible_response).await.to_vec()) + .expect("eligible body UTF-8"); + assert!(eligible_body.contains("oai_dc")); + assert!(eligible_body.contains("oai_openaire")); + assert!(eligible_body.contains("marcxml")); + + let ineligible_req = test::TestRequest::get() + .uri(&format!( + "/oai?verb=ListMetadataFormats&identifier={}", + OaiService::oai_identifier(marc_ineligible_id) + )) + .to_request(); + let ineligible_response = test::call_service(&app, ineligible_req).await; + let ineligible_body = + String::from_utf8(test::read_body(ineligible_response).await.to_vec()) + .expect("ineligible body UTF-8"); + assert!(ineligible_body.contains("oai_dc")); + assert!(ineligible_body.contains("oai_openaire")); + assert!(!ineligible_body.contains("marcxml")); + + let invalid_identifier_req = test::TestRequest::get() + .uri( + "/oai?verb=ListMetadataFormats&identifier=oai:example.org:00000000-0000-0000-0000-000000000001", + ) + .to_request(); + let invalid_identifier_response = test::call_service(&app, invalid_identifier_req).await; + let invalid_identifier_body = + String::from_utf8(test::read_body(invalid_identifier_response).await.to_vec()) + .expect("invalid identifier body UTF-8"); + assert!(invalid_identifier_body.contains("")); + + export_server.stop().await; + graphql_server.stop().await; + } + + #[actix_web::test] + async fn list_metadata_formats_uses_export_dissemination_for_oai_prefixes() { + let work_id = Uuid::from_u128(12); + let works = vec![make_work( + work_id, + "2024-12-31T12:00:00Z", + PUBLISHER_NAME, + true, + true, + )]; + let graphql_server = spawn_graphql_server(mock_graphql_state(works)).await; + let mut export_state = 
MockExportState::default(); + export_state.non_disseminatable_work_ids.insert(work_id); + let export_server = spawn_export_server(export_state).await; + + let app = test::init_service( + App::new() + .app_data(web::Data::new(AppState { + service: OaiService::new( + "https://example.org".to_string(), + format!("{}/graphql", graphql_server.base_url), + export_server.base_url.clone(), + ), + retry_after_seconds: DEFAULT_RETRY_AFTER_SECONDS, + })) + .service(web::resource("/oai").route(web::get().to(oai_get))), + ) + .await; + + let req = test::TestRequest::get() + .uri(&format!( + "/oai?verb=ListMetadataFormats&identifier={}", + OaiService::oai_identifier(work_id) + )) + .to_request(); + let response = test::call_service(&app, req).await; + let body = String::from_utf8(test::read_body(response).await.to_vec()).expect("body UTF-8"); + assert!(body.contains("")); + + export_server.stop().await; + graphql_server.stop().await; + } + + #[actix_web::test] + async fn marc_export_parse_failures_are_mapped_to_cannot_disseminate_format() { + let work_id = Uuid::from_u128(20); + let works = vec![make_work( + work_id, + "2024-12-31T12:00:00Z", + PUBLISHER_NAME, + true, + true, + )]; + let graphql_server = spawn_graphql_server(mock_graphql_state(works)).await; + + let mut export_state = MockExportState::default(); + export_state.malformed_work_ids.insert(work_id); + let export_server = spawn_export_server(export_state).await; + + let app = test::init_service( + App::new() + .app_data(web::Data::new(AppState { + service: OaiService::new( + "https://example.org".to_string(), + format!("{}/graphql", graphql_server.base_url), + export_server.base_url.clone(), + ), + retry_after_seconds: DEFAULT_RETRY_AFTER_SECONDS, + })) + .service(web::resource("/oai").route(web::get().to(oai_get))), + ) + .await; + + let req = test::TestRequest::get() + .uri(&format!( + "/oai?verb=GetRecord&identifier={}&metadataPrefix=marcxml", + OaiService::oai_identifier(work_id) + )) + .to_request(); + let 
response = test::call_service(&app, req).await; + let body = String::from_utf8(test::read_body(response).await.to_vec()).expect("body UTF-8"); + assert!(body.contains("")); + + export_server.stop().await; + graphql_server.stop().await; + } + + #[actix_web::test] + async fn oai_dc_export_failures_are_mapped_to_cannot_disseminate_format() { + let work_id = Uuid::from_u128(22); + let works = vec![make_work( + work_id, + "2024-12-31T12:00:00Z", + PUBLISHER_NAME, + true, + true, + )]; + let graphql_server = spawn_graphql_server(mock_graphql_state(works)).await; + + let mut export_state = MockExportState::default(); + export_state.non_disseminatable_work_ids.insert(work_id); + let export_server = spawn_export_server(export_state).await; + + let app = test::init_service( + App::new() + .app_data(web::Data::new(AppState { + service: OaiService::new( + "https://example.org".to_string(), + format!("{}/graphql", graphql_server.base_url), + export_server.base_url.clone(), + ), + retry_after_seconds: DEFAULT_RETRY_AFTER_SECONDS, + })) + .service(web::resource("/oai").route(web::get().to(oai_get))), + ) + .await; + + let req = test::TestRequest::get() + .uri(&format!( + "/oai?verb=GetRecord&identifier={}&metadataPrefix=oai_dc", + OaiService::oai_identifier(work_id) + )) + .to_request(); + let response = test::call_service(&app, req).await; + let body = String::from_utf8(test::read_body(response).await.to_vec()).expect("body UTF-8"); + assert!(body.contains("")); + + export_server.stop().await; + graphql_server.stop().await; + } + + #[actix_web::test] + async fn list_identifiers_validates_datestamp_arguments() { + let graphql_server = + spawn_graphql_server(mock_graphql_state(make_descending_work_series(3))).await; + let export_server = spawn_export_server(MockExportState::default()).await; + + let app = test::init_service( + App::new() + .app_data(web::Data::new(AppState { + service: OaiService::new( + "https://example.org".to_string(), + format!("{}/graphql", 
graphql_server.base_url), + export_server.base_url.clone(), + ), + retry_after_seconds: DEFAULT_RETRY_AFTER_SECONDS, + })) + .service(web::resource("/oai").route(web::get().to(oai_get))), + ) + .await; + + let mismatch_req = test::TestRequest::get() + .uri( + "/oai?verb=ListIdentifiers&metadataPrefix=oai_dc&from=2024-01-01&until=2024-01-01T00:00:00Z", + ) + .to_request(); + let mismatch_response = test::call_service(&app, mismatch_req).await; + let mismatch_body = String::from_utf8(test::read_body(mismatch_response).await.to_vec()) + .expect("mismatch body UTF-8"); + assert!(mismatch_body.contains("")); + + let invalid_req = test::TestRequest::get() + .uri("/oai?verb=ListIdentifiers&metadataPrefix=oai_dc&from=20240101") + .to_request(); + let invalid_response = test::call_service(&app, invalid_req).await; + let invalid_body = String::from_utf8(test::read_body(invalid_response).await.to_vec()) + .expect("invalid body UTF-8"); + assert!(invalid_body.contains("")); + + let reversed_req = test::TestRequest::get() + .uri("/oai?verb=ListIdentifiers&metadataPrefix=oai_dc&from=2024-12-31&until=2024-01-01") + .to_request(); + let reversed_response = test::call_service(&app, reversed_req).await; + let reversed_body = String::from_utf8(test::read_body(reversed_response).await.to_vec()) + .expect("reversed body UTF-8"); + assert!(reversed_body.contains("")); + + export_server.stop().await; + graphql_server.stop().await; + } + + #[actix_web::test] + async fn list_identifiers_applies_date_filters_and_reports_no_records_match() { + let works = vec![ + make_work( + Uuid::from_u128(30), + "2024-03-01T12:00:00Z", + PUBLISHER_NAME, + true, + true, + ), + make_work( + Uuid::from_u128(31), + "2024-02-01T12:00:00Z", + PUBLISHER_NAME, + true, + true, + ), + make_work( + Uuid::from_u128(32), + "2023-12-31T12:00:00Z", + PUBLISHER_NAME, + true, + true, + ), + ]; + let graphql_server = spawn_graphql_server(mock_graphql_state(works)).await; + let export_server = 
spawn_export_server(MockExportState::default()).await; + + let app = test::init_service( + App::new() + .app_data(web::Data::new(AppState { + service: OaiService::new( + "https://example.org".to_string(), + format!("{}/graphql", graphql_server.base_url), + export_server.base_url.clone(), + ), + retry_after_seconds: DEFAULT_RETRY_AFTER_SECONDS, + })) + .service(web::resource("/oai").route(web::get().to(oai_get))), + ) + .await; + + let from_req = test::TestRequest::get() + .uri("/oai?verb=ListIdentifiers&metadataPrefix=oai_dc&from=2024-01-01") + .to_request(); + let from_response = test::call_service(&app, from_req).await; + let from_body = String::from_utf8(test::read_body(from_response).await.to_vec()) + .expect("from body UTF-8"); + assert_eq!(count_occurrences(&from_body, "
"), 2); + + let until_req = test::TestRequest::get() + .uri("/oai?verb=ListIdentifiers&metadataPrefix=oai_dc&until=2024-01-31") + .to_request(); + let until_response = test::call_service(&app, until_req).await; + let until_body = String::from_utf8(test::read_body(until_response).await.to_vec()) + .expect("until body UTF-8"); + assert_eq!(count_occurrences(&until_body, "
"), 1); + + let range_req = test::TestRequest::get() + .uri("/oai?verb=ListIdentifiers&metadataPrefix=oai_dc&from=2024-01-01&until=2024-02-15") + .to_request(); + let range_response = test::call_service(&app, range_req).await; + let range_body = String::from_utf8(test::read_body(range_response).await.to_vec()) + .expect("range body UTF-8"); + assert_eq!(count_occurrences(&range_body, "
"), 1); + + let no_match_req = test::TestRequest::get() + .uri("/oai?verb=ListIdentifiers&metadataPrefix=oai_dc&from=2030-01-01") + .to_request(); + let no_match_response = test::call_service(&app, no_match_req).await; + let no_match_body = String::from_utf8(test::read_body(no_match_response).await.to_vec()) + .expect("no match body UTF-8"); + assert!(no_match_body.contains("")); + + export_server.stop().await; + graphql_server.stop().await; + } + + #[actix_web::test] + async fn resumption_tokens_support_filters_backward_compatibility_and_terminal_token() { + let works = make_descending_work_series(60); + let graphql_server = spawn_graphql_server(mock_graphql_state(works)).await; + let export_server = spawn_export_server(MockExportState::default()).await; + + let app = test::init_service( + App::new() + .app_data(web::Data::new(AppState { + service: OaiService::new( + "https://example.org".to_string(), + format!("{}/graphql", graphql_server.base_url), + export_server.base_url.clone(), + ), + retry_after_seconds: DEFAULT_RETRY_AFTER_SECONDS, + })) + .service(web::resource("/oai").route(web::get().to(oai_get))), + ) + .await; + + let filtered_first_req = test::TestRequest::get() + .uri("/oai?verb=ListIdentifiers&metadataPrefix=oai_dc&from=2024-11-10&until=2024-12-31") + .to_request(); + let filtered_first_response = test::call_service(&app, filtered_first_req).await; + let filtered_first_body = + String::from_utf8(test::read_body(filtered_first_response).await.to_vec()) + .expect("first filtered body UTF-8"); + assert_eq!(count_occurrences(&filtered_first_body, "
"), 50); + assert!(filtered_first_body.contains("")); + assert!(!filtered_first_body.contains("completeListSize=\"")); + + let filtered_token = + extract_resumption_token(&filtered_first_body).expect("filtered resumption token"); + let decoded_filtered = + decode_resumption_token(&filtered_token).expect("decode filtered resumption token"); + assert_eq!(decoded_filtered.from.as_deref(), Some("2024-11-10")); + assert_eq!(decoded_filtered.until.as_deref(), Some("2024-12-31")); + assert_eq!( + decoded_filtered.granularity, + Some(DatestampGranularity::Day) + ); + assert!(decoded_filtered.scan_offset.is_some()); + assert_eq!(decoded_filtered.returned_count, Some(50)); + + let filtered_second_req = test::TestRequest::get() + .uri(&format!( + "/oai?verb=ListIdentifiers&resumptionToken={filtered_token}" + )) + .to_request(); + let filtered_second_response = test::call_service(&app, filtered_second_req).await; + let filtered_second_body = + String::from_utf8(test::read_body(filtered_second_response).await.to_vec()) + .expect("second filtered body UTF-8"); + assert_eq!(count_occurrences(&filtered_second_body, "
"), 2); + assert!(filtered_second_body.contains("")); + + let unfiltered_req = test::TestRequest::get() + .uri("/oai?verb=ListIdentifiers&metadataPrefix=oai_dc") + .to_request(); + let unfiltered_response = test::call_service(&app, unfiltered_req).await; + let unfiltered_body = + String::from_utf8(test::read_body(unfiltered_response).await.to_vec()) + .expect("unfiltered body UTF-8"); + assert!(unfiltered_body.contains("completeListSize=\"60\"")); + + let legacy_token = URL_SAFE_NO_PAD.encode( + serde_json::to_vec(&json!({ + "offset": 0, + "metadata_prefix": "OaiDc", + "set": null, + "identifiers_only": true + })) + .expect("legacy token serialize"), + ); + let legacy_req = test::TestRequest::get() + .uri(&format!( + "/oai?verb=ListIdentifiers&resumptionToken={legacy_token}" + )) + .to_request(); + let legacy_response = test::call_service(&app, legacy_req).await; + let legacy_body = String::from_utf8(test::read_body(legacy_response).await.to_vec()) + .expect("legacy response body UTF-8"); + assert!(legacy_body.contains("")); + + let malformed_req = test::TestRequest::get() + .uri("/oai?verb=ListIdentifiers&resumptionToken=not-a-token") + .to_request(); + let malformed_response = test::call_service(&app, malformed_req).await; + let malformed_body = String::from_utf8(test::read_body(malformed_response).await.to_vec()) + .expect("malformed response body UTF-8"); + assert!(malformed_body.contains("")); + + export_server.stop().await; + graphql_server.stop().await; + } + + #[actix_web::test] + async fn filtered_resumption_cursor_tracks_returned_records() { + let works = make_descending_work_series(120); + let graphql_server = spawn_graphql_server(mock_graphql_state(works)).await; + let export_server = spawn_export_server(MockExportState::default()).await; + + let app = test::init_service( + App::new() + .app_data(web::Data::new(AppState { + service: OaiService::new( + "https://example.org".to_string(), + format!("{}/graphql", graphql_server.base_url), + 
export_server.base_url.clone(), + ), + retry_after_seconds: DEFAULT_RETRY_AFTER_SECONDS, + })) + .service(web::resource("/oai").route(web::get().to(oai_get))), + ) + .await; + + let first_req = test::TestRequest::get() + .uri("/oai?verb=ListIdentifiers&metadataPrefix=oai_dc&from=2024-09-13&until=2024-12-31") + .to_request(); + let first_response = test::call_service(&app, first_req).await; + let first_body = String::from_utf8(test::read_body(first_response).await.to_vec()) + .expect("first page UTF-8"); + assert!(first_body.contains("")); + let first_token = extract_resumption_token(&first_body).expect("first token"); + + let second_req = test::TestRequest::get() + .uri(&format!( + "/oai?verb=ListIdentifiers&resumptionToken={first_token}" + )) + .to_request(); + let second_response = test::call_service(&app, second_req).await; + let second_body = String::from_utf8(test::read_body(second_response).await.to_vec()) + .expect("second page UTF-8"); + assert_eq!(count_occurrences(&second_body, "
"), 50); + assert!(second_body.contains("")); + let second_token = extract_resumption_token(&second_body).expect("second token"); + let decoded_second = decode_resumption_token(&second_token).unwrap(); + assert_eq!(decoded_second.returned_count, Some(100)); + + let third_req = test::TestRequest::get() + .uri(&format!( + "/oai?verb=ListIdentifiers&resumptionToken={second_token}" + )) + .to_request(); + let third_response = test::call_service(&app, third_req).await; + let third_body = String::from_utf8(test::read_body(third_response).await.to_vec()) + .expect("third page UTF-8"); + assert_eq!(count_occurrences(&third_body, "
"), 10); + assert!(third_body.contains("")); + + export_server.stop().await; + graphql_server.stop().await; + } + + #[actix_web::test] + async fn continuation_end_returns_terminal_token_without_no_records_match() { + let works = make_descending_work_series(120); + let graphql_server = spawn_graphql_server(mock_graphql_state(works)).await; + let export_server = spawn_export_server(MockExportState::default()).await; + + let app = test::init_service( + App::new() + .app_data(web::Data::new(AppState { + service: OaiService::new( + "https://example.org".to_string(), + format!("{}/graphql", graphql_server.base_url), + export_server.base_url.clone(), + ), + retry_after_seconds: DEFAULT_RETRY_AFTER_SECONDS, + })) + .service(web::resource("/oai").route(web::get().to(oai_get))), + ) + .await; + + let first_req = test::TestRequest::get() + .uri("/oai?verb=ListIdentifiers&metadataPrefix=oai_dc&from=2024-11-12&until=2024-12-31") + .to_request(); + let first_response = test::call_service(&app, first_req).await; + let first_body = String::from_utf8(test::read_body(first_response).await.to_vec()) + .expect("first page UTF-8"); + assert_eq!(count_occurrences(&first_body, "
"), 50); + assert!(!first_body.contains("")); + assert!(continuation_body.contains("")); + assert!(!continuation_body.contains("")); + + export_server.stop().await; + graphql_server.stop().await; + } + + #[actix_web::test] + async fn gzip_accept_encoding_returns_compressed_oai_xml() { + let works = make_descending_work_series(1); + let graphql_server = spawn_graphql_server(mock_graphql_state(works)).await; + let export_server = spawn_export_server(MockExportState::default()).await; + + let app = test::init_service( + App::new() + .app_data(web::Data::new(AppState { + service: OaiService::new( + "https://example.org".to_string(), + format!("{}/graphql", graphql_server.base_url), + export_server.base_url.clone(), + ), + retry_after_seconds: DEFAULT_RETRY_AFTER_SECONDS, + })) + .service(web::resource("/oai").route(web::get().to(oai_get))), + ) + .await; + + let req = test::TestRequest::get() + .uri("/oai?verb=Identify") + .insert_header((header::ACCEPT_ENCODING, "gzip")) + .to_request(); + let response = test::call_service(&app, req).await; + assert_eq!(response.status(), actix_web::http::StatusCode::OK); + assert_eq!( + response + .headers() + .get(header::CONTENT_ENCODING) + .and_then(|value| value.to_str().ok()), + Some("gzip") + ); + + let compressed = test::read_body(response).await; + let mut decoder = GzDecoder::new(compressed.as_ref()); + let mut xml = String::new(); + decoder + .read_to_string(&mut xml) + .expect("gzip decode response"); + assert!(xml.contains("")); + assert!(xml.contains("gzip")); + + export_server.stop().await; + graphql_server.stop().await; + } + + #[actix_web::test] + async fn transient_graphql_failures_return_503_with_retry_after() { + let graphql_server = + spawn_graphql_error_server(actix_web::http::StatusCode::SERVICE_UNAVAILABLE).await; + let export_server = spawn_export_server(MockExportState::default()).await; + + let app = test::init_service( + App::new() + .app_data(web::Data::new(AppState { + service: OaiService::new( + 
"https://example.org".to_string(), + format!("{}/graphql", graphql_server.base_url), + export_server.base_url.clone(), + ), + retry_after_seconds: 45, + })) + .service(web::resource("/oai").route(web::get().to(oai_get))), + ) + .await; + + let req = test::TestRequest::get() + .uri("/oai?verb=Identify") + .to_request(); + let response = test::call_service(&app, req).await; + assert_eq!( + response.status(), + actix_web::http::StatusCode::SERVICE_UNAVAILABLE + ); + assert_eq!( + response + .headers() + .get(header::RETRY_AFTER) + .and_then(|value| value.to_str().ok()), + Some("45") + ); + + export_server.stop().await; + graphql_server.stop().await; + } + + #[actix_web::test] + async fn non_transient_graphql_failures_remain_http_500() { + let graphql_server = + spawn_graphql_error_server(actix_web::http::StatusCode::BAD_REQUEST).await; + let export_server = spawn_export_server(MockExportState::default()).await; + + let app = test::init_service( + App::new() + .app_data(web::Data::new(AppState { + service: OaiService::new( + "https://example.org".to_string(), + format!("{}/graphql", graphql_server.base_url), + export_server.base_url.clone(), + ), + retry_after_seconds: DEFAULT_RETRY_AFTER_SECONDS, + })) + .service(web::resource("/oai").route(web::get().to(oai_get))), + ) + .await; + + let req = test::TestRequest::get() + .uri("/oai?verb=Identify") + .to_request(); + let response = test::call_service(&app, req).await; + assert_eq!( + response.status(), + actix_web::http::StatusCode::INTERNAL_SERVER_ERROR + ); + assert!(response.headers().get(header::RETRY_AFTER).is_none()); + + export_server.stop().await; + graphql_server.stop().await; + } + + #[actix_web::test] + async fn transient_export_failures_return_503_with_retry_after() { + let work_id = Uuid::from_u128(21); + let works = vec![make_work( + work_id, + "2024-12-31T12:00:00Z", + PUBLISHER_NAME, + true, + true, + )]; + let graphql_server = spawn_graphql_server(mock_graphql_state(works)).await; + + let mut 
export_state = MockExportState::default(); + export_state.failing_work_ids.insert(work_id); + let export_server = spawn_export_server(export_state).await; + + let app = test::init_service( + App::new() + .app_data(web::Data::new(AppState { + service: OaiService::new( + "https://example.org".to_string(), + format!("{}/graphql", graphql_server.base_url), + export_server.base_url.clone(), + ), + retry_after_seconds: DEFAULT_RETRY_AFTER_SECONDS, + })) + .service(web::resource("/oai").route(web::get().to(oai_get))), + ) + .await; + + let req = test::TestRequest::get() + .uri(&format!( + "/oai?verb=GetRecord&identifier={}&metadataPrefix=marcxml", + OaiService::oai_identifier(work_id) + )) + .to_request(); + let response = test::call_service(&app, req).await; + assert_eq!( + response.status(), + actix_web::http::StatusCode::SERVICE_UNAVAILABLE + ); + assert_eq!( + response + .headers() + .get(header::RETRY_AFTER) + .and_then(|value| value.to_str().ok()), + Some("30") + ); + + export_server.stop().await; + graphql_server.stop().await; + } + + #[actix_web::test] + async fn marcxml_list_filters_use_export_dissemination_truth() { + let disseminatable_work_id = Uuid::from_u128(31); + let non_disseminatable_work_id = Uuid::from_u128(32); + let second_disseminatable_work_id = Uuid::from_u128(33); + let works = vec![ + make_work( + disseminatable_work_id, + "2024-12-31T12:00:00Z", + PUBLISHER_NAME, + true, + true, + ), + make_work( + non_disseminatable_work_id, + "2024-12-30T12:00:00Z", + PUBLISHER_NAME, + true, + true, + ), + make_work( + second_disseminatable_work_id, + "2024-12-29T12:00:00Z", + PUBLISHER_NAME, + true, + true, + ), + ]; + let graphql_server = spawn_graphql_server(mock_graphql_state(works)).await; + + let mut export_state = MockExportState::default(); + export_state + .marc_non_disseminatable_work_ids + .insert(non_disseminatable_work_id); + let export_server = spawn_export_server(export_state).await; + + let app = test::init_service( + App::new() + 
.app_data(web::Data::new(AppState { + service: OaiService::new( + "https://example.org".to_string(), + format!("{}/graphql", graphql_server.base_url), + export_server.base_url.clone(), + ), + retry_after_seconds: DEFAULT_RETRY_AFTER_SECONDS, + })) + .service(web::resource("/oai").route(web::get().to(oai_get))), + ) + .await; + + let req = test::TestRequest::get() + .uri("/oai?verb=ListIdentifiers&metadataPrefix=marcxml") + .to_request(); + let response = test::call_service(&app, req).await; + assert_eq!(response.status(), actix_web::http::StatusCode::OK); + let body = String::from_utf8(test::read_body(response).await.to_vec()).expect("body UTF-8"); + + assert_eq!(count_occurrences(&body, "
"), 2); + assert!(body.contains(&OaiService::oai_identifier(disseminatable_work_id))); + assert!(body.contains(&OaiService::oai_identifier(second_disseminatable_work_id))); + assert!(!body.contains(&OaiService::oai_identifier(non_disseminatable_work_id))); + + export_server.stop().await; + graphql_server.stop().await; + } + + #[actix_web::test] + async fn oai_dc_list_filters_use_export_dissemination_truth() { + let visible_work_id = Uuid::from_u128(37); + let hidden_work_id = Uuid::from_u128(38); + let works = vec![ + make_work( + visible_work_id, + "2024-12-31T12:00:00Z", + PUBLISHER_NAME, + true, + true, + ), + make_work( + hidden_work_id, + "2024-12-30T12:00:00Z", + PUBLISHER_NAME, + true, + true, + ), + ]; + let graphql_server = spawn_graphql_server(mock_graphql_state(works)).await; + + let mut export_state = MockExportState::default(); + export_state + .non_disseminatable_work_ids + .insert(hidden_work_id); + let export_server = spawn_export_server(export_state).await; + + let app = test::init_service( + App::new() + .app_data(web::Data::new(AppState { + service: OaiService::new( + "https://example.org".to_string(), + format!("{}/graphql", graphql_server.base_url), + export_server.base_url.clone(), + ), + retry_after_seconds: DEFAULT_RETRY_AFTER_SECONDS, + })) + .service(web::resource("/oai").route(web::get().to(oai_get))), + ) + .await; + + let req = test::TestRequest::get() + .uri("/oai?verb=ListIdentifiers&metadataPrefix=oai_dc") + .to_request(); + let response = test::call_service(&app, req).await; + let body = String::from_utf8(test::read_body(response).await.to_vec()).expect("body UTF-8"); + + assert_eq!(count_occurrences(&body, "
"), 1); + assert!(body.contains(&OaiService::oai_identifier(visible_work_id))); + assert!(!body.contains(&OaiService::oai_identifier(hidden_work_id))); + + export_server.stop().await; + graphql_server.stop().await; + } + + #[actix_web::test] + async fn marcxml_list_records_excludes_non_disseminatable_records() { + let visible_work_id = Uuid::from_u128(35); + let hidden_work_id = Uuid::from_u128(36); + let works = vec![ + make_work( + visible_work_id, + "2024-12-31T12:00:00Z", + PUBLISHER_NAME, + true, + true, + ), + make_work( + hidden_work_id, + "2024-12-30T12:00:00Z", + PUBLISHER_NAME, + true, + true, + ), + ]; + let graphql_server = spawn_graphql_server(mock_graphql_state(works)).await; + + let mut export_state = MockExportState::default(); + export_state + .marc_non_disseminatable_work_ids + .insert(hidden_work_id); + let export_server = spawn_export_server(export_state).await; + + let app = test::init_service( + App::new() + .app_data(web::Data::new(AppState { + service: OaiService::new( + "https://example.org".to_string(), + format!("{}/graphql", graphql_server.base_url), + export_server.base_url.clone(), + ), + retry_after_seconds: DEFAULT_RETRY_AFTER_SECONDS, + })) + .service(web::resource("/oai").route(web::get().to(oai_get))), + ) + .await; + + let req = test::TestRequest::get() + .uri("/oai?verb=ListRecords&metadataPrefix=marcxml") + .to_request(); + let response = test::call_service(&app, req).await; + let body = String::from_utf8(test::read_body(response).await.to_vec()).expect("body UTF-8"); + + assert_eq!(count_occurrences(&body, "
"), 1); + assert!(body.contains(&OaiService::oai_identifier(visible_work_id))); + assert!(!body.contains(&OaiService::oai_identifier(hidden_work_id))); + + export_server.stop().await; + graphql_server.stop().await; + } + + #[actix_web::test] + async fn marcxml_list_resumption_respects_export_dissemination_filtering() { + let works = make_descending_work_series(80); + let graphql_server = spawn_graphql_server(mock_graphql_state(works.clone())).await; + + let mut export_state = MockExportState::default(); + for (index, work) in works.iter().enumerate() { + let work_id = work + .get("workId") + .and_then(Value::as_str) + .and_then(|value| Uuid::parse_str(value).ok()) + .expect("work id"); + if index % 4 == 0 { + export_state + .marc_non_disseminatable_work_ids + .insert(work_id); + } + } + let export_server = spawn_export_server(export_state).await; + + let app = test::init_service( + App::new() + .app_data(web::Data::new(AppState { + service: OaiService::new( + "https://example.org".to_string(), + format!("{}/graphql", graphql_server.base_url), + export_server.base_url.clone(), + ), + retry_after_seconds: DEFAULT_RETRY_AFTER_SECONDS, + })) + .service(web::resource("/oai").route(web::get().to(oai_get))), + ) + .await; + + let first_req = test::TestRequest::get() + .uri("/oai?verb=ListIdentifiers&metadataPrefix=marcxml") + .to_request(); + let first_response = test::call_service(&app, first_req).await; + let first_body = String::from_utf8(test::read_body(first_response).await.to_vec()) + .expect("first page UTF-8"); + assert_eq!(count_occurrences(&first_body, "
"), 50); + assert!(first_body.contains("")); + let first_token = extract_resumption_token(&first_body).expect("resumption token"); + + let second_req = test::TestRequest::get() + .uri(&format!( + "/oai?verb=ListIdentifiers&resumptionToken={first_token}" + )) + .to_request(); + let second_response = test::call_service(&app, second_req).await; + let second_body = String::from_utf8(test::read_body(second_response).await.to_vec()) + .expect("second page UTF-8"); + assert_eq!(count_occurrences(&second_body, "
"), 10); + assert!(second_body.contains("")); + + export_server.stop().await; + graphql_server.stop().await; + } + + #[actix_web::test] + async fn transient_export_failures_in_marcxml_lists_return_503_with_retry_after() { + let work_id = Uuid::from_u128(34); + let works = vec![make_work( + work_id, + "2024-12-31T12:00:00Z", + PUBLISHER_NAME, + true, + true, + )]; + let graphql_server = spawn_graphql_server(mock_graphql_state(works)).await; + + let mut export_state = MockExportState::default(); + export_state.failing_work_ids.insert(work_id); + let export_server = spawn_export_server(export_state).await; + + let app = test::init_service( + App::new() + .app_data(web::Data::new(AppState { + service: OaiService::new( + "https://example.org".to_string(), + format!("{}/graphql", graphql_server.base_url), + export_server.base_url.clone(), + ), + retry_after_seconds: DEFAULT_RETRY_AFTER_SECONDS, + })) + .service(web::resource("/oai").route(web::get().to(oai_get))), + ) + .await; + + let req = test::TestRequest::get() + .uri("/oai?verb=ListIdentifiers&metadataPrefix=marcxml") + .to_request(); + let response = test::call_service(&app, req).await; + assert_eq!( + response.status(), + actix_web::http::StatusCode::SERVICE_UNAVAILABLE + ); + assert_eq!( + response + .headers() + .get(header::RETRY_AFTER) + .and_then(|value| value.to_str().ok()), + Some("30") + ); + + export_server.stop().await; + graphql_server.stop().await; + } +} diff --git a/thoth-oai-server/src/service.rs b/thoth-oai-server/src/service.rs new file mode 100644 index 000000000..b5b9df7fe --- /dev/null +++ b/thoth-oai-server/src/service.rs @@ -0,0 +1,536 @@ +use std::{ + collections::{HashMap, VecDeque}, + sync::{Arc, Mutex}, +}; + +use oai_pmh::core::MetadataPrefix; +use quick_xml::{events::Event, Reader, Writer}; +use reqwest::Client; +use thoth_api::model::Timestamp; +use thoth_client::{Publisher, QueryParameters, ThothClient, Work}; +use thoth_errors::{ThothError, ThothResult}; +use uuid::Uuid; + 
+pub(crate) const RECORD_PREFIX: &str = "oai:thoth.pub"; +pub(crate) const REPOSITORY_NAME: &str = "Thoth OAI-PMH Repository"; +pub(crate) const ADMIN_EMAIL: &str = "support@thoth.pub"; +pub(crate) const SAMPLE_ID: &str = "5a08ff03-7d53-42a9-bfb5-7fc81c099c52"; +#[cfg(test)] +pub(crate) const PAGE_LIMIT: i64 = 50; + +const OAI_DC_SPEC: &str = "dublin_core::thoth"; +const OAI_OPENAIRE_SPEC: &str = "openaire::thoth"; +const MARCXML_SPEC: &str = "marc21xml::thoth"; +const DELEGATED_RECORD_CACHE_LIMIT: usize = 2048; +type DelegatedRecordCacheKey = (Uuid, &'static str); + +#[derive(Default)] +struct DelegatedRecordCache { + entries: HashMap, + insertion_order: VecDeque, +} + +impl DelegatedRecordCache { + fn get(&self, key: &DelegatedRecordCacheKey) -> Option { + self.entries.get(key).cloned() + } + + fn insert(&mut self, key: DelegatedRecordCacheKey, value: String) { + match self.entries.entry(key) { + std::collections::hash_map::Entry::Occupied(mut entry) => { + entry.insert(value); + return; + } + std::collections::hash_map::Entry::Vacant(entry) => { + self.insertion_order.push_back(*entry.key()); + entry.insert(value); + } + } + + while self.entries.len() > DELEGATED_RECORD_CACHE_LIMIT { + if let Some(oldest_key) = self.insertion_order.pop_front() { + self.entries.remove(&oldest_key); + } else { + break; + } + } + } +} + +#[derive(Clone)] +pub(crate) struct OaiService { + public_url: String, + export_url: String, + thoth_client: Arc, + export_client: Client, + delegated_record_cache: Arc>, +} + +#[derive(Debug, Clone)] +pub(crate) struct SetRecord { + pub publisher_id: Uuid, + pub spec: String, + pub name: String, +} + +impl OaiService { + pub(crate) fn new(public_url: String, gql_endpoint: String, export_url: String) -> Self { + Self { + public_url, + export_url, + thoth_client: Arc::new(ThothClient::new(gql_endpoint)), + export_client: Client::new(), + delegated_record_cache: Arc::new(Mutex::new(DelegatedRecordCache::default())), + } + } + + pub(crate) fn 
repository_url(&self) -> String { + self.public_url.trim_end_matches('/').to_string() + } + + pub(crate) async fn earliest(&self) -> ThothResult { + self.thoth_client.get_oai_earliest_works_updated().await + } + + pub(crate) async fn latest(&self) -> ThothResult { + self.thoth_client.get_oai_latest_works_updated().await + } + + pub(crate) async fn list_sets(&self) -> ThothResult> { + let publishers = self.thoth_client.get_publishers().await?; + Ok(publishers.into_iter().map(Self::to_set_record).collect()) + } + + pub(crate) async fn get_record( + &self, + identifier: Uuid, + _metadata_prefix: MetadataPrefix, + ) -> ThothResult { + self.thoth_client + .get_work(identifier, Self::query_parameters()) + .await + } + + pub(crate) async fn list_source_count( + &self, + metadata_prefix: MetadataPrefix, + set_spec: Option<&str>, + ) -> ThothResult { + let publishers = self.publishers_for_set(set_spec).await?; + if metadata_prefix == MetadataPrefix::MarcXml { + self.thoth_client.get_oai_book_count(publishers).await + } else { + self.thoth_client.get_oai_work_count(publishers).await + } + } + + pub(crate) async fn list_source_batch( + &self, + metadata_prefix: MetadataPrefix, + set_spec: Option<&str>, + offset: i64, + limit: i64, + ) -> ThothResult> { + let publishers = self.publishers_for_set(set_spec).await?; + if metadata_prefix == MetadataPrefix::MarcXml { + self.thoth_client + .get_oai_books(publishers, limit, offset, Self::query_parameters()) + .await + } else { + self.thoth_client + .get_oai_works(publishers, limit, offset, Self::query_parameters()) + .await + } + } + + pub(crate) async fn get_marcxml_record(&self, work_id: Uuid) -> ThothResult { + self.get_delegated_record( + work_id, + MetadataPrefix::MarcXml, + MARCXML_SPEC, + b"record", + "MARCXML", + ) + .await + } + + pub(crate) async fn get_oai_dc_record(&self, work_id: Uuid) -> ThothResult { + self.get_delegated_record( + work_id, + MetadataPrefix::OaiDc, + OAI_DC_SPEC, + b"dc", + "Dublin Core", + ) + .await + 
} + + pub(crate) async fn get_oai_openaire_record(&self, work_id: Uuid) -> ThothResult { + self.get_delegated_record( + work_id, + MetadataPrefix::OaiOpenaire, + OAI_OPENAIRE_SPEC, + b"resource", + "OpenAIRE", + ) + .await + } + + pub(crate) async fn has_metadata_dissemination( + &self, + work_id: Uuid, + metadata_prefix: MetadataPrefix, + ) -> ThothResult { + let dissemination = match metadata_prefix { + MetadataPrefix::OaiDc => self.get_oai_dc_record(work_id).await, + MetadataPrefix::OaiOpenaire => self.get_oai_openaire_record(work_id).await, + MetadataPrefix::MarcXml => self.get_marcxml_record(work_id).await, + }; + + match dissemination { + Ok(_) => Ok(true), + Err(error) if Self::is_transient_export_error(&error) => Err(error), + Err(_) => Ok(false), + } + } + + async fn get_delegated_record( + &self, + work_id: Uuid, + metadata_prefix: MetadataPrefix, + specification: &str, + element_local_name: &[u8], + format_name: &str, + ) -> ThothResult { + let cache_key = (work_id, metadata_prefix.as_str()); + if let Some(record) = self.get_cached_delegated_record(&cache_key) { + return Ok(record); + } + + let response = self + .export_client + .get(format!( + "{}/specifications/{}/work/{}", + self.export_url.trim_end_matches('/'), + specification, + work_id + )) + .send() + .await + .map_err(|error| ThothError::RequestError(error.to_string()))?; + + let status = response.status(); + let body = response + .text() + .await + .map_err(|error| ThothError::RequestError(error.to_string()))?; + if !status.is_success() { + return Err(ThothError::RequestError(format!( + "Export {}: {}", + status.as_u16(), + body + ))); + } + + let record = Self::extract_xml_element(&body, element_local_name, format_name)?; + self.cache_delegated_record(cache_key, record.clone()); + Ok(record) + } + + fn get_cached_delegated_record(&self, key: &DelegatedRecordCacheKey) -> Option { + self.delegated_record_cache + .lock() + .ok() + .and_then(|cache| cache.get(key)) + } + + fn 
cache_delegated_record(&self, key: DelegatedRecordCacheKey, value: String) { + if let Ok(mut cache) = self.delegated_record_cache.lock() { + cache.insert(key, value); + } + } + + pub(crate) fn oai_identifier(work_id: Uuid) -> String { + format!("{RECORD_PREFIX}:{work_id}") + } + + pub(crate) fn parse_oai_identifier(identifier: &str) -> ThothResult { + identifier + .strip_prefix(&format!("{RECORD_PREFIX}:")) + .ok_or(ThothError::InvalidUuid) + .and_then(|value| Uuid::parse_str(value).map_err(|_| ThothError::InvalidUuid)) + } + + pub(crate) fn timestamp_xml(timestamp: Timestamp) -> String { + timestamp.to_rfc3339().replace("+00:00", "Z") + } + + pub(crate) fn set_spec(publisher_name: &str) -> String { + publisher_name + .chars() + .filter(|ch| ch.is_alphanumeric() || ch.is_whitespace() || *ch == '_') + .collect::() + .to_lowercase() + .split_whitespace() + .collect::>() + .join("-") + } + + pub(crate) fn query_parameters() -> QueryParameters { + QueryParameters::new() + .with_all_abstracts() + .with_all_titles() + .with_issues() + .with_languages() + .with_publications() + .with_subjects() + .with_fundings() + .with_relations() + .with_references() + } + + async fn publishers_for_set(&self, set_spec: Option<&str>) -> ThothResult>> { + let set = self.find_set(set_spec).await?; + Ok(set.map(|set_record| vec![set_record.publisher_id])) + } + + async fn find_set(&self, set_spec: Option<&str>) -> ThothResult> { + let Some(set_spec) = set_spec else { + return Ok(None); + }; + + let sets = self.list_sets().await?; + sets.into_iter() + .find(|set_record| set_record.spec == set_spec) + .map(Some) + .ok_or(ThothError::EntityNotFound) + } + + fn to_set_record(publisher: Publisher) -> SetRecord { + SetRecord { + publisher_id: publisher.publisher_id, + spec: Self::set_spec(&publisher.publisher_name), + name: publisher.publisher_name, + } + } + + fn extract_xml_element( + body: &str, + element_local_name: &[u8], + format_name: &str, + ) -> ThothResult { + let mut reader = 
Reader::from_str(body); + reader.config_mut().trim_text(false); + let mut writer = Writer::new(Vec::new()); + let mut capture_depth = 0usize; + let mut capturing = false; + + loop { + match reader.read_event() { + Ok(Event::Start(event)) => { + let is_record = event.local_name().as_ref() == element_local_name; + if capturing { + capture_depth += 1; + writer + .write_event(Event::Start(event.to_owned())) + .map_err(|error| { + ThothError::InternalError(format!( + "Could not write {format_name}: {error}" + )) + })?; + } else if is_record { + capturing = true; + capture_depth = 1; + writer + .write_event(Event::Start(event.to_owned())) + .map_err(|error| { + ThothError::InternalError(format!( + "Could not write {format_name}: {error}" + )) + })?; + } + } + Ok(Event::Empty(event)) => { + let is_record = event.local_name().as_ref() == element_local_name; + if capturing || is_record { + writer + .write_event(Event::Empty(event.to_owned())) + .map_err(|error| { + ThothError::InternalError(format!( + "Could not write {format_name}: {error}" + )) + })?; + if is_record && !capturing { + return String::from_utf8(writer.into_inner()).map_err(|_| { + ThothError::InternalError(format!( + "Could not parse {format_name} XML" + )) + }); + } + } + } + Ok(Event::End(event)) => { + if capturing { + writer + .write_event(Event::End(event.to_owned())) + .map_err(|error| { + ThothError::InternalError(format!( + "Could not write {format_name}: {error}" + )) + })?; + capture_depth -= 1; + if capture_depth == 0 { + return String::from_utf8(writer.into_inner()).map_err(|_| { + ThothError::InternalError(format!( + "Could not parse {format_name} XML" + )) + }); + } + } + } + Ok(Event::Text(event)) => { + if capturing { + writer + .write_event(Event::Text(event.to_owned())) + .map_err(|error| { + ThothError::InternalError(format!( + "Could not write {format_name}: {error}" + )) + })?; + } + } + Ok(Event::CData(event)) => { + if capturing { + writer + .write_event(Event::CData(event.to_owned())) 
+ .map_err(|error| { + ThothError::InternalError(format!( + "Could not write {format_name}: {error}" + )) + })?; + } + } + Ok(Event::GeneralRef(event)) => { + if capturing { + writer + .write_event(Event::GeneralRef(event.to_owned())) + .map_err(|error| { + ThothError::InternalError(format!( + "Could not write {format_name}: {error}" + )) + })?; + } + } + Ok(Event::Comment(event)) => { + if capturing { + writer + .write_event(Event::Comment(event.to_owned())) + .map_err(|error| { + ThothError::InternalError(format!( + "Could not write {format_name}: {error}" + )) + })?; + } + } + Ok(Event::PI(event)) => { + if capturing { + writer + .write_event(Event::PI(event.to_owned())) + .map_err(|error| { + ThothError::InternalError(format!( + "Could not write {format_name}: {error}" + )) + })?; + } + } + Ok(Event::Decl(_)) | Ok(Event::DocType(_)) => {} + Ok(Event::Eof) => { + return Err(ThothError::InternalError(format!( + "No {format_name} element found" + ))); + } + Err(error) => { + return Err(ThothError::InternalError(format!( + "Could not parse {format_name} XML: {error}" + ))); + } + } + } + } + + fn is_transient_export_error(error: &ThothError) -> bool { + let ThothError::RequestError(message) = error else { + return false; + }; + let message = message.to_ascii_lowercase(); + let has_transient_status = [429, 500, 502, 503, 504] + .iter() + .any(|status| message.contains(&format!("export {status}"))); + let has_network_failure = [ + "timed out", + "timeout", + "connection refused", + "connection reset", + "error sending request", + "temporary failure", + "dns error", + "failed to lookup address", + ] + .iter() + .any(|needle| message.contains(needle)); + has_transient_status || has_network_failure + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn set_spec_normalizes_publisher_name() { + assert_eq!( + OaiService::set_spec("Punctum Books, Inc."), + "punctum-books-inc" + ); + assert_eq!( + OaiService::set_spec("Open Access_ Press"), + 
"open-access_-press" + ); + } + + #[test] + fn oai_identifier_round_trip() { + let work_id = Uuid::parse_str("5a08ff03-7d53-42a9-bfb5-7fc81c099c52").unwrap(); + let identifier = OaiService::oai_identifier(work_id); + + assert_eq!( + OaiService::parse_oai_identifier(&identifier).unwrap(), + work_id + ); + } + + #[test] + fn extract_xml_element_returns_record_element() { + let xml = r#" + + + 00000nam a2200000 i 4500 + 123 + +"#; + let record = OaiService::extract_xml_element(xml, b"record", "MARCXML").unwrap(); + + assert!(record.starts_with("00000nam a2200000 i 4500")); + assert!(record.contains("123")); + assert!(!record.contains(" + + + + + + + + + + diff --git a/thoth-oai-server/tests/fixtures/xsd/oai_openaire.xsd b/thoth-oai-server/tests/fixtures/xsd/oai_openaire.xsd new file mode 100644 index 000000000..f45c430b1 --- /dev/null +++ b/thoth-oai-server/tests/fixtures/xsd/oai_openaire.xsd @@ -0,0 +1,16 @@ + + + + + + + + + + +