From bd1a0421721ee9e7a3b2249241a28185a162d040 Mon Sep 17 00:00:00 2001 From: Joseph Godlewski Date: Wed, 8 Apr 2026 12:20:29 -0700 Subject: [PATCH 1/5] feat: add low-level Storage Buckets API client --- .../plans/2026-04-08-hf-bucket-rust-client.md | 2015 +++++++++++++++++ ...2026-04-08-hf-bucket-rust-client-design.md | 437 ++++ huggingface_hub/src/api/buckets.rs | 538 +++++ huggingface_hub/src/api/mod.rs | 1 + huggingface_hub/src/blocking.rs | 85 + huggingface_hub/src/error.rs | 22 + huggingface_hub/src/lib.rs | 2 +- huggingface_hub/src/repository.rs | 21 + huggingface_hub/src/types/buckets.rs | 291 +++ huggingface_hub/src/types/mod.rs | 2 + huggingface_hub/tests/integration_test.rs | 112 + 11 files changed, 3525 insertions(+), 1 deletion(-) create mode 100644 docs/plans/2026-04-08-hf-bucket-rust-client.md create mode 100644 docs/specs/2026-04-08-hf-bucket-rust-client-design.md create mode 100644 huggingface_hub/src/api/buckets.rs create mode 100644 huggingface_hub/src/types/buckets.rs diff --git a/docs/plans/2026-04-08-hf-bucket-rust-client.md b/docs/plans/2026-04-08-hf-bucket-rust-client.md new file mode 100644 index 0000000..2da48b1 --- /dev/null +++ b/docs/plans/2026-04-08-hf-bucket-rust-client.md @@ -0,0 +1,2015 @@ +# HFBucket Rust Client Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Add an `HFBucket` type and supporting infrastructure to `huggingface_hub_rust` that exposes the full HuggingFace Storage Buckets API as a typed async Rust client. + +**Architecture:** `HFBucket` is a standalone handle type (following the `HFSpace` precedent) holding an `HFClient` reference plus namespace and repo strings. Bucket methods are implemented in `api/buckets.rs`; types live in `types/buckets.rs`. 
A private `check_bucket_response` helper maps HTTP status codes — including four new `HFError` variants — for all bucket endpoints. + +**Tech Stack:** Rust, `reqwest` 0.13, `serde`/`serde_json`, `typed-builder`, `futures` (`try_unfold`), `tokio` + +**Spec:** `docs/specs/2026-04-08-hf-bucket-rust-client-design.md` in `huggingface/xet-catalogue` +**Target repo:** `/Users/jgodlew/git/huggingface/huggingface_hub_rust/` + +--- + +## File Map + +| Action | Path | +|--------|------| +| Create | `huggingface_hub/src/types/buckets.rs` | +| Create | `huggingface_hub/src/api/buckets.rs` | +| Modify | `huggingface_hub/src/error.rs` — add 4 new `HFError` variants | +| Modify | `huggingface_hub/src/types/mod.rs` — add `pub mod buckets; pub use buckets::*;` | +| Modify | `huggingface_hub/src/api/mod.rs` — add `pub mod buckets;` | +| Modify | `huggingface_hub/src/repository.rs` — add `HFBucket` struct | +| Modify | `huggingface_hub/src/client.rs` — add `bucket()`, `create_bucket()`, `list_buckets()` | +| Modify | `huggingface_hub/src/lib.rs` — export `HFBucket` and `HFBucketSync` | +| Modify | `huggingface_hub/src/blocking.rs` — add `HFBucketSync` and blocking wrappers | +| Modify | `huggingface_hub/tests/integration_test.rs` — add integration tests | + +--- + +## Task 1: Add new `HFError` variants + +**Files:** +- Modify: `huggingface_hub/src/error.rs` + +- [ ] **Step 1: Write a failing test that matches on the new variants** + +Add to the bottom of `huggingface_hub/src/error.rs`: + +```rust +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn new_error_variants_display() { + assert_eq!(HFError::Forbidden.to_string(), "forbidden"); + assert_eq!( + HFError::Conflict("name taken".to_string()).to_string(), + "conflict: name taken" + ); + assert_eq!(HFError::RateLimited.to_string(), "rate limited"); + assert_eq!(HFError::QuotaExceeded.to_string(), "quota exceeded"); + } +} +``` + +- [ ] **Step 2: Run the test to confirm it fails** + +```bash +cd 
/Users/jgodlew/git/huggingface/huggingface_hub_rust +cargo test -p huggingface_hub new_error_variants_display 2>&1 +``` + +Expected: compile error — `HFError::Forbidden` does not exist. + +- [ ] **Step 3: Add the four variants to `HFError`** + +In `huggingface_hub/src/error.rs`, locate the `HFError` enum and add after the last existing variant (before the closing `}`): + +```rust + #[error("forbidden")] + Forbidden, + #[error("conflict: {0}")] + Conflict(String), + #[error("rate limited")] + RateLimited, + #[error("quota exceeded")] + QuotaExceeded, +``` + +- [ ] **Step 4: Run the test to confirm it passes** + +```bash +cargo test -p huggingface_hub new_error_variants_display 2>&1 +``` + +Expected: `test error::tests::new_error_variants_display ... ok` + +- [ ] **Step 5: Commit** + +```bash +cd /Users/jgodlew/git/huggingface/huggingface_hub_rust +git add huggingface_hub/src/error.rs +git commit -m "feat(error): add Forbidden, Conflict, RateLimited, QuotaExceeded variants" +``` + +--- + +## Task 2: Create `types/buckets.rs` and wire it in + +**Files:** +- Create: `huggingface_hub/src/types/buckets.rs` +- Modify: `huggingface_hub/src/types/mod.rs` + +- [ ] **Step 1: Write a failing test for type deserialization** + +Create `huggingface_hub/src/types/buckets.rs` with only the test module: + +```rust +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn bucket_info_deserializes() { + let json = r#"{ + "id": "my-bucket", + "name": "my-bucket", + "namespace": "myuser", + "private": false, + "usedStorage": 1024, + "totalFiles": 3, + "cdn": [], + "region": "us-east-1" + }"#; + let info: BucketInfo = serde_json::from_str(json).unwrap(); + assert_eq!(info.used_storage, 1024); + assert_eq!(info.total_files, 3); + } + + #[test] + fn bucket_overview_deserializes() { + let json = r#"{ + "_id": "66079f1a2e4b3c001a2b3c4d", + "id": "myuser/my-bucket", + "author": "myuser", + "private": false, + "repoType": "bucket", + "createdAt": "2024-03-30T12:00:00.000Z", + "updatedAt": 
"2024-03-31T08:30:00.000Z", + "size": 104857600, + "totalFiles": 42, + "cdnRegions": [{"provider": "gcp", "region": "us"}], + "resourceGroup": {"id": "abc", "name": "ml-team", "numUsers": 5} + }"#; + let overview: BucketOverview = serde_json::from_str(json).unwrap(); + assert_eq!(overview.id, "myuser/my-bucket"); + assert_eq!(overview.total_files, 42); + assert_eq!(overview.resource_group.unwrap().name, "ml-team"); + } + + #[test] + fn batch_op_serializes_with_type_tag() { + let op = BatchOp::AddFile(AddFileOp { + path: "data/train.parquet".to_string(), + xet_hash: "abc123".to_string(), + content_type: "application/octet-stream".to_string(), + mtime: Some(1711900000), + }); + let s = serde_json::to_string(&op).unwrap(); + assert!(s.contains(r#""type":"addFile""#)); + assert!(s.contains(r#""xetHash":"abc123""#)); + } + + #[test] + fn delete_op_serializes_with_type_tag() { + let op = BatchOp::DeleteFile(DeleteFileOp { + path: "old.parquet".to_string(), + }); + let s = serde_json::to_string(&op).unwrap(); + assert!(s.contains(r#""type":"deleteFile""#)); + } + + #[test] + fn tree_entry_deserializes_file() { + let json = r#"{ + "type": "file", + "path": "data/train.parquet", + "size": 52428800, + "xetHash": "abc123", + "contentType": "application/octet-stream" + }"#; + let entry: TreeEntry = serde_json::from_str(json).unwrap(); + assert!(matches!(entry.entry_type, EntryType::File)); + assert_eq!(entry.xet_hash.unwrap(), "abc123"); + } +} +``` + +- [ ] **Step 2: Run the test to confirm it fails** + +```bash +cargo test -p huggingface_hub bucket_info_deserializes 2>&1 +``` + +Expected: compile error — `BucketInfo` not found. 
+ +- [ ] **Step 3: Add all types above the test module in `types/buckets.rs`** + +Replace the contents of `huggingface_hub/src/types/buckets.rs` with: + +```rust +use serde::{Deserialize, Serialize}; +use typed_builder::TypedBuilder; + +// --- Parameter types --- + +#[derive(Debug, Clone, TypedBuilder, Serialize)] +pub struct CreateBucketParams { + #[builder(default, setter(strip_option))] + #[serde(skip_serializing_if = "Option::is_none")] + pub private: Option, + #[builder(default, setter(strip_option, into))] + #[serde(rename = "resourceGroupId", skip_serializing_if = "Option::is_none")] + pub resource_group_id: Option, + #[builder(default)] + #[serde(skip_serializing_if = "Vec::is_empty")] + pub cdn: Vec, +} + +#[derive(Debug, Clone, TypedBuilder, Serialize)] +pub struct UpdateBucketParams { + #[builder(default, setter(strip_option))] + #[serde(skip_serializing_if = "Option::is_none")] + pub private: Option, + #[builder(default, setter(strip_option))] + #[serde(rename = "cdnRegions", skip_serializing_if = "Option::is_none")] + pub cdn_regions: Option>, +} + +#[derive(Debug, Clone, TypedBuilder)] +pub struct ListTreeParams { + #[builder(default, setter(strip_option))] + pub limit: Option, + #[builder(default)] + pub recursive: bool, +} + +// --- Response types --- + +#[derive(Debug, Clone, Deserialize)] +pub struct BucketCreated { + pub url: String, + pub name: String, + pub id: String, +} + +#[derive(Debug, Clone, Deserialize)] +pub struct BucketInfo { + pub id: String, + pub name: String, + pub namespace: String, + pub private: bool, + #[serde(rename = "usedStorage")] + pub used_storage: u64, + #[serde(rename = "totalFiles")] + pub total_files: u64, + pub cdn: Vec, + pub region: String, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct CdnRegion { + pub provider: String, + pub region: String, +} + +#[derive(Debug, Clone, Deserialize)] +pub struct BucketOverview { + #[serde(rename = "_id")] + pub mongo_id: String, + pub id: String, + pub author: 
String, + pub private: Option, + #[serde(rename = "repoType")] + pub repo_type: String, + #[serde(rename = "createdAt")] + pub created_at: String, + #[serde(rename = "updatedAt")] + pub updated_at: String, + pub size: u64, + #[serde(rename = "totalFiles")] + pub total_files: u64, + #[serde(rename = "cdnRegions")] + pub cdn_regions: Vec, + #[serde(rename = "resourceGroup")] + pub resource_group: Option, +} + +#[derive(Debug, Clone, Deserialize)] +pub struct ResourceGroup { + pub id: String, + pub name: String, + #[serde(rename = "numUsers")] + pub num_users: Option, +} + +#[derive(Debug, Clone, Deserialize)] +pub struct XetToken { + pub token: String, + #[serde(rename = "casUrl")] + pub cas_url: String, + #[serde(rename = "expiresAt")] + pub expires_at: String, +} + +#[derive(Debug, Clone, Deserialize)] +pub struct PathInfo { + pub path: String, + pub size: u64, + #[serde(rename = "xetHash")] + pub xet_hash: String, + #[serde(rename = "contentType")] + pub content_type: String, + pub mtime: i64, +} + +#[derive(Debug, Clone, Deserialize)] +pub struct TreeEntry { + #[serde(rename = "type")] + pub entry_type: EntryType, + pub path: String, + pub size: Option, + #[serde(rename = "xetHash")] + pub xet_hash: Option, + #[serde(rename = "contentType")] + pub content_type: Option, +} + +#[derive(Debug, Clone, Deserialize)] +#[serde(rename_all = "lowercase")] +pub enum EntryType { + File, + Directory, +} + +// --- Batch types --- + +#[derive(Debug, Clone, Serialize)] +#[serde(tag = "type")] +pub enum BatchOp { + #[serde(rename = "addFile")] + AddFile(AddFileOp), + #[serde(rename = "deleteFile")] + DeleteFile(DeleteFileOp), +} + +#[derive(Debug, Clone, Serialize)] +pub struct AddFileOp { + pub path: String, + #[serde(rename = "xetHash")] + pub xet_hash: String, + #[serde(rename = "contentType")] + pub content_type: String, + #[serde(skip_serializing_if = "Option::is_none")] + pub mtime: Option, +} + +#[derive(Debug, Clone, Serialize)] +pub struct DeleteFileOp { + pub path: 
String, +} + +#[derive(Debug, Clone, Deserialize)] +pub struct BatchResult { + pub success: bool, + pub processed: u32, + pub succeeded: u32, + pub failed: Vec, +} + +#[derive(Debug, Clone, Deserialize)] +pub struct BatchFailure { + pub path: String, + pub error: String, +} + +// --- resolve_file types --- + +#[derive(Debug, Clone)] +pub struct ResolvedFile { + pub url: String, + pub size: Option, + pub xet_hash: Option, + pub etag: Option, + pub last_modified: Option, + pub xet_auth_url: Option, + pub xet_reconstruction_url: Option, +} + +// --- xet_resolve_file type (feature = "xet") --- + +#[cfg(feature = "xet")] +#[derive(Debug, Clone, Deserialize)] +pub struct XetFileInfo { + pub hash: String, + #[serde(rename = "refreshUrl")] + pub refresh_url: String, + #[serde(rename = "reconstructionUrl")] + pub reconstruction_url: String, + pub etag: String, + pub size: u64, + #[serde(rename = "contentType")] + pub content_type: String, +} + +// --- Internal pagination helper (not public) --- + +#[derive(Deserialize)] +pub(crate) struct TreePage { + pub entries: Vec, + #[serde(rename = "nextCursor")] + pub next_cursor: Option, +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn bucket_info_deserializes() { + let json = r#"{ + "id": "my-bucket", + "name": "my-bucket", + "namespace": "myuser", + "private": false, + "usedStorage": 1024, + "totalFiles": 3, + "cdn": [], + "region": "us-east-1" + }"#; + let info: BucketInfo = serde_json::from_str(json).unwrap(); + assert_eq!(info.used_storage, 1024); + assert_eq!(info.total_files, 3); + } + + #[test] + fn bucket_overview_deserializes() { + let json = r#"{ + "_id": "66079f1a2e4b3c001a2b3c4d", + "id": "myuser/my-bucket", + "author": "myuser", + "private": false, + "repoType": "bucket", + "createdAt": "2024-03-30T12:00:00.000Z", + "updatedAt": "2024-03-31T08:30:00.000Z", + "size": 104857600, + "totalFiles": 42, + "cdnRegions": [{"provider": "gcp", "region": "us"}], + "resourceGroup": {"id": "abc", "name": "ml-team", 
"numUsers": 5} + }"#; + let overview: BucketOverview = serde_json::from_str(json).unwrap(); + assert_eq!(overview.id, "myuser/my-bucket"); + assert_eq!(overview.total_files, 42); + assert_eq!(overview.resource_group.unwrap().name, "ml-team"); + } + + #[test] + fn batch_op_serializes_with_type_tag() { + let op = BatchOp::AddFile(AddFileOp { + path: "data/train.parquet".to_string(), + xet_hash: "abc123".to_string(), + content_type: "application/octet-stream".to_string(), + mtime: Some(1711900000), + }); + let s = serde_json::to_string(&op).unwrap(); + assert!(s.contains(r#""type":"addFile""#)); + assert!(s.contains(r#""xetHash":"abc123""#)); + } + + #[test] + fn delete_op_serializes_with_type_tag() { + let op = BatchOp::DeleteFile(DeleteFileOp { + path: "old.parquet".to_string(), + }); + let s = serde_json::to_string(&op).unwrap(); + assert!(s.contains(r#""type":"deleteFile""#)); + } + + #[test] + fn tree_entry_deserializes_file() { + let json = r#"{ + "type": "file", + "path": "data/train.parquet", + "size": 52428800, + "xetHash": "abc123", + "contentType": "application/octet-stream" + }"#; + let entry: TreeEntry = serde_json::from_str(json).unwrap(); + assert!(matches!(entry.entry_type, EntryType::File)); + assert_eq!(entry.xet_hash.unwrap(), "abc123"); + } +} +``` + +- [ ] **Step 4: Wire into `types/mod.rs`** + +In `huggingface_hub/src/types/mod.rs`, add alongside the existing module declarations: + +```rust +pub mod buckets; +``` + +And add to the re-exports at the bottom: + +```rust +pub use buckets::*; +``` + +- [ ] **Step 5: Run the tests to confirm they pass** + +```bash +cargo test -p huggingface_hub bucket_info_deserializes batch_op_serializes tree_entry_deserializes bucket_overview_deserializes delete_op_serializes 2>&1 +``` + +Expected: all 5 tests pass. 
+ +- [ ] **Step 6: Commit** + +```bash +git add huggingface_hub/src/types/buckets.rs huggingface_hub/src/types/mod.rs +git commit -m "feat(types): add bucket types (BucketInfo, BucketOverview, BatchOp, TreeEntry, etc.)" +``` + +--- + +## Task 3: Add `HFBucket` struct and wire up modules + +**Files:** +- Modify: `huggingface_hub/src/repository.rs` +- Create (skeleton): `huggingface_hub/src/api/buckets.rs` +- Modify: `huggingface_hub/src/api/mod.rs` +- Modify: `huggingface_hub/src/client.rs` +- Modify: `huggingface_hub/src/lib.rs` + +- [ ] **Step 1: Write a failing test for `HFClient::bucket()` constructor** + +In `huggingface_hub/src/api/buckets.rs` (new file, skeleton only for now): + +```rust +#[cfg(test)] +mod tests { + use crate::HFClientBuilder; + + #[test] + fn bucket_constructor_sets_namespace_and_repo() { + let client = HFClientBuilder::new().build().unwrap(); + let bucket = client.bucket("myuser", "my-bucket"); + assert_eq!(bucket.namespace, "myuser"); + assert_eq!(bucket.repo, "my-bucket"); + } +} +``` + +- [ ] **Step 2: Run the test to confirm it fails** + +```bash +cargo test -p huggingface_hub bucket_constructor_sets_namespace_and_repo 2>&1 +``` + +Expected: compile error — `client.bucket` does not exist. + +- [ ] **Step 3: Add `HFBucket` struct to `repository.rs`** + +In `huggingface_hub/src/repository.rs`, add after the `HFSpace` struct definition (or after `HFRepository`, following the same pattern): + +```rust +/// Handle for operations on a single HuggingFace Storage Bucket. +/// +/// Obtain via [`HFClient::bucket`]. Every method adds `Authorization: Bearer ` +/// using the token configured on the client. 
+#[derive(Clone)] +pub struct HFBucket { + pub(crate) client: crate::HFClient, + pub namespace: String, + pub repo: String, +} +``` + +- [ ] **Step 4: Add `HFClient::bucket()` to `client.rs`** + +In `huggingface_hub/src/client.rs`, add alongside the existing `model()`, `dataset()`, `space()` methods: + +```rust +/// Creates a handle for operations on a single Storage Bucket. +/// No I/O is performed. +pub fn bucket(&self, namespace: impl Into, repo: impl Into) -> crate::repository::HFBucket { + crate::repository::HFBucket { + client: self.clone(), + namespace: namespace.into(), + repo: repo.into(), + } +} +``` + +- [ ] **Step 5: Wire `api/buckets.rs` into `api/mod.rs`** + +In `huggingface_hub/src/api/mod.rs`, add: + +```rust +pub mod buckets; +``` + +- [ ] **Step 6: Export `HFBucket` from `lib.rs`** + +In `huggingface_hub/src/lib.rs`, ensure `HFBucket` is included in the `repository` re-export. It will be exported automatically if `lib.rs` already has `pub use repository::*;`. Verify this line exists; if not, add it. + +- [ ] **Step 7: Run the test to confirm it passes** + +```bash +cargo test -p huggingface_hub bucket_constructor_sets_namespace_and_repo 2>&1 +``` + +Expected: `test api::buckets::tests::bucket_constructor_sets_namespace_and_repo ... 
ok` + +- [ ] **Step 8: Commit** + +```bash +git add huggingface_hub/src/repository.rs huggingface_hub/src/api/buckets.rs \ + huggingface_hub/src/api/mod.rs huggingface_hub/src/client.rs \ + huggingface_hub/src/lib.rs +git commit -m "feat(bucket): add HFBucket struct and client.bucket() constructor" +``` + +--- + +## Task 4: Bucket CRUD — `get`, `delete`, `update_settings` + +**Files:** +- Modify: `huggingface_hub/src/api/buckets.rs` + +- [ ] **Step 1: Write failing tests** + +Add to the `tests` module in `huggingface_hub/src/api/buckets.rs`: + +```rust +#[cfg(test)] +mod tests { + use crate::HFClientBuilder; + + #[test] + fn bucket_constructor_sets_namespace_and_repo() { + let client = HFClientBuilder::new().build().unwrap(); + let bucket = client.bucket("myuser", "my-bucket"); + assert_eq!(bucket.namespace, "myuser"); + assert_eq!(bucket.repo, "my-bucket"); + } + + #[test] + fn get_bucket_url() { + let client = HFClientBuilder::new().build().unwrap(); + let bucket = client.bucket("myuser", "my-bucket"); + let url = format!( + "{}/api/buckets/{}/{}", + bucket.client.inner.endpoint, bucket.namespace, bucket.repo + ); + assert!(url.ends_with("/api/buckets/myuser/my-bucket")); + } + + #[test] + fn update_settings_url() { + let client = HFClientBuilder::new().build().unwrap(); + let bucket = client.bucket("myuser", "my-bucket"); + let url = format!( + "{}/api/buckets/{}/{}/settings", + bucket.client.inner.endpoint, bucket.namespace, bucket.repo + ); + assert!(url.ends_with("/api/buckets/myuser/my-bucket/settings")); + } +} +``` + +- [ ] **Step 2: Run the tests to confirm they fail** + +```bash +cargo test -p huggingface_hub get_bucket_url update_settings_url 2>&1 +``` + +Expected: compile error — `bucket.client.inner` not accessible. 
+ +- [ ] **Step 3: Add `check_bucket_response` helper and implement CRUD methods** + +Replace `huggingface_hub/src/api/buckets.rs` with: + +```rust +use std::collections::VecDeque; + +use futures::{Stream, StreamExt}; + +use crate::error::{HFError, NotFoundContext}; +use crate::repository::HFBucket; +use crate::types::{ + BatchOp, BatchResult, BucketCreated, BucketInfo, BucketOverview, CreateBucketParams, + ListTreeParams, PathInfo, ResolvedFile, TreeEntry, TreePage, UpdateBucketParams, XetToken, +}; +use crate::{HFClient, Result}; + +/// Maps HTTP status codes to `HFError` variants for bucket API responses. +/// Bucket-level 404s map to `RepoNotFound`; file-level 404s map to `EntryNotFound`. +async fn check_bucket_response( + response: reqwest::Response, + repo_id: &str, + not_found_ctx: NotFoundContext, +) -> Result { + if response.status().is_success() { + return Ok(response); + } + let status = response.status().as_u16(); + let url = response.url().to_string(); + let body = response.text().await.unwrap_or_default(); + Err(match status { + 401 => HFError::AuthRequired, + 403 => HFError::Forbidden, + 404 => match not_found_ctx { + NotFoundContext::Repo => HFError::RepoNotFound { + repo_id: repo_id.to_string(), + }, + NotFoundContext::Entry { path } => HFError::EntryNotFound { + path, + repo_id: repo_id.to_string(), + }, + _ => HFError::Http { status, url, body }, + }, + 409 => HFError::Conflict(body), + 429 => HFError::RateLimited, + 507 => HFError::QuotaExceeded, + _ => HFError::Http { status, url, body }, + }) +} + +impl HFBucket { + fn repo_id(&self) -> String { + format!("{}/{}", self.namespace, self.repo) + } + + fn bucket_url(&self) -> String { + format!( + "{}/api/buckets/{}/{}", + self.client.inner.endpoint, self.namespace, self.repo + ) + } + + /// Returns metadata about this bucket. 
pub async fn get(&self) -> Result<BucketInfo> {
bucket.client.inner.endpoint, bucket.namespace, bucket.repo + ); + assert!(url.ends_with("/api/buckets/myuser/my-bucket/settings")); + } +} +``` + +- [ ] **Step 4: Run the tests to confirm they pass** + +```bash +cargo test -p huggingface_hub get_bucket_url update_settings_url bucket_constructor 2>&1 +``` + +Expected: all 3 tests pass. + +- [ ] **Step 5: Commit** + +```bash +git add huggingface_hub/src/api/buckets.rs +git commit -m "feat(bucket): add get, delete, update_settings with check_bucket_response helper" +``` + +--- + +## Task 5: `HFClient::create_bucket` and `HFClient::list_buckets` + +**Files:** +- Modify: `huggingface_hub/src/api/buckets.rs` +- Modify: `huggingface_hub/src/client.rs` + +- [ ] **Step 1: Write failing tests** + +Add to the `tests` module in `api/buckets.rs`: + +```rust + #[test] + fn create_bucket_url() { + let client = HFClientBuilder::new().build().unwrap(); + let url = format!( + "{}/api/buckets/{}/{}", + client.inner.endpoint, "myuser", "new-bucket" + ); + assert!(url.ends_with("/api/buckets/myuser/new-bucket")); + } + + #[test] + fn list_buckets_url() { + let client = HFClientBuilder::new().build().unwrap(); + let url = format!("{}/api/buckets/{}", client.inner.endpoint, "myuser"); + assert!(url.ends_with("/api/buckets/myuser")); + } +``` + +- [ ] **Step 2: Run the tests to confirm they fail** + +```bash +cargo test -p huggingface_hub create_bucket_url list_buckets_url 2>&1 +``` + +Expected: compile error — `client.inner` not accessible from test or methods not found. + +- [ ] **Step 3: Add `create_bucket` and `list_buckets` to `client.rs`** + +In `huggingface_hub/src/client.rs`, add the following imports at the top if not already present: + +```rust +use url::Url; +``` + +Then add the new methods on `HFClient` (alongside `bucket()`): + +```rust +/// Creates a new bucket owned by `namespace`. 
+pub async fn create_bucket( + &self, + namespace: &str, + repo: &str, + params: crate::types::CreateBucketParams, +) -> crate::Result { + let url = format!("{}/api/buckets/{}/{}", self.inner.endpoint, namespace, repo); + let resp = self + .inner + .client + .post(&url) + .headers(self.auth_headers()) + .json(¶ms) + .send() + .await + .map_err(crate::HFError::Request)?; + let repo_id = format!("{}/{}", namespace, repo); + let resp = crate::api::buckets::check_bucket_response( + resp, + &repo_id, + crate::error::NotFoundContext::Repo, + ) + .await?; + resp.json().await.map_err(crate::HFError::Json) +} + +/// Returns a paginated stream of all buckets owned by `namespace`. +/// Pagination is driven by `Link` response headers. +pub fn list_buckets( + &self, + namespace: &str, +) -> impl futures::Stream> + '_ { + let url = Url::parse(&format!("{}/api/buckets/{}", self.inner.endpoint, namespace)) + .expect("endpoint is a valid base URL"); + self.paginate(url, vec![], None) +} +``` + +Note: `check_bucket_response` needs to be `pub(crate)` in `api/buckets.rs`. Change its visibility there: + +```rust +pub(crate) async fn check_bucket_response( ... ) +``` + +- [ ] **Step 4: Run the tests to confirm they pass** + +```bash +cargo test -p huggingface_hub create_bucket_url list_buckets_url 2>&1 +``` + +Expected: both tests pass. 
+ +- [ ] **Step 5: Commit** + +```bash +git add huggingface_hub/src/api/buckets.rs huggingface_hub/src/client.rs +git commit -m "feat(bucket): add HFClient::create_bucket and list_buckets" +``` + +--- + +## Task 6: `batch_files` — NDJSON serialization + +**Files:** +- Modify: `huggingface_hub/src/api/buckets.rs` + +- [ ] **Step 1: Write failing tests** + +Add to the `tests` module in `api/buckets.rs`: + +```rust + #[test] + fn batch_files_ndjson_adds_before_deletes() { + use crate::types::{AddFileOp, BatchOp, DeleteFileOp}; + + let ops = vec![ + BatchOp::DeleteFile(DeleteFileOp { path: "old.parquet".to_string() }), + BatchOp::AddFile(AddFileOp { + path: "new.parquet".to_string(), + xet_hash: "abc".to_string(), + content_type: "application/octet-stream".to_string(), + mtime: None, + }), + ]; + // Partition and serialize: adds must come first regardless of input order + let (adds, deletes): (Vec<_>, Vec<_>) = + ops.into_iter().partition(|op| matches!(op, BatchOp::AddFile(_))); + let ndjson: String = adds + .iter() + .chain(deletes.iter()) + .map(|op| serde_json::to_string(op).map(|s| s + "\n")) + .collect::>() + .unwrap(); + let lines: Vec<&str> = ndjson.lines().collect(); + assert_eq!(lines.len(), 2); + assert!(lines[0].contains("addFile"), "first line must be addFile, got: {}", lines[0]); + assert!(lines[1].contains("deleteFile"), "second line must be deleteFile"); + } + + #[test] + fn batch_files_each_line_ends_with_newline() { + use crate::types::{AddFileOp, BatchOp}; + let ops = vec![BatchOp::AddFile(AddFileOp { + path: "f.parquet".to_string(), + xet_hash: "h".to_string(), + content_type: "application/octet-stream".to_string(), + mtime: None, + })]; + let (adds, deletes): (Vec<_>, Vec<_>) = + ops.into_iter().partition(|op| matches!(op, BatchOp::AddFile(_))); + let ndjson: String = adds + .iter() + .chain(deletes.iter()) + .map(|op| serde_json::to_string(op).map(|s| s + "\n")) + .collect::>() + .unwrap(); + assert!(ndjson.ends_with('\n')); + } +``` + +- [ ] 
**Step 2: Run the tests to confirm they pass (logic is already testable)** + +```bash +cargo test -p huggingface_hub batch_files_ndjson batch_files_each_line 2>&1 +``` + +These tests only exercise the serialization logic which uses already-present types. They should compile and pass. If they don't compile, check that `BatchOp`, `AddFileOp`, `DeleteFileOp` are in scope. + +- [ ] **Step 3: Implement `batch_files` on `HFBucket`** + +Add the following method to the `impl HFBucket` block in `api/buckets.rs`: + +```rust + /// Adds and/or removes files in a single atomic operation. + /// + /// All `AddFile` operations are sent before `DeleteFile` operations, as required + /// by the batch protocol. The input order within each group is preserved. + pub async fn batch_files(&self, ops: Vec) -> Result { + let (adds, deletes): (Vec<_>, Vec<_>) = + ops.into_iter().partition(|op| matches!(op, BatchOp::AddFile(_))); + + let ndjson = adds + .iter() + .chain(deletes.iter()) + .map(|op| serde_json::to_string(op).map(|s| s + "\n")) + .collect::>() + .map_err(HFError::Json)?; + + let resp = self + .client + .inner + .client + .post(format!("{}/batch", self.bucket_url())) + .headers(self.client.auth_headers()) + .header("content-type", "application/x-ndjson") + .body(ndjson) + .send() + .await + .map_err(HFError::Request)?; + + let resp = + check_bucket_response(resp, &self.repo_id(), NotFoundContext::Repo).await?; + resp.json().await.map_err(HFError::Json) + } +``` + +Also add `use serde_json;` at the top of `api/buckets.rs` if not already present. + +- [ ] **Step 4: Run all bucket tests** + +```bash +cargo test -p huggingface_hub batch_files 2>&1 +``` + +Expected: both tests pass, no compile errors. 
+ +- [ ] **Step 5: Commit** + +```bash +git add huggingface_hub/src/api/buckets.rs +git commit -m "feat(bucket): implement batch_files with NDJSON add-before-delete ordering" +``` + +--- + +## Task 7: `list_tree` — cursor-in-body streaming pagination + +**Files:** +- Modify: `huggingface_hub/src/api/buckets.rs` + +- [ ] **Step 1: Write a failing test for URL construction** + +Add to the `tests` module in `api/buckets.rs`: + +```rust + #[test] + fn list_tree_url_empty_path() { + let client = HFClientBuilder::new().build().unwrap(); + let bucket = client.bucket("myuser", "my-bucket"); + let url = if "".is_empty() { + format!( + "{}/api/buckets/{}/{}/tree", + bucket.client.inner.endpoint, bucket.namespace, bucket.repo + ) + } else { + format!( + "{}/api/buckets/{}/{}/tree/{}", + bucket.client.inner.endpoint, bucket.namespace, bucket.repo, "some/path" + ) + }; + assert!(url.ends_with("/api/buckets/myuser/my-bucket/tree")); + } + + #[test] + fn list_tree_url_with_path() { + let client = HFClientBuilder::new().build().unwrap(); + let bucket = client.bucket("myuser", "my-bucket"); + let path = "data/sub"; + let url = format!( + "{}/api/buckets/{}/{}/tree/{}", + bucket.client.inner.endpoint, bucket.namespace, bucket.repo, path + ); + assert!(url.ends_with("/api/buckets/myuser/my-bucket/tree/data/sub")); + } +``` + +- [ ] **Step 2: Run the tests to confirm they pass (URL logic is trivially testable)** + +```bash +cargo test -p huggingface_hub list_tree_url 2>&1 +``` + +Expected: both URL tests pass. + +- [ ] **Step 3: Implement `list_tree` on `HFBucket`** + +Add the following to the `impl HFBucket` block in `api/buckets.rs`. This uses `try_unfold` with a `VecDeque` buffer to yield one `TreeEntry` at a time while fetching pages lazily: + +```rust + /// Lists files and directories, yielding one entry at a time. + /// + /// Uses cursor-in-body pagination: the stream fetches the next page automatically + /// when the current page's entries are exhausted. 
No request is made until the
+    /// first item is polled.
+    pub fn list_tree(
+        &self,
+        path: &str,
+        params: ListTreeParams,
+    ) -> impl Stream<Item = Result<TreeEntry>> + '_ {
+        let base_url = if path.is_empty() {
+            format!(
+                "{}/api/buckets/{}/{}/tree",
+                self.client.inner.endpoint, self.namespace, self.repo
+            )
+        } else {
+            format!(
+                "{}/api/buckets/{}/{}/tree/{}",
+                self.client.inner.endpoint, self.namespace, self.repo, path
+            )
+        };
+        let repo_id = self.repo_id();
+
+        // State: (buffered entries from current page, cursor for next page, whether we've fetched at all)
+        // cursor=None + fetched=false → fetch first page (no cursor param)
+        // cursor=Some(c) + fetched=_ → fetch next page with ?cursor=c
+        // cursor=None + fetched=true → no more pages, drain buffer then end
+        futures::stream::try_unfold(
+            (VecDeque::<TreeEntry>::new(), None::<String>, false),
+            move |(mut pending, cursor, fetched)| {
+                let client = self.client.clone();
+                let repo_id = repo_id.clone();
+                let base_url = base_url.clone();
+                async move {
+                    // Yield buffered items before fetching a new page
+                    if let Some(entry) = pending.pop_front() {
+                        return Ok(Some((entry, (pending, cursor, fetched))));
+                    }
+                    // No buffered items. Are there more pages to fetch?
+ if fetched && cursor.is_none() { + return Ok(None); + } + // Fetch next (or first) page + let mut req = client + .inner + .client + .get(&base_url) + .headers(client.auth_headers()); + if let Some(ref c) = cursor { + req = req.query(&[("cursor", c.as_str())]); + } + if let Some(l) = params.limit { + req = req.query(&[("limit", l.to_string().as_str())]); + } + if params.recursive { + req = req.query(&[("recursive", "true")]); + } + let resp = req.send().await.map_err(HFError::Request)?; + let resp = + check_bucket_response(resp, &repo_id, NotFoundContext::Repo).await?; + let page: TreePage = resp.json().await.map_err(HFError::Json)?; + let next_cursor = page.next_cursor; + pending.extend(page.entries); + if let Some(entry) = pending.pop_front() { + Ok(Some((entry, (pending, next_cursor, true)))) + } else { + Ok(None) + } + } + }, + ) + } +``` + +Ensure `use std::collections::VecDeque;` is at the top of the file (it was included in Task 4). + +- [ ] **Step 4: Run all bucket tests** + +```bash +cargo test -p huggingface_hub list_tree 2>&1 +``` + +Expected: all `list_tree_url_*` tests pass, no compile errors. 
+ +- [ ] **Step 5: Commit** + +```bash +git add huggingface_hub/src/api/buckets.rs +git commit -m "feat(bucket): implement list_tree with cursor-in-body streaming pagination" +``` + +--- + +## Task 8: `get_paths_info`, `get_xet_write_token`, `get_xet_read_token` + +**Files:** +- Modify: `huggingface_hub/src/api/buckets.rs` + +- [ ] **Step 1: Write failing tests** + +Add to the `tests` module in `api/buckets.rs`: + +```rust + #[test] + fn xet_token_urls() { + let client = HFClientBuilder::new().build().unwrap(); + let bucket = client.bucket("myuser", "my-bucket"); + let write_url = format!( + "{}/api/buckets/{}/{}/xet-write-token", + bucket.client.inner.endpoint, bucket.namespace, bucket.repo + ); + let read_url = format!( + "{}/api/buckets/{}/{}/xet-read-token", + bucket.client.inner.endpoint, bucket.namespace, bucket.repo + ); + assert!(write_url.ends_with("/xet-write-token")); + assert!(read_url.ends_with("/xet-read-token")); + } + + #[test] + fn paths_info_url() { + let client = HFClientBuilder::new().build().unwrap(); + let bucket = client.bucket("myuser", "my-bucket"); + let url = format!( + "{}/api/buckets/{}/{}/paths-info", + bucket.client.inner.endpoint, bucket.namespace, bucket.repo + ); + assert!(url.ends_with("/paths-info")); + } +``` + +- [ ] **Step 2: Run the tests to confirm they pass** + +```bash +cargo test -p huggingface_hub xet_token_urls paths_info_url 2>&1 +``` + +Expected: both tests pass (URL construction tests don't need the methods yet). + +- [ ] **Step 3: Implement the three methods on `HFBucket`** + +Add to the `impl HFBucket` block in `api/buckets.rs`: + +```rust + /// Returns metadata for a batch of file paths. 
+    pub async fn get_paths_info(&self, paths: Vec<String>) -> Result<Vec<PathInfo>> {
+        #[derive(serde::Serialize)]
+        struct Body {
+            paths: Vec<String>,
+        }
+
+        let resp = self
+            .client
+            .inner
+            .client
+            .post(format!("{}/paths-info", self.bucket_url()))
+            .headers(self.client.auth_headers())
+            .json(&Body { paths })
+            .send()
+            .await
+            .map_err(HFError::Request)?;
+
+        let resp = check_bucket_response(
+            resp,
+            &self.repo_id(),
+            NotFoundContext::Entry { path: String::new() },
+        )
+        .await?;
+        resp.json().await.map_err(HFError::Json)
+    }
+
+    /// Returns a short-lived JWT for uploading files to the Xet CAS.
+    /// Use the returned `cas_url` and `token` to push file bytes before calling `batch_files`.
+    pub async fn get_xet_write_token(&self) -> Result<XetToken> {
+        let resp = self
+            .client
+            .inner
+            .client
+            .get(format!("{}/xet-write-token", self.bucket_url()))
+            .headers(self.client.auth_headers())
+            .send()
+            .await
+            .map_err(HFError::Request)?;
+        let resp =
+            check_bucket_response(resp, &self.repo_id(), NotFoundContext::Repo).await?;
+        resp.json().await.map_err(HFError::Json)
+    }
+
+    /// Returns a short-lived JWT for downloading files from the Xet CAS directly.
+    pub async fn get_xet_read_token(&self) -> Result<XetToken> {
+        let resp = self
+            .client
+            .inner
+            .client
+            .get(format!("{}/xet-read-token", self.bucket_url()))
+            .headers(self.client.auth_headers())
+            .send()
+            .await
+            .map_err(HFError::Request)?;
+        let resp =
+            check_bucket_response(resp, &self.repo_id(), NotFoundContext::Repo).await?;
+        resp.json().await.map_err(HFError::Json)
+    }
+```
+
+- [ ] **Step 4: Run all bucket tests**
+
+```bash
+cargo test -p huggingface_hub 2>&1 | grep "bucket\|FAILED\|ok"
+```
+
+Expected: all bucket tests pass, no new failures.
+ +- [ ] **Step 5: Commit** + +```bash +git add huggingface_hub/src/api/buckets.rs +git commit -m "feat(bucket): implement get_paths_info, get_xet_write_token, get_xet_read_token" +``` + +--- + +## Task 9: `resolve_file` — redirect capture with header extraction + +**Files:** +- Modify: `huggingface_hub/src/api/buckets.rs` + +- [ ] **Step 1: Write a failing test for `resolve_file` header parsing** + +Add to the `tests` module in `api/buckets.rs`: + +```rust + #[test] + fn resolve_file_parses_link_header() { + // Verify the Link header parsing logic for xet-auth and xet-reconstruction-info + let link = r#"; rel="xet-auth", ; rel="xet-reconstruction-info""#; + let mut xet_auth = None; + let mut xet_reconstruction = None; + for part in link.split(',') { + let part = part.trim(); + if let Some((url_part, rel_part)) = part.split_once(';') { + let url = url_part.trim().trim_start_matches('<').trim_end_matches('>').to_string(); + let rel = rel_part.trim(); + if rel.contains("xet-auth") { + xet_auth = Some(url); + } else if rel.contains("xet-reconstruction-info") { + xet_reconstruction = Some(url); + } + } + } + assert_eq!(xet_auth.unwrap(), "https://auth.example.com/token"); + assert_eq!( + xet_reconstruction.unwrap(), + "https://xet.example.com/reconstruct/abc" + ); + } + + #[test] + fn resolve_file_url() { + let client = HFClientBuilder::new().build().unwrap(); + let bucket = client.bucket("myuser", "my-bucket"); + // Note: no /api/ prefix for resolve + let url = format!( + "{}/buckets/{}/{}/resolve/{}", + bucket.client.inner.endpoint, bucket.namespace, bucket.repo, "data/train.parquet" + ); + assert!(url.contains("/buckets/myuser/my-bucket/resolve/data/train.parquet")); + assert!(!url.contains("/api/")); + } +``` + +- [ ] **Step 2: Run the tests to confirm the link parsing test passes** + +```bash +cargo test -p huggingface_hub resolve_file_parses_link resolve_file_url 2>&1 +``` + +Expected: both tests pass (they test pure logic, no network). 
+ +- [ ] **Step 3: Implement `resolve_file` on `HFBucket`** + +Add to the `impl HFBucket` block in `api/buckets.rs`: + +```rust + /// Resolves a file path to a direct download URL. + /// + /// Uses the no-redirect client to capture the 302 `Location` header rather than + /// following it. Metadata is extracted from response headers: + /// `X-Linked-Size`, `X-XET-Hash`, `X-Linked-ETag`, `Last-Modified`, and `Link`. + pub async fn resolve_file(&self, path: &str) -> Result { + // Note: no /api/ prefix — this is the file-serving route, not the metadata API. + let url = format!( + "{}/buckets/{}/{}/resolve/{}", + self.client.inner.endpoint, self.namespace, self.repo, path + ); + let resp = self + .client + .inner + .no_redirect_client + .get(&url) + .headers(self.client.auth_headers()) + .send() + .await + .map_err(HFError::Request)?; + + if !resp.status().is_redirection() { + return Err( + check_bucket_response( + resp, + &self.repo_id(), + NotFoundContext::Entry { path: path.to_string() }, + ) + .await + .unwrap_err(), + ); + } + + let headers = resp.headers(); + + let location = headers + .get("location") + .and_then(|v| v.to_str().ok()) + .map(str::to_owned) + .ok_or_else(|| HFError::Http { + status: resp.status().as_u16(), + url: url.clone(), + body: "missing Location header".to_string(), + })?; + + let size = headers + .get("x-linked-size") + .and_then(|v| v.to_str().ok()) + .and_then(|s| s.parse::().ok()); + + let xet_hash = headers + .get("x-xet-hash") + .and_then(|v| v.to_str().ok()) + .map(str::to_owned); + + let etag = headers + .get("x-linked-etag") + .and_then(|v| v.to_str().ok()) + .map(str::to_owned); + + let last_modified = headers + .get("last-modified") + .and_then(|v| v.to_str().ok()) + .map(str::to_owned); + + // Parse Link header: ; rel="xet-auth", ; rel="xet-reconstruction-info" + let mut xet_auth_url = None; + let mut xet_reconstruction_url = None; + if let Some(link) = headers.get("link").and_then(|v| v.to_str().ok()) { + for part in 
link.split(',') { + let part = part.trim(); + if let Some((url_part, rel_part)) = part.split_once(';') { + let u = url_part.trim().trim_start_matches('<').trim_end_matches('>').to_string(); + if rel_part.contains("xet-auth") { + xet_auth_url = Some(u); + } else if rel_part.contains("xet-reconstruction-info") { + xet_reconstruction_url = Some(u); + } + } + } + } + + Ok(ResolvedFile { + url: location, + size, + xet_hash, + etag, + last_modified, + xet_auth_url, + xet_reconstruction_url, + }) + } +``` + +- [ ] **Step 4: Run all bucket tests** + +```bash +cargo test -p huggingface_hub 2>&1 | grep -E "bucket|resolve|FAILED|error" +``` + +Expected: all existing tests still pass, no compile errors. + +- [ ] **Step 5: Commit** + +```bash +git add huggingface_hub/src/api/buckets.rs +git commit -m "feat(bucket): implement resolve_file with redirect capture and header extraction" +``` + +--- + +## Task 10: `xet_resolve_file` (feature = `"xet"`) + +**Files:** +- Modify: `huggingface_hub/src/api/buckets.rs` + +- [ ] **Step 1: Write a failing test** + +Add to the `tests` module in `api/buckets.rs`: + +```rust + #[cfg(feature = "xet")] + #[test] + fn xet_resolve_file_url() { + let client = HFClientBuilder::new().build().unwrap(); + let bucket = client.bucket("myuser", "my-bucket"); + // Same URL as resolve_file — Accept header determines the response format + let url = format!( + "{}/buckets/{}/{}/resolve/{}", + bucket.client.inner.endpoint, bucket.namespace, bucket.repo, "data/train.parquet" + ); + assert!(url.contains("/buckets/myuser/my-bucket/resolve/data/train.parquet")); + } +``` + +- [ ] **Step 2: Run the test under the xet feature** + +```bash +cargo test -p huggingface_hub --features xet xet_resolve_file_url 2>&1 +``` + +Expected: compile error — `xet_resolve_file` method not found (the test will compile but the URL test itself may pass; confirm `XetFileInfo` is missing). 
+ +- [ ] **Step 3: Implement `xet_resolve_file` on `HFBucket`** + +Add to the `impl HFBucket` block in `api/buckets.rs`: + +```rust + /// Resolves a file path and returns Xet reconstruction metadata. + /// + /// Sends `Accept: application/vnd.xet-fileinfo+json` to request the JSON response + /// instead of a redirect. Use the returned `reconstruction_url` to fetch chunk data + /// from the Xet CAS directly. + #[cfg(feature = "xet")] + pub async fn xet_resolve_file(&self, path: &str) -> Result { + let url = format!( + "{}/buckets/{}/{}/resolve/{}", + self.client.inner.endpoint, self.namespace, self.repo, path + ); + let resp = self + .client + .inner + .client + .get(&url) + .headers(self.client.auth_headers()) + .header("accept", "application/vnd.xet-fileinfo+json") + .send() + .await + .map_err(HFError::Request)?; + let resp = check_bucket_response( + resp, + &self.repo_id(), + NotFoundContext::Entry { path: path.to_string() }, + ) + .await?; + resp.json().await.map_err(HFError::Json) + } +``` + +- [ ] **Step 4: Run tests with the xet feature** + +```bash +cargo test -p huggingface_hub --features xet xet_resolve_file_url 2>&1 +``` + +Expected: test passes. + +- [ ] **Step 5: Confirm the build still works without the xet feature** + +```bash +cargo build -p huggingface_hub 2>&1 +``` + +Expected: compiles cleanly (no xet feature). 
+ +- [ ] **Step 6: Commit** + +```bash +git add huggingface_hub/src/api/buckets.rs +git commit -m "feat(bucket): implement xet_resolve_file (feature = xet)" +``` + +--- + +## Task 11: Blocking wrappers (`HFBucketSync`) + +**Files:** +- Modify: `huggingface_hub/src/blocking.rs` +- Modify: `huggingface_hub/src/lib.rs` + +- [ ] **Step 1: Write a failing test** + +Add to `huggingface_hub/src/blocking.rs` (inside `#[cfg(test)]` if one exists, or add a new one): + +```rust +#[cfg(test)] +mod bucket_tests { + #[cfg(feature = "blocking")] + #[test] + fn bucket_sync_constructor() { + use crate::HFClientBuilder; + let client = crate::blocking::HFClientSync::from(HFClientBuilder::new().build().unwrap()); + let bucket = client.bucket("myuser", "my-bucket"); + assert_eq!(bucket.inner.namespace, "myuser"); + assert_eq!(bucket.inner.repo, "my-bucket"); + } +} +``` + +- [ ] **Step 2: Run the test to confirm it fails** + +```bash +cargo test -p huggingface_hub --features blocking bucket_sync_constructor 2>&1 +``` + +Expected: compile error — `HFClientSync::bucket` does not exist. + +- [ ] **Step 3: Add `HFBucketSync` struct to `blocking.rs`** + +In `huggingface_hub/src/blocking.rs`, add alongside `HFRepositorySync` and `HFSpaceSync`: + +```rust +/// Synchronous handle for Storage Bucket operations. +/// +/// Obtain via [`HFClientSync::bucket`]. All methods block the current thread. +#[cfg(feature = "blocking")] +#[derive(Clone)] +pub struct HFBucketSync { + pub(crate) inner: crate::repository::HFBucket, + pub(crate) runtime: std::sync::Arc, +} +``` + +- [ ] **Step 4: Add `HFClientSync::bucket()` in `blocking.rs`** + +In `blocking.rs`, find the `impl HFClientSync` block and add: + +```rust + /// Creates a synchronous bucket handle. 
+    pub fn bucket(
+        &self,
+        namespace: impl Into<String>,
+        repo: impl Into<String>,
+    ) -> HFBucketSync {
+        HFBucketSync {
+            inner: self.inner.bucket(namespace, repo),
+            runtime: self.runtime.clone(),
+        }
+    }
+```
+
+- [ ] **Step 5: Add blocking methods to `HFBucketSync` in `blocking.rs`**
+
+Add an `impl HFBucketSync` block:
+
+```rust
+#[cfg(feature = "blocking")]
+impl HFBucketSync {
+    pub fn get(&self) -> crate::Result<crate::types::BucketInfo> {
+        self.runtime.block_on(self.inner.get())
+    }
+
+    pub fn delete(&self) -> crate::Result<()> {
+        self.runtime.block_on(self.inner.delete())
+    }
+
+    pub fn update_settings(
+        &self,
+        params: crate::types::UpdateBucketParams,
+    ) -> crate::Result<()> {
+        self.runtime.block_on(self.inner.update_settings(params))
+    }
+
+    pub fn batch_files(
+        &self,
+        ops: Vec<crate::types::BatchOp>,
+    ) -> crate::Result<crate::types::BatchResult> {
+        self.runtime.block_on(self.inner.batch_files(ops))
+    }
+
+    pub fn list_tree(
+        &self,
+        path: &str,
+        params: crate::types::ListTreeParams,
+    ) -> crate::Result<Vec<crate::types::TreeEntry>> {
+        use futures::StreamExt;
+        self.runtime.block_on(async {
+            let stream = self.inner.list_tree(path, params);
+            futures::pin_mut!(stream);
+            let mut items = Vec::new();
+            while let Some(item) = stream.next().await {
+                items.push(item?);
+            }
+            Ok(items)
+        })
+    }
+
+    pub fn get_paths_info(
+        &self,
+        paths: Vec<String>,
+    ) -> crate::Result<Vec<crate::types::PathInfo>> {
+        self.runtime.block_on(self.inner.get_paths_info(paths))
+    }
+
+    pub fn get_xet_write_token(&self) -> crate::Result<crate::types::XetToken> {
+        self.runtime.block_on(self.inner.get_xet_write_token())
+    }
+
+    pub fn get_xet_read_token(&self) -> crate::Result<crate::types::XetToken> {
+        self.runtime.block_on(self.inner.get_xet_read_token())
+    }
+
+    pub fn resolve_file(&self, path: &str) -> crate::Result<crate::types::ResolvedFile> {
+        self.runtime.block_on(self.inner.resolve_file(path))
+    }
+
+    #[cfg(feature = "xet")]
+    pub fn xet_resolve_file(&self, path: &str) -> crate::Result<crate::types::XetFileInfo> {
+        self.runtime.block_on(self.inner.xet_resolve_file(path))
+    }
+}
+```
+
+Also add a `list_buckets` blocking method on `HFClientSync`.
Find the `impl HFClientSync` block and add: + +```rust + pub fn list_buckets( + &self, + namespace: &str, + ) -> crate::Result> { + use futures::StreamExt; + self.runtime.block_on(async { + let stream = self.inner.list_buckets(namespace); + futures::pin_mut!(stream); + let mut items = Vec::new(); + while let Some(item) = stream.next().await { + items.push(item?); + } + Ok(items) + }) + } + + pub fn create_bucket( + &self, + namespace: &str, + repo: &str, + params: crate::types::CreateBucketParams, + ) -> crate::Result { + self.runtime.block_on(self.inner.create_bucket(namespace, repo, params)) + } +``` + +- [ ] **Step 6: Export `HFBucketSync` from `lib.rs`** + +In `huggingface_hub/src/lib.rs`, find the blocking re-export line: + +```rust +#[cfg(feature = "blocking")] +pub use blocking::{HFClientSync, HFRepoSync, HFRepositorySync, HFSpaceSync}; +``` + +Add `HFBucketSync` to this list: + +```rust +#[cfg(feature = "blocking")] +pub use blocking::{HFBucketSync, HFClientSync, HFRepoSync, HFRepositorySync, HFSpaceSync}; +``` + +- [ ] **Step 7: Run the test to confirm it passes** + +```bash +cargo test -p huggingface_hub --features blocking bucket_sync_constructor 2>&1 +``` + +Expected: test passes. + +- [ ] **Step 8: Run the full test suite to check for regressions** + +```bash +cargo test -p huggingface_hub --features blocking 2>&1 | grep -E "FAILED|error\[" | head -20 +``` + +Expected: no failures. + +- [ ] **Step 9: Commit** + +```bash +git add huggingface_hub/src/blocking.rs huggingface_hub/src/lib.rs +git commit -m "feat(bucket): add HFBucketSync blocking wrappers" +``` + +--- + +## Task 12: Integration tests + +**Files:** +- Modify: `huggingface_hub/tests/integration_test.rs` + +- [ ] **Step 1: Write the integration tests (they will be skipped without credentials)** + +Add the following to `huggingface_hub/tests/integration_test.rs`. 
The `api()` and `write_enabled()` helpers are already defined in the file; add only the new test functions: + +```rust +// ---- HFBucket integration tests ---- + +/// Helper: creates a unique test bucket name to avoid collisions between runs. +fn test_bucket_name() -> String { + format!( + "test-bucket-{}", + std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap() + .as_millis() + ) +} + +#[tokio::test] +async fn test_list_buckets() { + let Some(api) = api() else { return }; + let username = cached_username().await; + // list_buckets is a read operation — no HF_TEST_WRITE required + let buckets: Vec<_> = api + .list_buckets(username) + .collect::>() + .await + .into_iter() + .collect::>>() + .expect("list_buckets failed"); + // Simply assert the call succeeds; the user may have zero buckets + let _ = buckets; +} + +#[tokio::test] +async fn test_create_and_delete_bucket() { + let Some(api) = api() else { return }; + if !write_enabled() { + return; + } + let username = cached_username().await; + let name = test_bucket_name(); + + // Create + let created = api + .create_bucket( + username, + &name, + huggingface_hub::CreateBucketParams::builder().private(true).build(), + ) + .await + .expect("create_bucket failed"); + assert!(created.id.contains(&name)); + + // Get + let bucket = api.bucket(username, &name); + let info = bucket.get().await.expect("get failed"); + assert_eq!(info.name, name); + assert!(info.private); + + // Update settings + bucket + .update_settings( + huggingface_hub::UpdateBucketParams::builder().private(false).build(), + ) + .await + .expect("update_settings failed"); + + let info = bucket.get().await.unwrap(); + assert!(!info.private); + + // Delete + bucket.delete().await.expect("delete failed"); + + // Confirm gone + assert!(matches!( + bucket.get().await, + Err(huggingface_hub::HFError::RepoNotFound { .. 
}) + )); +} + +#[tokio::test] +async fn test_bucket_list_tree_empty() { + let Some(api) = api() else { return }; + if !write_enabled() { + return; + } + let username = cached_username().await; + let name = test_bucket_name(); + + api.create_bucket( + username, + &name, + huggingface_hub::CreateBucketParams::builder().build(), + ) + .await + .expect("create_bucket failed"); + + let bucket = api.bucket(username, &name); + + let entries: Vec<_> = bucket + .list_tree("", huggingface_hub::ListTreeParams::builder().build()) + .collect::>() + .await + .into_iter() + .collect::>>() + .expect("list_tree failed"); + + assert!(entries.is_empty(), "new bucket should have no files"); + + bucket.delete().await.unwrap(); +} + +#[tokio::test] +async fn test_get_xet_write_and_read_token() { + let Some(api) = api() else { return }; + if !write_enabled() { + return; + } + let username = cached_username().await; + let name = test_bucket_name(); + + api.create_bucket( + username, + &name, + huggingface_hub::CreateBucketParams::builder().build(), + ) + .await + .unwrap(); + + let bucket = api.bucket(username, &name); + + let write_tok = bucket.get_xet_write_token().await.expect("xet write token failed"); + assert!(!write_tok.token.is_empty()); + assert!(!write_tok.cas_url.is_empty()); + + let read_tok = bucket.get_xet_read_token().await.expect("xet read token failed"); + assert!(!read_tok.token.is_empty()); + + bucket.delete().await.unwrap(); +} +``` + +- [ ] **Step 2: Run the integration tests without credentials (they should be skipped)** + +```bash +cargo test -p huggingface_hub --test integration_test test_list_buckets test_create_and_delete_bucket test_bucket_list_tree test_get_xet 2>&1 +``` + +Expected: all 4 tests report "ok" (they exit early due to missing `HF_TOKEN`). + +- [ ] **Step 3: Run the full library test suite to check for regressions** + +```bash +cargo test -p huggingface_hub 2>&1 | grep -E "FAILED|error\[" | head -20 +``` + +Expected: no failures. 
+ +- [ ] **Step 4: Commit** + +```bash +git add huggingface_hub/tests/integration_test.rs +git commit -m "test(bucket): add integration tests for create, get, update, delete, list_tree, xet tokens" +``` + +--- + +## Self-Review + +After all tasks are complete, run the full suite one final time: + +```bash +cargo test -p huggingface_hub 2>&1 +cargo test -p huggingface_hub --features blocking 2>&1 +cargo test -p huggingface_hub --features xet 2>&1 +cargo clippy -p huggingface_hub -- -D warnings 2>&1 +``` + +All expected clean. diff --git a/docs/specs/2026-04-08-hf-bucket-rust-client-design.md b/docs/specs/2026-04-08-hf-bucket-rust-client-design.md new file mode 100644 index 0000000..a5da653 --- /dev/null +++ b/docs/specs/2026-04-08-hf-bucket-rust-client-design.md @@ -0,0 +1,437 @@ +# HFBucket Rust Client Design + +**Date:** 2026-04-08 +**Repo:** `huggingface/huggingface_hub_rust` +**Scope:** New `HFBucket` type + `HFClient` extensions + supporting types and error variants + +## Overview + +Add a `HFBucket` type to `huggingface_hub_rust` that exposes the HuggingFace Storage Buckets API (moon-landing). Buckets use content-addressable Xet storage rather than Git, making `HFRepository` the wrong abstraction — `HFBucket` is a separate handle type following the `HFSpace` precedent. + +This spec covers the raw API surface only (option A). Higher-level upload abstractions (wrapping the Xet write token + batch commit flow into a single `upload_file` call) are deferred to a follow-up. + +**Reference implementation:** `s3-gateway/src/hub_client/` in `huggingface/xet-catalogue`. + +--- + +## Module Structure + +Two new files, wired into their respective `mod.rs` files: + +``` +huggingface_hub/src/ +├── api/ +│ └── buckets.rs — HFBucket impl, HFClient::bucket / create_bucket / list_buckets +├── types/ +│ └── buckets.rs — all request/response types +``` + +`lib.rs` exports `HFBucket` at the crate root. No feature flag — buckets are part of the default library surface. 
+ +--- + +## `HFBucket` Type + +```rust +pub struct HFBucket { + pub(crate) inner: Arc, + pub namespace: String, + pub repo: String, +} +``` + +Constructed via `HFClient::bucket()` — no I/O, no allocation beyond the string copies. + +### `HFClient` extensions + +```rust +// Constructs a bucket handle +pub fn bucket(&self, namespace: impl Into, repo: impl Into) -> HFBucket + +// POST /api/buckets/:ns/:repo +pub async fn create_bucket( + &self, + namespace: &str, + repo: &str, + params: CreateBucketParams, +) -> Result + +// GET /api/buckets/:ns — Link-header paginated stream +pub fn list_buckets(&self, namespace: &str) -> impl Stream> +``` + +### `HFBucket` methods + +```rust +// GET /api/buckets/:ns/:repo +pub async fn get(&self) -> Result + +// DELETE /api/buckets/:ns/:repo +pub async fn delete(&self) -> Result<()> + +// PUT /api/buckets/:ns/:repo/settings +pub async fn update_settings(&self, params: UpdateBucketParams) -> Result<()> + +// POST /api/buckets/:ns/:repo/batch (NDJSON) +pub async fn batch_files(&self, ops: Vec) -> Result + +// GET /api/buckets/:ns/:repo/tree[/:path] — cursor-from-body paginated stream +pub fn list_tree(&self, path: &str, params: ListTreeParams) -> impl Stream> + +// POST /api/buckets/:ns/:repo/paths-info +pub async fn get_paths_info(&self, paths: Vec) -> Result> + +// GET /api/buckets/:ns/:repo/xet-write-token +pub async fn get_xet_write_token(&self) -> Result + +// GET /api/buckets/:ns/:repo/xet-read-token +pub async fn get_xet_read_token(&self) -> Result + +// GET /buckets/:ns/:repo/resolve/:path (no /api/ prefix) +pub async fn resolve_file(&self, path: &str) -> Result + +// GET /buckets/:ns/:repo/resolve/:path with Xet Accept header +#[cfg(feature = "xet")] +pub async fn xet_resolve_file(&self, path: &str) -> Result +``` + +--- + +## Types (`src/types/buckets.rs`) + +### Parameter types + +All use `TypedBuilder`. `cursor` is omitted from list params — streaming handles pagination internally. 
+ +```rust +#[derive(TypedBuilder, Serialize)] +pub struct CreateBucketParams { + #[builder(default, setter(strip_option))] + pub private: Option, + #[builder(default, setter(strip_option, into))] + #[serde(rename = "resourceGroupId", skip_serializing_if = "Option::is_none")] + pub resource_group_id: Option, + #[builder(default)] + #[serde(skip_serializing_if = "Vec::is_empty")] + pub cdn: Vec, +} + +#[derive(TypedBuilder, Serialize)] +pub struct UpdateBucketParams { + #[builder(default, setter(strip_option))] + #[serde(skip_serializing_if = "Option::is_none")] + pub private: Option, + #[builder(default, setter(strip_option))] + #[serde(rename = "cdnRegions", skip_serializing_if = "Option::is_none")] + pub cdn_regions: Option>, +} + +#[derive(TypedBuilder)] +pub struct ListTreeParams { + #[builder(default, setter(strip_option))] + pub limit: Option, + #[builder(default)] + pub recursive: bool, +} +``` + +### Response types + +```rust +#[derive(Debug, Deserialize)] +pub struct BucketCreated { + pub url: String, + pub name: String, + pub id: String, +} + +#[derive(Debug, Deserialize)] +pub struct BucketInfo { + pub id: String, + pub name: String, + pub namespace: String, + pub private: bool, + #[serde(rename = "usedStorage")] + pub used_storage: u64, + #[serde(rename = "totalFiles")] + pub total_files: u64, + pub cdn: Vec, + pub region: String, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct CdnRegion { + pub provider: String, + pub region: String, +} + +#[derive(Debug, Deserialize)] +pub struct XetToken { + pub token: String, + #[serde(rename = "casUrl")] + pub cas_url: String, + #[serde(rename = "expiresAt")] + pub expires_at: String, +} + +#[derive(Debug, Deserialize)] +pub struct PathInfo { + pub path: String, + pub size: u64, + #[serde(rename = "xetHash")] + pub xet_hash: String, + #[serde(rename = "contentType")] + pub content_type: String, + pub mtime: i64, +} + +#[derive(Debug, Deserialize)] +pub struct TreeEntry { + #[serde(rename = 
"type")] + pub entry_type: EntryType, + pub path: String, + pub size: Option, + #[serde(rename = "xetHash")] + pub xet_hash: Option, + #[serde(rename = "contentType")] + pub content_type: Option, +} + +#[derive(Debug, Deserialize)] +#[serde(rename_all = "lowercase")] +pub enum EntryType { File, Directory } +``` + +### `BucketOverview` (returned by `list_buckets`) + +`list_buckets` yields `BucketOverview`, which is distinct from `BucketInfo` returned by `get()`. The `id` field is the full `"namespace/repo"` string. + +```rust +#[derive(Debug, Deserialize)] +pub struct BucketOverview { + #[serde(rename = "_id")] + pub mongo_id: String, + pub id: String, // "namespace/repo" + pub author: String, + pub private: Option, // nullable + #[serde(rename = "repoType")] + pub repo_type: String, // always "bucket" + #[serde(rename = "createdAt")] + pub created_at: String, + #[serde(rename = "updatedAt")] + pub updated_at: String, + pub size: u64, + #[serde(rename = "totalFiles")] + pub total_files: u64, + #[serde(rename = "cdnRegions")] + pub cdn_regions: Vec, + #[serde(rename = "resourceGroup")] + pub resource_group: Option, +} + +#[derive(Debug, Deserialize)] +pub struct ResourceGroup { + pub id: String, + pub name: String, + #[serde(rename = "numUsers")] + pub num_users: Option, +} +``` + +An internal `TreePage` struct (not public) is used for `list_tree` pagination: + +```rust +#[derive(Deserialize)] +struct TreePage { + entries: Vec, + #[serde(rename = "nextCursor")] + next_cursor: Option, +} +``` + +### Batch types + +The protocol requires all `addFile` entries to precede any `deleteFile` entries. Enforced in `batch_files` via partition before serialization. 
+ +```rust +#[derive(Debug, Serialize)] +#[serde(tag = "type")] +pub enum BatchOp { + #[serde(rename = "addFile")] + AddFile(AddFileOp), + #[serde(rename = "deleteFile")] + DeleteFile(DeleteFileOp), +} + +#[derive(Debug, Serialize)] +pub struct AddFileOp { + pub path: String, + #[serde(rename = "xetHash")] + pub xet_hash: String, + #[serde(rename = "contentType")] + pub content_type: String, + #[serde(skip_serializing_if = "Option::is_none")] + pub mtime: Option, +} + +#[derive(Debug, Serialize)] +pub struct DeleteFileOp { + pub path: String, +} + +#[derive(Debug, Deserialize)] +pub struct BatchResult { + pub success: bool, + pub processed: u32, + pub succeeded: u32, + pub failed: Vec, +} + +#[derive(Debug, Deserialize)] +pub struct BatchFailure { + pub path: String, + pub error: String, +} +``` + +### `ResolvedFile` + +Constructed from redirect response headers, not a JSON body. The `Link` header contains two entries identified by `rel` name: + +``` +Link: ; rel="xet-auth", ; rel="xet-reconstruction-info" +``` + +```rust +#[derive(Debug)] +pub struct ResolvedFile { + pub url: String, // Location header + pub size: Option, // X-Linked-Size + pub xet_hash: Option, // X-XET-Hash + pub etag: Option, // X-Linked-ETag + pub last_modified: Option, // Last-Modified + pub xet_auth_url: Option, // Link rel="xet-auth" + pub xet_reconstruction_url: Option, // Link rel="xet-reconstruction-info" +} +``` + +### `XetFileInfo` (feature = `"xet"`) + +```rust +#[derive(Debug, Deserialize)] +pub struct XetFileInfo { + pub hash: String, + #[serde(rename = "refreshUrl")] + pub refresh_url: String, + #[serde(rename = "reconstructionUrl")] + pub reconstruction_url: String, + pub etag: String, + pub size: u64, + #[serde(rename = "contentType")] + pub content_type: String, +} +``` + +--- + +## Pagination + +**`list_buckets`** returns a JSON array with pagination via `Link` response headers (`rel="next"`), identical to the model/dataset list endpoints. 
The existing `paginate()` helper can be reused directly. + +**`list_tree`** returns a JSON object `{ entries, nextCursor }` with cursor-in-body pagination. It uses `futures::stream::try_unfold` over cursor-from-body pagination, the same pattern as `pagination.rs` but reading `next_cursor` from the deserialized body rather than a `Link` header. + +`list_tree` query params: `limit` (if set), `recursive` (if true), `cursor` (if continuing). Path suffix appended only when non-empty: + +``` +/api/buckets/:ns/:repo/tree (empty path) +/api/buckets/:ns/:repo/tree/:path (non-empty path) +``` + +Both streams are lazy — no HTTP request is made until the caller polls the first item. + +--- + +## `resolve_file` and `xet_resolve_file` + +**`resolve_file`** uses `inner.no_redirect_client` (already on `HFClientInner`) to prevent automatic redirect following. Expects a 3xx response. Reads `Location` and the metadata headers to populate `ResolvedFile`. Any non-redirect response (including 200) is passed to the standard error handler. + +**`xet_resolve_file`** (feature = `"xet"`) uses the regular client. Sends `Accept: application/vnd.xet-fileinfo+json`. Expects a 200 JSON body deserializing to `XetFileInfo`. + +--- + +## `batch_files` NDJSON + +``` +POST /api/buckets/:ns/:repo/batch +Content-Type: application/x-ndjson +``` + +Implementation: +1. Partition `ops` into adds and deletes (preserving within-group order). +2. Serialize each op with `serde_json::to_string`, append `\n`. +3. Concatenate into a single string body — adds first, then deletes. +4. Serialization errors surface as `HFError::Json`. + +--- + +## Error Handling + +Four new variants added to `HFError`. They will only be emitted by bucket API methods in this PR; updating existing non-bucket methods to use them is out of scope. + +```rust +pub enum HFError { + // ... existing variants ... 
+ #[error("forbidden")] + Forbidden, + #[error("conflict: {0}")] + Conflict(String), // carries response body + #[error("rate limited")] + RateLimited, + #[error("quota exceeded")] + QuotaExceeded, +} +``` + +Full status mapping for bucket methods: + +| Status | `HFError` variant | +|--------|-------------------| +| 401 | `AuthRequired` (existing) | +| 403 | `Forbidden` (new) | +| 404 on bucket | `RepoNotFound { repo_id: "ns/repo" }` (existing) | +| 404 on file | `EntryNotFound { path, repo_id: "ns/repo" }` (existing) | +| 409 | `Conflict(body)` (new) | +| 429 | `RateLimited` (new) | +| 507 | `QuotaExceeded` (new) | +| other | `Http { status, url, body }` (existing) | + +The 404 distinction is made at the call site: bucket-level methods (`get`, `delete`, `update_settings`) use `RepoNotFound`; file-level methods (`get_paths_info`, `resolve_file`) use `EntryNotFound`. + +--- + +## Blocking API + +All non-streaming `HFBucket` methods and `HFClient::create_bucket` get sync wrappers via the existing `sync_api!` macro. Streaming methods (`list_buckets`, `list_tree`) use `sync_api_stream!`, which wraps the async stream in a blocking iterator. + +`HFClientSync::bucket()` returns `HFBucketSync`. + +--- + +## Testing + +**Unit tests** in `#[cfg(test)]` within `api/buckets.rs`: +- URL construction for each endpoint (including the path-suffix logic in `list_tree`) +- `resolve_file` header parsing (Location, X-Linked-Size, X-XET-Hash, Link) +- `batch_files` NDJSON ordering (adds before deletes) + +**Integration tests** in `tests/integration_test.rs`, following existing patterns: +- Skip if `HF_TOKEN` absent +- Write operations (`create_bucket`, `batch_files`, `delete`) behind `HF_TEST_WRITE=1` +- Tests create and tear down their own bucket — no dependency on pre-existing test fixtures +- One integration test per public method; `list_buckets` and `list_tree` collect the stream into a `Vec` and assert on shape/contents + +--- + +## Open Items + +None. 
diff --git a/huggingface_hub/src/api/buckets.rs b/huggingface_hub/src/api/buckets.rs
new file mode 100644
index 0000000..89e5c2b
--- /dev/null
+++ b/huggingface_hub/src/api/buckets.rs
@@ -0,0 +1,538 @@
+use std::collections::VecDeque;
+
+use futures::Stream;
+
+use crate::error::{HFError, NotFoundContext};
+use crate::repository::HFBucket;
+use crate::types::{
+    BatchOp, BatchResult, BucketCreated, BucketInfo, BucketOverview, CreateBucketParams, ListTreeParams, PathInfo,
+    ResolvedFile, TreeEntry, TreePage, UpdateBucketParams, XetToken,
+};
+use crate::{HFClient, Result};
+
+/// Maps HTTP status codes to `HFError` variants for bucket API responses.
+/// Bucket-level 404s map to `RepoNotFound`; file-level 404s map to `EntryNotFound`.
+pub(crate) async fn check_bucket_response(
+    response: reqwest::Response,
+    repo_id: &str,
+    not_found_ctx: NotFoundContext,
+) -> Result<reqwest::Response> {
+    if response.status().is_success() {
+        return Ok(response);
+    }
+    let status = response.status();
+    let url = response.url().to_string();
+    let body = response.text().await.unwrap_or_default();
+    Err(match status.as_u16() {
+        401 => HFError::AuthRequired,
+        403 => HFError::Forbidden,
+        404 => match not_found_ctx {
+            NotFoundContext::Repo => HFError::RepoNotFound {
+                repo_id: repo_id.to_string(),
+            },
+            NotFoundContext::Entry { path } => HFError::EntryNotFound {
+                path,
+                repo_id: repo_id.to_string(),
+            },
+            _ => HFError::Http { status, url, body },
+        },
+        409 => HFError::Conflict(body),
+        429 => HFError::RateLimited,
+        507 => HFError::QuotaExceeded,
+        _ => HFError::Http { status, url, body },
+    })
+}
+
+impl HFBucket {
+    fn repo_id(&self) -> String {
+        format!("{}/{}", self.namespace, self.repo)
+    }
+
+    fn bucket_url(&self) -> String {
+        format!("{}/api/buckets/{}/{}", self.client.inner.endpoint, self.namespace, self.repo)
+    }
+
+    /// Returns metadata about this bucket.
+    pub async fn get(&self) -> Result<BucketInfo> {
+        let resp = self
+            .client
+            .inner
+            .client
+            .get(self.bucket_url())
+            .headers(self.client.auth_headers())
+            .send()
+            .await?;
+        let resp = check_bucket_response(resp, &self.repo_id(), NotFoundContext::Repo).await?;
+        Ok(resp.json().await?)
+    }
+
+    /// Permanently deletes this bucket and all its files.
+    pub async fn delete(&self) -> Result<()> {
+        let resp = self
+            .client
+            .inner
+            .client
+            .delete(self.bucket_url())
+            .headers(self.client.auth_headers())
+            .send()
+            .await?;
+        check_bucket_response(resp, &self.repo_id(), NotFoundContext::Repo).await?;
+        Ok(())
+    }
+
+    /// Updates visibility or CDN configuration for this bucket.
+    pub async fn update_settings(&self, params: UpdateBucketParams) -> Result<()> {
+        let resp = self
+            .client
+            .inner
+            .client
+            .put(format!("{}/settings", self.bucket_url()))
+            .headers(self.client.auth_headers())
+            .json(&params)
+            .send()
+            .await?;
+        check_bucket_response(resp, &self.repo_id(), NotFoundContext::Repo).await?;
+        Ok(())
+    }
+
+    /// Adds and/or removes files in a single atomic operation.
+    ///
+    /// All `AddFile` operations are sent before `DeleteFile` operations, as required
+    /// by the batch protocol. The input order within each group is preserved.
+    pub async fn batch_files(&self, ops: Vec<BatchOp>) -> Result<BatchResult> {
+        let (adds, deletes): (Vec<_>, Vec<_>) = ops.into_iter().partition(|op| matches!(op, BatchOp::AddFile(_)));
+
+        let ndjson = adds
+            .iter()
+            .chain(deletes.iter())
+            .map(|op| serde_json::to_string(op).map(|s| s + "\n"))
+            .collect::<std::result::Result<String, _>>()?;
+
+        let resp = self
+            .client
+            .inner
+            .client
+            .post(format!("{}/batch", self.bucket_url()))
+            .headers(self.client.auth_headers())
+            .header("content-type", "application/x-ndjson")
+            .body(ndjson)
+            .send()
+            .await?;
+
+        let resp = check_bucket_response(resp, &self.repo_id(), NotFoundContext::Repo).await?;
+        Ok(resp.json().await?)
+    }
+
+    /// Lists files and directories, yielding one entry at a time.
+    ///
+    /// Uses cursor-in-body pagination: the stream fetches the next page automatically
+    /// when the current page's entries are exhausted. No request is made until the
+    /// first item is polled.
+    pub fn list_tree(&self, path: &str, params: ListTreeParams) -> impl Stream<Item = Result<TreeEntry>> + '_ {
+        let base_url = if path.is_empty() {
+            format!("{}/api/buckets/{}/{}/tree", self.client.inner.endpoint, self.namespace, self.repo)
+        } else {
+            format!("{}/api/buckets/{}/{}/tree/{}", self.client.inner.endpoint, self.namespace, self.repo, path)
+        };
+        let repo_id = self.repo_id();
+
+        futures::stream::try_unfold(
+            // State: (entries buffered from the current page, cursor for the next
+            // page, whether at least one page has been fetched yet).
+            (VecDeque::<TreeEntry>::new(), None::<String>, false),
+            move |(mut pending, cursor, fetched)| {
+                let client = self.client.clone();
+                let repo_id = repo_id.clone();
+                let base_url = base_url.clone();
+                async move {
+                    // Drain the buffered page before touching the network.
+                    if let Some(entry) = pending.pop_front() {
+                        return Ok(Some((entry, (pending, cursor, fetched))));
+                    }
+                    let mut cursor = cursor;
+                    let mut fetched = fetched;
+                    // Loop (rather than a single fetch) so that a page that is
+                    // empty but still carries a `nextCursor` does not terminate
+                    // the stream early: keep requesting pages until we either
+                    // obtain an entry or run out of cursors.
+                    loop {
+                        if fetched && cursor.is_none() {
+                            return Ok(None);
+                        }
+                        let mut req = client.inner.client.get(&base_url).headers(client.auth_headers());
+                        if let Some(ref c) = cursor {
+                            req = req.query(&[("cursor", c.as_str())]);
+                        }
+                        if let Some(l) = params.limit {
+                            req = req.query(&[("limit", l.to_string().as_str())]);
+                        }
+                        if params.recursive {
+                            req = req.query(&[("recursive", "true")]);
+                        }
+                        let resp = req.send().await?;
+                        let resp = check_bucket_response(resp, &repo_id, NotFoundContext::Repo).await?;
+                        let page: TreePage = resp.json().await?;
+                        cursor = page.next_cursor;
+                        fetched = true;
+                        pending.extend(page.entries);
+                        if let Some(entry) = pending.pop_front() {
+                            return Ok(Some((entry, (pending, cursor, fetched))));
+                        }
+                    }
+                }
+            },
+        )
+    }
+
+    /// Returns metadata for a batch of file paths.
+    pub async fn get_paths_info(&self, paths: Vec<String>) -> Result<Vec<PathInfo>> {
+        #[derive(serde::Serialize)]
+        struct Body {
+            paths: Vec<String>,
+        }
+
+        let resp = self
+            .client
+            .inner
+            .client
+            .post(format!("{}/paths-info", self.bucket_url()))
+            .headers(self.client.auth_headers())
+            .json(&Body { paths })
+            .send()
+            .await?;
+
+        let resp = check_bucket_response(resp, &self.repo_id(), NotFoundContext::Entry { path: String::new() }).await?;
+        Ok(resp.json().await?)
+    }
+
+    /// Returns a short-lived JWT for uploading files to the Xet CAS.
+    /// Use the returned `cas_url` and `token` to push file bytes before calling `batch_files`.
+    pub async fn get_xet_write_token(&self) -> Result<XetToken> {
+        let resp = self
+            .client
+            .inner
+            .client
+            .get(format!("{}/xet-write-token", self.bucket_url()))
+            .headers(self.client.auth_headers())
+            .send()
+            .await?;
+        let resp = check_bucket_response(resp, &self.repo_id(), NotFoundContext::Repo).await?;
+        Ok(resp.json().await?)
+    }
+
+    /// Returns a short-lived JWT for downloading files from the Xet CAS directly.
+    pub async fn get_xet_read_token(&self) -> Result<XetToken> {
+        let resp = self
+            .client
+            .inner
+            .client
+            .get(format!("{}/xet-read-token", self.bucket_url()))
+            .headers(self.client.auth_headers())
+            .send()
+            .await?;
+        let resp = check_bucket_response(resp, &self.repo_id(), NotFoundContext::Repo).await?;
+        Ok(resp.json().await?)
+    }
+
+    /// Resolves a file path to a direct download URL.
+    ///
+    /// Uses the no-redirect client to capture the 302 `Location` header rather than
+    /// following it. Metadata is extracted from response headers:
+    /// `X-Linked-Size`, `X-XET-Hash`, `X-Linked-ETag`, `Last-Modified`, and `Link`.
+    pub async fn resolve_file(&self, path: &str) -> Result<ResolvedFile> {
+        let url = format!("{}/buckets/{}/{}/resolve/{}", self.client.inner.endpoint, self.namespace, self.repo, path);
+        let resp = self
+            .client
+            .inner
+            .no_redirect_client
+            .get(&url)
+            .headers(self.client.auth_headers())
+            .send()
+            .await?;
+
+        if !resp.status().is_redirection() {
+            return Err(check_bucket_response(
+                resp,
+                &self.repo_id(),
+                NotFoundContext::Entry { path: path.to_string() },
+            )
+            .await
+            .unwrap_err());
+        }
+
+        let headers = resp.headers();
+
+        let location = headers
+            .get("location")
+            .and_then(|v| v.to_str().ok())
+            .map(str::to_owned)
+            .ok_or_else(|| HFError::Http {
+                status: resp.status(),
+                url: url.clone(),
+                body: "missing Location header".to_string(),
+            })?;
+
+        let size = headers
+            .get("x-linked-size")
+            .and_then(|v| v.to_str().ok())
+            .and_then(|s| s.parse::<u64>().ok());
+
+        let xet_hash = headers.get("x-xet-hash").and_then(|v| v.to_str().ok()).map(str::to_owned);
+
+        let etag = headers.get("x-linked-etag").and_then(|v| v.to_str().ok()).map(str::to_owned);
+
+        let last_modified = headers.get("last-modified").and_then(|v| v.to_str().ok()).map(str::to_owned);
+
+        let mut xet_auth_url = None;
+        let mut xet_reconstruction_url = None;
+        if let Some(link) = headers.get("link").and_then(|v| v.to_str().ok()) {
+            for part in link.split(',') {
+                let part = part.trim();
+                if let Some((url_part, rel_part)) = part.split_once(';') {
+                    let u = url_part.trim().trim_start_matches('<').trim_end_matches('>').to_string();
+                    if rel_part.contains("xet-auth") {
+                        xet_auth_url = Some(u);
+                    } else if rel_part.contains("xet-reconstruction-info") {
+                        xet_reconstruction_url = Some(u);
+                    }
+                }
+            }
+        }
+
+        Ok(ResolvedFile {
+            url: location,
+            size,
+            xet_hash,
+            etag,
+            last_modified,
+            xet_auth_url,
+            xet_reconstruction_url,
+        })
+    }
+
+    /// Resolves a file path and returns Xet reconstruction metadata.
+    ///
+    /// Sends `Accept: application/vnd.xet-fileinfo+json` to request the JSON response
+    /// instead of a redirect. Use the returned `reconstruction_url` to fetch chunk data
+    /// from the Xet CAS directly.
+    #[cfg(feature = "xet")]
+    pub async fn xet_resolve_file(&self, path: &str) -> Result<XetFileInfo> {
+        let url = format!("{}/buckets/{}/{}/resolve/{}", self.client.inner.endpoint, self.namespace, self.repo, path);
+        let resp = self
+            .client
+            .inner
+            .client
+            .get(&url)
+            .headers(self.client.auth_headers())
+            .header("accept", "application/vnd.xet-fileinfo+json")
+            .send()
+            .await?;
+        let resp =
+            check_bucket_response(resp, &self.repo_id(), NotFoundContext::Entry { path: path.to_string() }).await?;
+        Ok(resp.json().await?)
+    }
+}
+
+impl HFClient {
+    /// Creates a new bucket owned by `namespace`.
+    pub async fn create_bucket(
+        &self,
+        namespace: &str,
+        repo: &str,
+        params: CreateBucketParams,
+    ) -> Result<BucketCreated> {
+        let url = format!("{}/api/buckets/{}/{}", self.inner.endpoint, namespace, repo);
+        let resp = self
+            .inner
+            .client
+            .post(&url)
+            .headers(self.auth_headers())
+            .json(&params)
+            .send()
+            .await?;
+        let repo_id = format!("{}/{}", namespace, repo);
+        let resp = check_bucket_response(resp, &repo_id, NotFoundContext::Repo).await?;
+        Ok(resp.json().await?)
+    }
+
+    /// Returns a paginated stream of all buckets owned by `namespace`.
+    /// Pagination is driven by `Link` response headers.
+ pub fn list_buckets(&self, namespace: &str) -> impl futures::Stream> + '_ { + let url = url::Url::parse(&format!("{}/api/buckets/{}", self.inner.endpoint, namespace)) + .expect("endpoint is a valid base URL"); + self.paginate(url, vec![], None) + } +} + +#[cfg(test)] +mod tests { + use crate::HFClientBuilder; + + #[test] + fn bucket_constructor_sets_namespace_and_repo() { + let client = HFClientBuilder::new().build().unwrap(); + let bucket = client.bucket("myuser", "my-bucket"); + assert_eq!(bucket.namespace, "myuser"); + assert_eq!(bucket.repo, "my-bucket"); + } + + #[test] + fn get_bucket_url() { + let client = HFClientBuilder::new().build().unwrap(); + let bucket = client.bucket("myuser", "my-bucket"); + let url = format!("{}/api/buckets/{}/{}", bucket.client.inner.endpoint, bucket.namespace, bucket.repo); + assert!(url.ends_with("/api/buckets/myuser/my-bucket")); + } + + #[test] + fn update_settings_url() { + let client = HFClientBuilder::new().build().unwrap(); + let bucket = client.bucket("myuser", "my-bucket"); + let url = format!("{}/api/buckets/{}/{}/settings", bucket.client.inner.endpoint, bucket.namespace, bucket.repo); + assert!(url.ends_with("/api/buckets/myuser/my-bucket/settings")); + } + + #[test] + fn create_bucket_url() { + let client = HFClientBuilder::new().build().unwrap(); + let url = format!("{}/api/buckets/{}/{}", client.inner.endpoint, "myuser", "new-bucket"); + assert!(url.ends_with("/api/buckets/myuser/new-bucket")); + } + + #[test] + fn list_buckets_url() { + let client = HFClientBuilder::new().build().unwrap(); + let url = format!("{}/api/buckets/{}", client.inner.endpoint, "myuser"); + assert!(url.ends_with("/api/buckets/myuser")); + } + + #[test] + fn batch_files_ndjson_adds_before_deletes() { + use crate::types::{AddFileOp, BatchOp, DeleteFileOp}; + + let ops = vec![ + BatchOp::DeleteFile(DeleteFileOp { + path: "old.parquet".to_string(), + }), + BatchOp::AddFile(AddFileOp { + path: "new.parquet".to_string(), + xet_hash: 
"abc".to_string(), + content_type: "application/octet-stream".to_string(), + mtime: None, + }), + ]; + let (adds, deletes): (Vec<_>, Vec<_>) = ops.into_iter().partition(|op| matches!(op, BatchOp::AddFile(_))); + let ndjson: String = adds + .iter() + .chain(deletes.iter()) + .map(|op| serde_json::to_string(op).map(|s| s + "\n")) + .collect::>() + .unwrap(); + let lines: Vec<&str> = ndjson.lines().collect(); + assert_eq!(lines.len(), 2); + assert!(lines[0].contains("addFile"), "first line must be addFile, got: {}", lines[0]); + assert!(lines[1].contains("deleteFile"), "second line must be deleteFile"); + } + + #[test] + fn batch_files_each_line_ends_with_newline() { + use crate::types::{AddFileOp, BatchOp}; + let ops = vec![BatchOp::AddFile(AddFileOp { + path: "f.parquet".to_string(), + xet_hash: "h".to_string(), + content_type: "application/octet-stream".to_string(), + mtime: None, + })]; + let (adds, deletes): (Vec<_>, Vec<_>) = ops.into_iter().partition(|op| matches!(op, BatchOp::AddFile(_))); + let ndjson: String = adds + .iter() + .chain(deletes.iter()) + .map(|op| serde_json::to_string(op).map(|s| s + "\n")) + .collect::>() + .unwrap(); + assert!(ndjson.ends_with('\n')); + } + + #[test] + fn list_tree_url_empty_path() { + let client = HFClientBuilder::new().build().unwrap(); + let bucket = client.bucket("myuser", "my-bucket"); + let url = if "".is_empty() { + format!("{}/api/buckets/{}/{}/tree", bucket.client.inner.endpoint, bucket.namespace, bucket.repo) + } else { + format!( + "{}/api/buckets/{}/{}/tree/{}", + bucket.client.inner.endpoint, bucket.namespace, bucket.repo, "some/path" + ) + }; + assert!(url.ends_with("/api/buckets/myuser/my-bucket/tree")); + } + + #[test] + fn list_tree_url_with_path() { + let client = HFClientBuilder::new().build().unwrap(); + let bucket = client.bucket("myuser", "my-bucket"); + let path = "data/sub"; + let url = + format!("{}/api/buckets/{}/{}/tree/{}", bucket.client.inner.endpoint, bucket.namespace, bucket.repo, path); + 
+        assert!(url.ends_with("/api/buckets/myuser/my-bucket/tree/data/sub"));
+    }
+
+    #[test]
+    fn xet_token_urls() {
+        let client = HFClientBuilder::new().build().unwrap();
+        let bucket = client.bucket("myuser", "my-bucket");
+        let write_url = format!(
+            "{}/api/buckets/{}/{}/xet-write-token",
+            bucket.client.inner.endpoint, bucket.namespace, bucket.repo
+        );
+        let read_url =
+            format!("{}/api/buckets/{}/{}/xet-read-token", bucket.client.inner.endpoint, bucket.namespace, bucket.repo);
+        assert!(write_url.ends_with("/xet-write-token"));
+        assert!(read_url.ends_with("/xet-read-token"));
+    }
+
+    #[test]
+    fn paths_info_url() {
+        let client = HFClientBuilder::new().build().unwrap();
+        let bucket = client.bucket("myuser", "my-bucket");
+        let url =
+            format!("{}/api/buckets/{}/{}/paths-info", bucket.client.inner.endpoint, bucket.namespace, bucket.repo);
+        assert!(url.ends_with("/paths-info"));
+    }
+
+    #[test]
+    fn resolve_file_parses_link_header() {
+        let link = r#"<https://auth.example.com/token>; rel="xet-auth", <https://xet.example.com/reconstruct/abc>; rel="xet-reconstruction-info""#;
+        let mut xet_auth = None;
+        let mut xet_reconstruction = None;
+        for part in link.split(',') {
+            let part = part.trim();
+            if let Some((url_part, rel_part)) = part.split_once(';') {
+                let url = url_part.trim().trim_start_matches('<').trim_end_matches('>').to_string();
+                let rel = rel_part.trim();
+                if rel.contains("xet-auth") {
+                    xet_auth = Some(url);
+                } else if rel.contains("xet-reconstruction-info") {
+                    xet_reconstruction = Some(url);
+                }
+            }
+        }
+        assert_eq!(xet_auth.unwrap(), "https://auth.example.com/token");
+        assert_eq!(xet_reconstruction.unwrap(), "https://xet.example.com/reconstruct/abc");
+    }
+
+    #[test]
+    fn resolve_file_url() {
+        let client = HFClientBuilder::new().build().unwrap();
+        let bucket = client.bucket("myuser", "my-bucket");
+        let url = format!(
+            "{}/buckets/{}/{}/resolve/{}",
+            bucket.client.inner.endpoint, bucket.namespace, bucket.repo, "data/train.parquet"
+        );
assert!(url.contains("/buckets/myuser/my-bucket/resolve/data/train.parquet")); + assert!(!url.contains("/api/")); + } + + #[cfg(feature = "xet")] + #[test] + fn xet_resolve_file_url() { + let client = HFClientBuilder::new().build().unwrap(); + let bucket = client.bucket("myuser", "my-bucket"); + let url = format!( + "{}/buckets/{}/{}/resolve/{}", + bucket.client.inner.endpoint, bucket.namespace, bucket.repo, "data/train.parquet" + ); + assert!(url.contains("/buckets/myuser/my-bucket/resolve/data/train.parquet")); + } +} diff --git a/huggingface_hub/src/api/mod.rs b/huggingface_hub/src/api/mod.rs index 105ac7e..200817c 100644 --- a/huggingface_hub/src/api/mod.rs +++ b/huggingface_hub/src/api/mod.rs @@ -1,3 +1,4 @@ +pub mod buckets; pub mod commits; pub mod files; pub mod repo; diff --git a/huggingface_hub/src/blocking.rs b/huggingface_hub/src/blocking.rs index fd0bc0b..9881792 100644 --- a/huggingface_hub/src/blocking.rs +++ b/huggingface_hub/src/blocking.rs @@ -63,6 +63,15 @@ pub struct HFSpaceSync { space: repo::HFSpace, } +/// Synchronous handle for Storage Bucket operations. +/// +/// Obtain via [`HFClientSync::bucket`]. All methods block the current thread. +#[derive(Clone)] +pub struct HFBucketSync { + pub(crate) inner: crate::repository::HFBucket, + pub(crate) runtime: Arc, +} + impl fmt::Debug for HFClientSync { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.debug_struct("HFClientSync").finish() @@ -137,6 +146,27 @@ impl HFClientSync { pub fn space(&self, owner: impl Into, name: impl Into) -> HFSpaceSync { HFSpaceSync::new(self.clone(), owner, name) } + + /// Creates a synchronous bucket handle. 
+ pub fn bucket(&self, namespace: impl Into, repo: impl Into) -> HFBucketSync { + HFBucketSync { + inner: self.inner.bucket(namespace, repo), + runtime: self.runtime.clone(), + } + } + + pub fn create_bucket( + &self, + namespace: &str, + repo: &str, + params: crate::types::CreateBucketParams, + ) -> Result { + self.runtime.block_on(self.inner.create_bucket(namespace, repo, params)) + } + + pub fn list_buckets(&self, namespace: &str) -> Result> { + collect_stream(self.runtime.as_ref(), self.inner.list_buckets(namespace)) + } } impl HFRepositorySync { @@ -313,6 +343,49 @@ impl HFRepositorySync { } } +impl HFBucketSync { + pub fn get(&self) -> Result { + self.runtime.block_on(self.inner.get()) + } + + pub fn delete(&self) -> Result<()> { + self.runtime.block_on(self.inner.delete()) + } + + pub fn update_settings(&self, params: crate::types::UpdateBucketParams) -> Result<()> { + self.runtime.block_on(self.inner.update_settings(params)) + } + + pub fn batch_files(&self, ops: Vec) -> Result { + self.runtime.block_on(self.inner.batch_files(ops)) + } + + pub fn list_tree(&self, path: &str, params: crate::types::ListTreeParams) -> Result> { + collect_stream(self.runtime.as_ref(), self.inner.list_tree(path, params)) + } + + pub fn get_paths_info(&self, paths: Vec) -> Result> { + self.runtime.block_on(self.inner.get_paths_info(paths)) + } + + pub fn get_xet_write_token(&self) -> Result { + self.runtime.block_on(self.inner.get_xet_write_token()) + } + + pub fn get_xet_read_token(&self) -> Result { + self.runtime.block_on(self.inner.get_xet_read_token()) + } + + pub fn resolve_file(&self, path: &str) -> Result { + self.runtime.block_on(self.inner.resolve_file(path)) + } + + #[cfg(feature = "xet")] + pub fn xet_resolve_file(&self, path: &str) -> Result { + self.runtime.block_on(self.inner.xet_resolve_file(path)) + } +} + impl HFSpaceSync { /// Creates a blocking space handle for the given owner and name. 
pub fn new(client: HFClientSync, owner: impl Into, name: impl Into) -> Self { @@ -427,6 +500,18 @@ impl From for HFRepositorySync { /// Alias for [`HFRepositorySync`]. pub type HFRepoSync = HFRepositorySync; +#[cfg(test)] +mod bucket_tests { + #[test] + fn bucket_sync_constructor() { + use crate::HFClientBuilder; + let client = crate::blocking::HFClientSync::from_api(HFClientBuilder::new().build().unwrap()).unwrap(); + let bucket = client.bucket("myuser", "my-bucket"); + assert_eq!(bucket.inner.namespace, "myuser"); + assert_eq!(bucket.inner.repo, "my-bucket"); + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/huggingface_hub/src/error.rs b/huggingface_hub/src/error.rs index 6b507d0..f83108f 100644 --- a/huggingface_hub/src/error.rs +++ b/huggingface_hub/src/error.rs @@ -61,6 +61,28 @@ pub enum HFError { #[error("{0}")] Other(String), + + #[error("forbidden")] + Forbidden, + #[error("conflict: {0}")] + Conflict(String), + #[error("rate limited")] + RateLimited, + #[error("quota exceeded")] + QuotaExceeded, +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn new_error_variants_display() { + assert_eq!(HFError::Forbidden.to_string(), "forbidden"); + assert_eq!(HFError::Conflict("name taken".to_string()).to_string(), "conflict: name taken"); + assert_eq!(HFError::RateLimited.to_string(), "rate limited"); + assert_eq!(HFError::QuotaExceeded.to_string(), "quota exceeded"); + } } impl HFError { diff --git a/huggingface_hub/src/lib.rs b/huggingface_hub/src/lib.rs index 06322d1..e9fdf52 100644 --- a/huggingface_hub/src/lib.rs +++ b/huggingface_hub/src/lib.rs @@ -83,7 +83,7 @@ pub mod types; pub mod xet; #[cfg(feature = "blocking")] -pub use blocking::{HFClientSync, HFRepoSync, HFRepositorySync, HFSpaceSync}; +pub use blocking::{HFBucketSync, HFClientSync, HFRepoSync, HFRepositorySync, HFSpaceSync}; pub use client::{HFClient, HFClientBuilder}; #[cfg(feature = "cli")] #[doc(hidden)] diff --git a/huggingface_hub/src/repository.rs 
b/huggingface_hub/src/repository.rs index 570d816..77539ce 100644 --- a/huggingface_hub/src/repository.rs +++ b/huggingface_hub/src/repository.rs @@ -360,12 +360,33 @@ pub struct SpaceVariableDeleteParams { pub key: String, } +/// Handle for operations on a single HuggingFace Storage Bucket. +/// +/// Obtain via [`HFClient::bucket`]. Every method adds `Authorization: Bearer ` +/// using the token configured on the client. +#[derive(Clone)] +pub struct HFBucket { + pub(crate) client: crate::HFClient, + pub namespace: String, + pub repo: String, +} + impl HFClient { /// Create an [`HFRepository`] handle for any repo type. pub fn repo(&self, repo_type: RepoType, owner: impl Into, name: impl Into) -> HFRepository { HFRepository::new(self.clone(), repo_type, owner, name) } + /// Creates a handle for operations on a single Storage Bucket. + /// No I/O is performed. + pub fn bucket(&self, namespace: impl Into, repo: impl Into) -> crate::repository::HFBucket { + crate::repository::HFBucket { + client: self.clone(), + namespace: namespace.into(), + repo: repo.into(), + } + } + /// Create an [`HFRepository`] handle for a model repository. 
pub fn model(&self, owner: impl Into, name: impl Into) -> HFRepository { self.repo(RepoType::Model, owner, name) diff --git a/huggingface_hub/src/types/buckets.rs b/huggingface_hub/src/types/buckets.rs new file mode 100644 index 0000000..0f9f6e5 --- /dev/null +++ b/huggingface_hub/src/types/buckets.rs @@ -0,0 +1,291 @@ +use serde::{Deserialize, Serialize}; +use typed_builder::TypedBuilder; + +// --- Parameter types --- + +#[derive(Debug, Clone, TypedBuilder, Serialize)] +pub struct CreateBucketParams { + #[builder(default, setter(strip_option))] + #[serde(skip_serializing_if = "Option::is_none")] + pub private: Option, + #[builder(default, setter(strip_option, into))] + #[serde(rename = "resourceGroupId", skip_serializing_if = "Option::is_none")] + pub resource_group_id: Option, + #[builder(default)] + #[serde(skip_serializing_if = "Vec::is_empty")] + pub cdn: Vec, +} + +#[derive(Debug, Clone, TypedBuilder, Serialize)] +pub struct UpdateBucketParams { + #[builder(default, setter(strip_option))] + #[serde(skip_serializing_if = "Option::is_none")] + pub private: Option, + #[builder(default, setter(strip_option))] + #[serde(rename = "cdnRegions", skip_serializing_if = "Option::is_none")] + pub cdn_regions: Option>, +} + +#[derive(Debug, Clone, TypedBuilder)] +pub struct ListTreeParams { + #[builder(default, setter(strip_option))] + pub limit: Option, + #[builder(default)] + pub recursive: bool, +} + +// --- Response types --- + +#[derive(Debug, Clone, Deserialize)] +pub struct BucketCreated { + pub url: String, + pub name: String, + pub id: String, +} + +#[derive(Debug, Clone, Deserialize)] +pub struct BucketInfo { + pub id: String, + pub name: String, + pub namespace: String, + pub private: bool, + #[serde(rename = "usedStorage")] + pub used_storage: u64, + #[serde(rename = "totalFiles")] + pub total_files: u64, + pub cdn: Vec, + pub region: String, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct CdnRegion { + pub provider: String, + pub region: 
String, +} + +#[derive(Debug, Clone, Deserialize)] +pub struct BucketOverview { + #[serde(rename = "_id")] + pub mongo_id: String, + pub id: String, + pub author: String, + pub private: Option, + #[serde(rename = "repoType")] + pub repo_type: String, + #[serde(rename = "createdAt")] + pub created_at: String, + #[serde(rename = "updatedAt")] + pub updated_at: String, + pub size: u64, + #[serde(rename = "totalFiles")] + pub total_files: u64, + #[serde(rename = "cdnRegions")] + pub cdn_regions: Vec, + #[serde(rename = "resourceGroup")] + pub resource_group: Option, +} + +#[derive(Debug, Clone, Deserialize)] +pub struct ResourceGroup { + pub id: String, + pub name: String, + #[serde(rename = "numUsers")] + pub num_users: Option, +} + +#[derive(Debug, Clone, Deserialize)] +pub struct XetToken { + pub token: String, + #[serde(rename = "casUrl")] + pub cas_url: String, + #[serde(rename = "expiresAt")] + pub expires_at: String, +} + +#[derive(Debug, Clone, Deserialize)] +pub struct PathInfo { + pub path: String, + pub size: u64, + #[serde(rename = "xetHash")] + pub xet_hash: String, + #[serde(rename = "contentType")] + pub content_type: String, + pub mtime: i64, +} + +#[derive(Debug, Clone, Deserialize)] +pub struct TreeEntry { + #[serde(rename = "type")] + pub entry_type: EntryType, + pub path: String, + pub size: Option, + #[serde(rename = "xetHash")] + pub xet_hash: Option, + #[serde(rename = "contentType")] + pub content_type: Option, +} + +#[derive(Debug, Clone, Deserialize)] +#[serde(rename_all = "lowercase")] +pub enum EntryType { + File, + Directory, +} + +// --- Batch types --- + +#[derive(Debug, Clone, Serialize)] +#[serde(tag = "type")] +pub enum BatchOp { + #[serde(rename = "addFile")] + AddFile(AddFileOp), + #[serde(rename = "deleteFile")] + DeleteFile(DeleteFileOp), +} + +#[derive(Debug, Clone, Serialize)] +pub struct AddFileOp { + pub path: String, + #[serde(rename = "xetHash")] + pub xet_hash: String, + #[serde(rename = "contentType")] + pub content_type: 
String, + #[serde(skip_serializing_if = "Option::is_none")] + pub mtime: Option, +} + +#[derive(Debug, Clone, Serialize)] +pub struct DeleteFileOp { + pub path: String, +} + +#[derive(Debug, Clone, Deserialize)] +pub struct BatchResult { + pub success: bool, + pub processed: u32, + pub succeeded: u32, + pub failed: Vec, +} + +#[derive(Debug, Clone, Deserialize)] +pub struct BatchFailure { + pub path: String, + pub error: String, +} + +// --- resolve_file types --- + +#[derive(Debug, Clone)] +pub struct ResolvedFile { + pub url: String, + pub size: Option, + pub xet_hash: Option, + pub etag: Option, + pub last_modified: Option, + pub xet_auth_url: Option, + pub xet_reconstruction_url: Option, +} + +// --- xet_resolve_file type (feature = "xet") --- + +#[cfg(feature = "xet")] +#[derive(Debug, Clone, Deserialize)] +pub struct XetFileInfo { + pub hash: String, + #[serde(rename = "refreshUrl")] + pub refresh_url: String, + #[serde(rename = "reconstructionUrl")] + pub reconstruction_url: String, + pub etag: String, + pub size: u64, + #[serde(rename = "contentType")] + pub content_type: String, +} + +// --- Internal pagination helper (not public) --- + +#[derive(Deserialize)] +pub(crate) struct TreePage { + pub entries: Vec, + #[serde(rename = "nextCursor")] + pub next_cursor: Option, +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn bucket_info_deserializes() { + let json = r#"{ + "id": "my-bucket", + "name": "my-bucket", + "namespace": "myuser", + "private": false, + "usedStorage": 1024, + "totalFiles": 3, + "cdn": [], + "region": "us-east-1" + }"#; + let info: BucketInfo = serde_json::from_str(json).unwrap(); + assert_eq!(info.used_storage, 1024); + assert_eq!(info.total_files, 3); + } + + #[test] + fn bucket_overview_deserializes() { + let json = r#"{ + "_id": "66079f1a2e4b3c001a2b3c4d", + "id": "myuser/my-bucket", + "author": "myuser", + "private": false, + "repoType": "bucket", + "createdAt": "2024-03-30T12:00:00.000Z", + "updatedAt": 
"2024-03-31T08:30:00.000Z", + "size": 104857600, + "totalFiles": 42, + "cdnRegions": [{"provider": "gcp", "region": "us"}], + "resourceGroup": {"id": "abc", "name": "ml-team", "numUsers": 5} + }"#; + let overview: BucketOverview = serde_json::from_str(json).unwrap(); + assert_eq!(overview.id, "myuser/my-bucket"); + assert_eq!(overview.total_files, 42); + assert_eq!(overview.resource_group.unwrap().name, "ml-team"); + } + + #[test] + fn batch_op_serializes_with_type_tag() { + let op = BatchOp::AddFile(AddFileOp { + path: "data/train.parquet".to_string(), + xet_hash: "abc123".to_string(), + content_type: "application/octet-stream".to_string(), + mtime: Some(1711900000), + }); + let s = serde_json::to_string(&op).unwrap(); + assert!(s.contains(r#""type":"addFile""#)); + assert!(s.contains(r#""xetHash":"abc123""#)); + } + + #[test] + fn delete_op_serializes_with_type_tag() { + let op = BatchOp::DeleteFile(DeleteFileOp { + path: "old.parquet".to_string(), + }); + let s = serde_json::to_string(&op).unwrap(); + assert!(s.contains(r#""type":"deleteFile""#)); + } + + #[test] + fn tree_entry_deserializes_file() { + let json = r#"{ + "type": "file", + "path": "data/train.parquet", + "size": 52428800, + "xetHash": "abc123", + "contentType": "application/octet-stream" + }"#; + let entry: TreeEntry = serde_json::from_str(json).unwrap(); + assert!(matches!(entry.entry_type, EntryType::File)); + assert_eq!(entry.xet_hash.unwrap(), "abc123"); + } +} diff --git a/huggingface_hub/src/types/mod.rs b/huggingface_hub/src/types/mod.rs index 86adfdb..3c02f3b 100644 --- a/huggingface_hub/src/types/mod.rs +++ b/huggingface_hub/src/types/mod.rs @@ -1,3 +1,4 @@ +pub mod buckets; pub mod cache; pub mod commit; pub mod params; @@ -7,6 +8,7 @@ pub mod user; #[cfg(feature = "spaces")] pub mod spaces; +pub use buckets::*; pub use commit::*; pub use params::*; pub use repo::*; diff --git a/huggingface_hub/tests/integration_test.rs b/huggingface_hub/tests/integration_test.rs index 49a0b7a..bb93f38 
100644 --- a/huggingface_hub/tests/integration_test.rs +++ b/huggingface_hub/tests/integration_test.rs @@ -806,3 +806,115 @@ async fn test_space_secrets_and_variables() { .build(); let _ = api.delete_repo(&delete_params).await; } + +// ---- HFBucket integration tests ---- + +fn test_bucket_name() -> String { + format!( + "test-bucket-{}", + std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap() + .as_millis() + ) +} + +#[tokio::test] +async fn test_list_buckets() { + let Some(api) = api() else { return }; + let username = cached_username().await; + let buckets: Vec<_> = api + .list_buckets(username) + .collect::>() + .await + .into_iter() + .collect::>>() + .expect("list_buckets failed"); + let _ = buckets; +} + +#[tokio::test] +async fn test_create_and_delete_bucket() { + let Some(api) = api() else { return }; + if !write_enabled() { + return; + } + let username = cached_username().await; + let name = test_bucket_name(); + + let created = api + .create_bucket(username, &name, huggingface_hub::CreateBucketParams::builder().private(true).build()) + .await + .expect("create_bucket failed"); + assert!(created.id.contains(&name)); + + let bucket = api.bucket(username, &name); + let info = bucket.get().await.expect("get failed"); + assert_eq!(info.name, name); + assert!(info.private); + + bucket + .update_settings(huggingface_hub::UpdateBucketParams::builder().private(false).build()) + .await + .expect("update_settings failed"); + + let info = bucket.get().await.unwrap(); + assert!(!info.private); + + bucket.delete().await.expect("delete failed"); + + assert!(matches!(bucket.get().await, Err(huggingface_hub::HFError::RepoNotFound { .. 
}))); +} + +#[tokio::test] +async fn test_bucket_list_tree_empty() { + let Some(api) = api() else { return }; + if !write_enabled() { + return; + } + let username = cached_username().await; + let name = test_bucket_name(); + + api.create_bucket(username, &name, huggingface_hub::CreateBucketParams::builder().build()) + .await + .expect("create_bucket failed"); + + let bucket = api.bucket(username, &name); + + let entries: Vec<_> = bucket + .list_tree("", huggingface_hub::ListTreeParams::builder().build()) + .collect::>() + .await + .into_iter() + .collect::>>() + .expect("list_tree failed"); + + assert!(entries.is_empty(), "new bucket should have no files"); + + bucket.delete().await.unwrap(); +} + +#[tokio::test] +async fn test_get_xet_write_and_read_token() { + let Some(api) = api() else { return }; + if !write_enabled() { + return; + } + let username = cached_username().await; + let name = test_bucket_name(); + + api.create_bucket(username, &name, huggingface_hub::CreateBucketParams::builder().build()) + .await + .unwrap(); + + let bucket = api.bucket(username, &name); + + let write_tok = bucket.get_xet_write_token().await.expect("xet write token failed"); + assert!(!write_tok.token.is_empty()); + assert!(!write_tok.cas_url.is_empty()); + + let read_tok = bucket.get_xet_read_token().await.expect("xet read token failed"); + assert!(!read_tok.token.is_empty()); + + bucket.delete().await.unwrap(); +} From 805e6e450c3274a0b3248fe5aefe7da8466285e1 Mon Sep 17 00:00:00 2001 From: Joseph Godlewski Date: Wed, 8 Apr 2026 16:18:10 -0700 Subject: [PATCH 2/5] Fixing tests and schema --- huggingface_hub/src/api/buckets.rs | 66 +++++++------ huggingface_hub/src/bin/hfrs/main.rs | 14 ++- huggingface_hub/src/blocking.rs | 4 +- huggingface_hub/src/error.rs | 3 - huggingface_hub/src/pagination.rs | 2 +- huggingface_hub/src/types/buckets.rs | 115 ++-------------------- huggingface_hub/tests/integration_test.rs | 31 +++--- 7 files changed, 74 insertions(+), 161 deletions(-) diff 
--git a/huggingface_hub/src/api/buckets.rs b/huggingface_hub/src/api/buckets.rs index 89e5c2b..ea2bcfd 100644 --- a/huggingface_hub/src/api/buckets.rs +++ b/huggingface_hub/src/api/buckets.rs @@ -1,12 +1,14 @@ use std::collections::VecDeque; use futures::Stream; +use url::Url; use crate::error::{HFError, NotFoundContext}; +use crate::pagination::parse_link_header_next; use crate::repository::HFBucket; use crate::types::{ - BatchOp, BatchResult, BucketCreated, BucketInfo, BucketOverview, CreateBucketParams, ListTreeParams, PathInfo, - ResolvedFile, TreeEntry, TreePage, UpdateBucketParams, XetToken, + BatchOp, BatchResult, BucketCreated, BucketOverview, CreateBucketParams, ListTreeParams, PathInfo, ResolvedFile, + TreeEntry, UpdateBucketParams, XetToken, }; use crate::{HFClient, Result}; @@ -38,7 +40,6 @@ pub(crate) async fn check_bucket_response( }, 409 => HFError::Conflict(body), 429 => HFError::RateLimited, - 507 => HFError::QuotaExceeded, _ => HFError::Http { status, url, body }, }) } @@ -53,7 +54,7 @@ impl HFBucket { } /// Returns metadata about this bucket. - pub async fn get(&self) -> Result { + pub async fn get(&self) -> Result { let resp = self .client .inner @@ -128,42 +129,49 @@ impl HFBucket { /// Uses cursor-in-body pagination: the stream fetches the next page automatically /// when the current page's entries are exhausted. No request is made until the /// first item is polled. 
- pub fn list_tree(&self, path: &str, params: ListTreeParams) -> impl Stream> + '_ { + pub fn list_tree(&self, path: &str, params: ListTreeParams) -> Result> + '_> { let base_url = if path.is_empty() { format!("{}/api/buckets/{}/{}/tree", self.client.inner.endpoint, self.namespace, self.repo) } else { format!("{}/api/buckets/{}/{}/tree/{}", self.client.inner.endpoint, self.namespace, self.repo, path) }; let repo_id = self.repo_id(); + let mut initial_url = Url::parse(&base_url)?; + { + let mut qp = initial_url.query_pairs_mut(); + if let Some(l) = params.limit { + qp.append_pair("limit", l.to_string().as_str()); + } + if params.recursive { + qp.append_pair("recursive", "true"); + } + qp.finish(); + } - futures::stream::try_unfold( - (VecDeque::::new(), None::, false), - move |(mut pending, cursor, fetched)| { + Ok(futures::stream::try_unfold( + (VecDeque::::new(), Some(initial_url), false), + move |(mut pending, next_url, fetched)| { let client = self.client.clone(); let repo_id = repo_id.clone(); - let base_url = base_url.clone(); async move { if let Some(entry) = pending.pop_front() { - return Ok(Some((entry, (pending, cursor, fetched)))); - } - if fetched && cursor.is_none() { - return Ok(None); - } - let mut req = client.inner.client.get(&base_url).headers(client.auth_headers()); - if let Some(ref c) = cursor { - req = req.query(&[("cursor", c.as_str())]); - } - if let Some(l) = params.limit { - req = req.query(&[("limit", l.to_string().as_str())]); - } - if params.recursive { - req = req.query(&[("recursive", "true")]); + return Ok(Some((entry, (pending, next_url, fetched)))); } + let url = match next_url { + Some(url) => url, + None if fetched => return Ok(None), + None => { + // if !fetched + return Err(HFError::Other("Initial list Url not set".to_string())); + }, + }; + let req = client.inner.client.get(url).headers(client.auth_headers()); let resp = req.send().await?; let resp = check_bucket_response(resp, &repo_id, NotFoundContext::Repo).await?; - let 
page: TreePage = resp.json().await?; - let next_cursor = page.next_cursor; - pending.extend(page.entries); + let next_cursor = parse_link_header_next(resp.headers()); + let entries: Vec = resp.json().await?; + + pending.extend(entries); if let Some(entry) = pending.pop_front() { Ok(Some((entry, (pending, next_cursor, true)))) } else { @@ -171,7 +179,7 @@ impl HFBucket { } } }, - ) + )) } /// Returns metadata for a batch of file paths. @@ -347,8 +355,8 @@ impl HFClient { /// Returns a paginated stream of all buckets owned by `namespace`. /// Pagination is driven by `Link` response headers. - pub fn list_buckets(&self, namespace: &str) -> impl futures::Stream> + '_ { - let url = url::Url::parse(&format!("{}/api/buckets/{}", self.inner.endpoint, namespace)) + pub fn list_buckets(&self, namespace: &str) -> impl Stream> + '_ { + let url = Url::parse(&format!("{}/api/buckets/{}", self.inner.endpoint, namespace)) .expect("endpoint is a valid base URL"); self.paginate(url, vec![], None) } diff --git a/huggingface_hub/src/bin/hfrs/main.rs b/huggingface_hub/src/bin/hfrs/main.rs index 3ae2b97..1c9004f 100644 --- a/huggingface_hub/src/bin/hfrs/main.rs +++ b/huggingface_hub/src/bin/hfrs/main.rs @@ -148,6 +148,17 @@ fn format_hf_error(err: &HFError) -> String { HFError::AuthRequired => { "Not authenticated. Run `hfrs auth login` or set the HF_TOKEN environment variable.".to_string() }, + HFError::Forbidden => { + "Permission denied. Check that your token has the required scopes for this operation.".to_string() + }, + HFError::Conflict(body) => { + if body.contains("already exists") { + "Resource already exists. Use --exist-ok to skip this error.".to_string() + } else { + format!("Conflict: {body}") + } + }, + HFError::RateLimited => "Rate limited. 
Please wait a moment and try again.".to_string(), HFError::Http { status, url, body } => { let status_code = status.as_u16(); match status_code { @@ -158,9 +169,6 @@ fn format_hf_error(err: &HFError) -> String { } msg }, - 403 => { - "Permission denied. Check that your token has the required scopes for this operation.".to_string() - }, 404 => { format!("Not found: {url}") }, diff --git a/huggingface_hub/src/blocking.rs b/huggingface_hub/src/blocking.rs index 9881792..4283d10 100644 --- a/huggingface_hub/src/blocking.rs +++ b/huggingface_hub/src/blocking.rs @@ -344,7 +344,7 @@ impl HFRepositorySync { } impl HFBucketSync { - pub fn get(&self) -> Result { + pub fn get(&self) -> Result { self.runtime.block_on(self.inner.get()) } @@ -361,7 +361,7 @@ impl HFBucketSync { } pub fn list_tree(&self, path: &str, params: crate::types::ListTreeParams) -> Result> { - collect_stream(self.runtime.as_ref(), self.inner.list_tree(path, params)) + collect_stream(self.runtime.as_ref(), self.inner.list_tree(path, params)?) } pub fn get_paths_info(&self, paths: Vec) -> Result> { diff --git a/huggingface_hub/src/error.rs b/huggingface_hub/src/error.rs index f83108f..83cf5c8 100644 --- a/huggingface_hub/src/error.rs +++ b/huggingface_hub/src/error.rs @@ -68,8 +68,6 @@ pub enum HFError { Conflict(String), #[error("rate limited")] RateLimited, - #[error("quota exceeded")] - QuotaExceeded, } #[cfg(test)] @@ -81,7 +79,6 @@ mod tests { assert_eq!(HFError::Forbidden.to_string(), "forbidden"); assert_eq!(HFError::Conflict("name taken".to_string()).to_string(), "conflict: name taken"); assert_eq!(HFError::RateLimited.to_string(), "rate limited"); - assert_eq!(HFError::QuotaExceeded.to_string(), "quota exceeded"); } } diff --git a/huggingface_hub/src/pagination.rs b/huggingface_hub/src/pagination.rs index b30e8ae..c003a8b 100644 --- a/huggingface_hub/src/pagination.rs +++ b/huggingface_hub/src/pagination.rs @@ -107,7 +107,7 @@ impl HFClient { /// Parse the `Link` header for a `rel="next"` URL. 
/// Format: `; rel="next"` -fn parse_link_header_next(headers: &HeaderMap) -> Option { +pub(crate) fn parse_link_header_next(headers: &HeaderMap) -> Option { let link_header = headers.get("link")?.to_str().ok()?; for part in link_header.split(',') { diff --git a/huggingface_hub/src/types/buckets.rs b/huggingface_hub/src/types/buckets.rs index 0f9f6e5..4506176 100644 --- a/huggingface_hub/src/types/buckets.rs +++ b/huggingface_hub/src/types/buckets.rs @@ -43,20 +43,6 @@ pub struct BucketCreated { pub id: String, } -#[derive(Debug, Clone, Deserialize)] -pub struct BucketInfo { - pub id: String, - pub name: String, - pub namespace: String, - pub private: bool, - #[serde(rename = "usedStorage")] - pub used_storage: u64, - #[serde(rename = "totalFiles")] - pub total_files: u64, - pub cdn: Vec, - pub region: String, -} - #[derive(Debug, Clone, Serialize, Deserialize)] pub struct CdnRegion { pub provider: String, @@ -95,11 +81,13 @@ pub struct ResourceGroup { #[derive(Debug, Clone, Deserialize)] pub struct XetToken { - pub token: String, + #[serde(rename = "accessToken")] + pub access_token: String, #[serde(rename = "casUrl")] pub cas_url: String, - #[serde(rename = "expiresAt")] - pub expires_at: String, + /// Epoch time (s) + #[serde(rename = "exp")] + pub expires_at: u64, } #[derive(Debug, Clone, Deserialize)] @@ -118,6 +106,11 @@ pub struct TreeEntry { #[serde(rename = "type")] pub entry_type: EntryType, pub path: String, + /// ISO 8601 Datetime + #[serde(rename = "uploadedAt")] + pub uploaded_at: String, + /// ISO 8601 Datetime + pub mtime: Option, pub size: Option, #[serde(rename = "xetHash")] pub xet_hash: Option, @@ -201,91 +194,3 @@ pub struct XetFileInfo { #[serde(rename = "contentType")] pub content_type: String, } - -// --- Internal pagination helper (not public) --- - -#[derive(Deserialize)] -pub(crate) struct TreePage { - pub entries: Vec, - #[serde(rename = "nextCursor")] - pub next_cursor: Option, -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] 
- fn bucket_info_deserializes() { - let json = r#"{ - "id": "my-bucket", - "name": "my-bucket", - "namespace": "myuser", - "private": false, - "usedStorage": 1024, - "totalFiles": 3, - "cdn": [], - "region": "us-east-1" - }"#; - let info: BucketInfo = serde_json::from_str(json).unwrap(); - assert_eq!(info.used_storage, 1024); - assert_eq!(info.total_files, 3); - } - - #[test] - fn bucket_overview_deserializes() { - let json = r#"{ - "_id": "66079f1a2e4b3c001a2b3c4d", - "id": "myuser/my-bucket", - "author": "myuser", - "private": false, - "repoType": "bucket", - "createdAt": "2024-03-30T12:00:00.000Z", - "updatedAt": "2024-03-31T08:30:00.000Z", - "size": 104857600, - "totalFiles": 42, - "cdnRegions": [{"provider": "gcp", "region": "us"}], - "resourceGroup": {"id": "abc", "name": "ml-team", "numUsers": 5} - }"#; - let overview: BucketOverview = serde_json::from_str(json).unwrap(); - assert_eq!(overview.id, "myuser/my-bucket"); - assert_eq!(overview.total_files, 42); - assert_eq!(overview.resource_group.unwrap().name, "ml-team"); - } - - #[test] - fn batch_op_serializes_with_type_tag() { - let op = BatchOp::AddFile(AddFileOp { - path: "data/train.parquet".to_string(), - xet_hash: "abc123".to_string(), - content_type: "application/octet-stream".to_string(), - mtime: Some(1711900000), - }); - let s = serde_json::to_string(&op).unwrap(); - assert!(s.contains(r#""type":"addFile""#)); - assert!(s.contains(r#""xetHash":"abc123""#)); - } - - #[test] - fn delete_op_serializes_with_type_tag() { - let op = BatchOp::DeleteFile(DeleteFileOp { - path: "old.parquet".to_string(), - }); - let s = serde_json::to_string(&op).unwrap(); - assert!(s.contains(r#""type":"deleteFile""#)); - } - - #[test] - fn tree_entry_deserializes_file() { - let json = r#"{ - "type": "file", - "path": "data/train.parquet", - "size": 52428800, - "xetHash": "abc123", - "contentType": "application/octet-stream" - }"#; - let entry: TreeEntry = serde_json::from_str(json).unwrap(); - 
assert!(matches!(entry.entry_type, EntryType::File)); - assert_eq!(entry.xet_hash.unwrap(), "abc123"); - } -} diff --git a/huggingface_hub/tests/integration_test.rs b/huggingface_hub/tests/integration_test.rs index bb93f38..9aad475 100644 --- a/huggingface_hub/tests/integration_test.rs +++ b/huggingface_hub/tests/integration_test.rs @@ -810,13 +810,7 @@ async fn test_space_secrets_and_variables() { // ---- HFBucket integration tests ---- fn test_bucket_name() -> String { - format!( - "test-bucket-{}", - std::time::SystemTime::now() - .duration_since(std::time::UNIX_EPOCH) - .unwrap() - .as_millis() - ) + format!("test-bucket-{}", uuid_v4_short()) } #[tokio::test] @@ -843,23 +837,23 @@ async fn test_create_and_delete_bucket() { let name = test_bucket_name(); let created = api - .create_bucket(username, &name, huggingface_hub::CreateBucketParams::builder().private(true).build()) + .create_bucket(username, &name, CreateBucketParams::builder().private(true).build()) .await .expect("create_bucket failed"); - assert!(created.id.contains(&name)); + assert!(created.name.contains(&name)); let bucket = api.bucket(username, &name); let info = bucket.get().await.expect("get failed"); - assert_eq!(info.name, name); - assert!(info.private); + assert_eq!(info.id, format!("{username}/{name}")); + assert!(info.private.unwrap()); bucket - .update_settings(huggingface_hub::UpdateBucketParams::builder().private(false).build()) + .update_settings(UpdateBucketParams::builder().private(false).build()) .await .expect("update_settings failed"); let info = bucket.get().await.unwrap(); - assert!(!info.private); + assert!(!info.private.unwrap()); bucket.delete().await.expect("delete failed"); @@ -875,14 +869,15 @@ async fn test_bucket_list_tree_empty() { let username = cached_username().await; let name = test_bucket_name(); - api.create_bucket(username, &name, huggingface_hub::CreateBucketParams::builder().build()) + api.create_bucket(username, &name, CreateBucketParams::builder().build()) 
.await .expect("create_bucket failed"); let bucket = api.bucket(username, &name); let entries: Vec<_> = bucket - .list_tree("", huggingface_hub::ListTreeParams::builder().build()) + .list_tree("", ListTreeParams::builder().build()) + .unwrap() .collect::>() .await .into_iter() @@ -903,18 +898,18 @@ async fn test_get_xet_write_and_read_token() { let username = cached_username().await; let name = test_bucket_name(); - api.create_bucket(username, &name, huggingface_hub::CreateBucketParams::builder().build()) + api.create_bucket(username, &name, CreateBucketParams::builder().build()) .await .unwrap(); let bucket = api.bucket(username, &name); let write_tok = bucket.get_xet_write_token().await.expect("xet write token failed"); - assert!(!write_tok.token.is_empty()); + assert!(!write_tok.access_token.is_empty()); assert!(!write_tok.cas_url.is_empty()); let read_tok = bucket.get_xet_read_token().await.expect("xet read token failed"); - assert!(!read_tok.token.is_empty()); + assert!(!read_tok.access_token.is_empty()); bucket.delete().await.unwrap(); } From eb32937ce28af5e2dbb84ca36afe9683df17adc6 Mon Sep 17 00:00:00 2001 From: Joseph Godlewski Date: Wed, 8 Apr 2026 16:32:24 -0700 Subject: [PATCH 3/5] rm plans --- .../plans/2026-04-08-hf-bucket-rust-client.md | 2015 ----------------- ...2026-04-08-hf-bucket-rust-client-design.md | 437 ---- 2 files changed, 2452 deletions(-) delete mode 100644 docs/plans/2026-04-08-hf-bucket-rust-client.md delete mode 100644 docs/specs/2026-04-08-hf-bucket-rust-client-design.md diff --git a/docs/plans/2026-04-08-hf-bucket-rust-client.md b/docs/plans/2026-04-08-hf-bucket-rust-client.md deleted file mode 100644 index 2da48b1..0000000 --- a/docs/plans/2026-04-08-hf-bucket-rust-client.md +++ /dev/null @@ -1,2015 +0,0 @@ -# HFBucket Rust Client Implementation Plan - -> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. 
Steps use checkbox (`- [ ]`) syntax for tracking. - -**Goal:** Add an `HFBucket` type and supporting infrastructure to `huggingface_hub_rust` that exposes the full HuggingFace Storage Buckets API as a typed async Rust client. - -**Architecture:** `HFBucket` is a standalone handle type (following the `HFSpace` precedent) holding an `HFClient` reference plus namespace and repo strings. Bucket methods are implemented in `api/buckets.rs`; types live in `types/buckets.rs`. A private `check_bucket_response` helper maps HTTP status codes — including four new `HFError` variants — for all bucket endpoints. - -**Tech Stack:** Rust, `reqwest` 0.13, `serde`/`serde_json`, `typed-builder`, `futures` (`try_unfold`), `tokio` - -**Spec:** `docs/specs/2026-04-08-hf-bucket-rust-client-design.md` in `huggingface/xet-catalogue` -**Target repo:** `/Users/jgodlew/git/huggingface/huggingface_hub_rust/` - ---- - -## File Map - -| Action | Path | -|--------|------| -| Create | `huggingface_hub/src/types/buckets.rs` | -| Create | `huggingface_hub/src/api/buckets.rs` | -| Modify | `huggingface_hub/src/error.rs` — add 4 new `HFError` variants | -| Modify | `huggingface_hub/src/types/mod.rs` — add `pub mod buckets; pub use buckets::*;` | -| Modify | `huggingface_hub/src/api/mod.rs` — add `pub mod buckets;` | -| Modify | `huggingface_hub/src/repository.rs` — add `HFBucket` struct | -| Modify | `huggingface_hub/src/client.rs` — add `bucket()`, `create_bucket()`, `list_buckets()` | -| Modify | `huggingface_hub/src/lib.rs` — export `HFBucket` and `HFBucketSync` | -| Modify | `huggingface_hub/src/blocking.rs` — add `HFBucketSync` and blocking wrappers | -| Modify | `huggingface_hub/tests/integration_test.rs` — add integration tests | - ---- - -## Task 1: Add new `HFError` variants - -**Files:** -- Modify: `huggingface_hub/src/error.rs` - -- [ ] **Step 1: Write a failing test that matches on the new variants** - -Add to the bottom of `huggingface_hub/src/error.rs`: - -```rust -#[cfg(test)] -mod tests 
{ - use super::*; - - #[test] - fn new_error_variants_display() { - assert_eq!(HFError::Forbidden.to_string(), "forbidden"); - assert_eq!( - HFError::Conflict("name taken".to_string()).to_string(), - "conflict: name taken" - ); - assert_eq!(HFError::RateLimited.to_string(), "rate limited"); - assert_eq!(HFError::QuotaExceeded.to_string(), "quota exceeded"); - } -} -``` - -- [ ] **Step 2: Run the test to confirm it fails** - -```bash -cd /Users/jgodlew/git/huggingface/huggingface_hub_rust -cargo test -p huggingface_hub new_error_variants_display 2>&1 -``` - -Expected: compile error — `HFError::Forbidden` does not exist. - -- [ ] **Step 3: Add the four variants to `HFError`** - -In `huggingface_hub/src/error.rs`, locate the `HFError` enum and add after the last existing variant (before the closing `}`): - -```rust - #[error("forbidden")] - Forbidden, - #[error("conflict: {0}")] - Conflict(String), - #[error("rate limited")] - RateLimited, - #[error("quota exceeded")] - QuotaExceeded, -``` - -- [ ] **Step 4: Run the test to confirm it passes** - -```bash -cargo test -p huggingface_hub new_error_variants_display 2>&1 -``` - -Expected: `test error::tests::new_error_variants_display ... 
ok` - -- [ ] **Step 5: Commit** - -```bash -cd /Users/jgodlew/git/huggingface/huggingface_hub_rust -git add huggingface_hub/src/error.rs -git commit -m "feat(error): add Forbidden, Conflict, RateLimited, QuotaExceeded variants" -``` - ---- - -## Task 2: Create `types/buckets.rs` and wire it in - -**Files:** -- Create: `huggingface_hub/src/types/buckets.rs` -- Modify: `huggingface_hub/src/types/mod.rs` - -- [ ] **Step 1: Write a failing test for type deserialization** - -Create `huggingface_hub/src/types/buckets.rs` with only the test module: - -```rust -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn bucket_info_deserializes() { - let json = r#"{ - "id": "my-bucket", - "name": "my-bucket", - "namespace": "myuser", - "private": false, - "usedStorage": 1024, - "totalFiles": 3, - "cdn": [], - "region": "us-east-1" - }"#; - let info: BucketInfo = serde_json::from_str(json).unwrap(); - assert_eq!(info.used_storage, 1024); - assert_eq!(info.total_files, 3); - } - - #[test] - fn bucket_overview_deserializes() { - let json = r#"{ - "_id": "66079f1a2e4b3c001a2b3c4d", - "id": "myuser/my-bucket", - "author": "myuser", - "private": false, - "repoType": "bucket", - "createdAt": "2024-03-30T12:00:00.000Z", - "updatedAt": "2024-03-31T08:30:00.000Z", - "size": 104857600, - "totalFiles": 42, - "cdnRegions": [{"provider": "gcp", "region": "us"}], - "resourceGroup": {"id": "abc", "name": "ml-team", "numUsers": 5} - }"#; - let overview: BucketOverview = serde_json::from_str(json).unwrap(); - assert_eq!(overview.id, "myuser/my-bucket"); - assert_eq!(overview.total_files, 42); - assert_eq!(overview.resource_group.unwrap().name, "ml-team"); - } - - #[test] - fn batch_op_serializes_with_type_tag() { - let op = BatchOp::AddFile(AddFileOp { - path: "data/train.parquet".to_string(), - xet_hash: "abc123".to_string(), - content_type: "application/octet-stream".to_string(), - mtime: Some(1711900000), - }); - let s = serde_json::to_string(&op).unwrap(); - 
assert!(s.contains(r#""type":"addFile""#)); - assert!(s.contains(r#""xetHash":"abc123""#)); - } - - #[test] - fn delete_op_serializes_with_type_tag() { - let op = BatchOp::DeleteFile(DeleteFileOp { - path: "old.parquet".to_string(), - }); - let s = serde_json::to_string(&op).unwrap(); - assert!(s.contains(r#""type":"deleteFile""#)); - } - - #[test] - fn tree_entry_deserializes_file() { - let json = r#"{ - "type": "file", - "path": "data/train.parquet", - "size": 52428800, - "xetHash": "abc123", - "contentType": "application/octet-stream" - }"#; - let entry: TreeEntry = serde_json::from_str(json).unwrap(); - assert!(matches!(entry.entry_type, EntryType::File)); - assert_eq!(entry.xet_hash.unwrap(), "abc123"); - } -} -``` - -- [ ] **Step 2: Run the test to confirm it fails** - -```bash -cargo test -p huggingface_hub bucket_info_deserializes 2>&1 -``` - -Expected: compile error — `BucketInfo` not found. - -- [ ] **Step 3: Add all types above the test module in `types/buckets.rs`** - -Replace the contents of `huggingface_hub/src/types/buckets.rs` with: - -```rust -use serde::{Deserialize, Serialize}; -use typed_builder::TypedBuilder; - -// --- Parameter types --- - -#[derive(Debug, Clone, TypedBuilder, Serialize)] -pub struct CreateBucketParams { - #[builder(default, setter(strip_option))] - #[serde(skip_serializing_if = "Option::is_none")] - pub private: Option, - #[builder(default, setter(strip_option, into))] - #[serde(rename = "resourceGroupId", skip_serializing_if = "Option::is_none")] - pub resource_group_id: Option, - #[builder(default)] - #[serde(skip_serializing_if = "Vec::is_empty")] - pub cdn: Vec, -} - -#[derive(Debug, Clone, TypedBuilder, Serialize)] -pub struct UpdateBucketParams { - #[builder(default, setter(strip_option))] - #[serde(skip_serializing_if = "Option::is_none")] - pub private: Option, - #[builder(default, setter(strip_option))] - #[serde(rename = "cdnRegions", skip_serializing_if = "Option::is_none")] - pub cdn_regions: Option>, -} - 
-#[derive(Debug, Clone, TypedBuilder)] -pub struct ListTreeParams { - #[builder(default, setter(strip_option))] - pub limit: Option, - #[builder(default)] - pub recursive: bool, -} - -// --- Response types --- - -#[derive(Debug, Clone, Deserialize)] -pub struct BucketCreated { - pub url: String, - pub name: String, - pub id: String, -} - -#[derive(Debug, Clone, Deserialize)] -pub struct BucketInfo { - pub id: String, - pub name: String, - pub namespace: String, - pub private: bool, - #[serde(rename = "usedStorage")] - pub used_storage: u64, - #[serde(rename = "totalFiles")] - pub total_files: u64, - pub cdn: Vec, - pub region: String, -} - -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct CdnRegion { - pub provider: String, - pub region: String, -} - -#[derive(Debug, Clone, Deserialize)] -pub struct BucketOverview { - #[serde(rename = "_id")] - pub mongo_id: String, - pub id: String, - pub author: String, - pub private: Option, - #[serde(rename = "repoType")] - pub repo_type: String, - #[serde(rename = "createdAt")] - pub created_at: String, - #[serde(rename = "updatedAt")] - pub updated_at: String, - pub size: u64, - #[serde(rename = "totalFiles")] - pub total_files: u64, - #[serde(rename = "cdnRegions")] - pub cdn_regions: Vec, - #[serde(rename = "resourceGroup")] - pub resource_group: Option, -} - -#[derive(Debug, Clone, Deserialize)] -pub struct ResourceGroup { - pub id: String, - pub name: String, - #[serde(rename = "numUsers")] - pub num_users: Option, -} - -#[derive(Debug, Clone, Deserialize)] -pub struct XetToken { - pub token: String, - #[serde(rename = "casUrl")] - pub cas_url: String, - #[serde(rename = "expiresAt")] - pub expires_at: String, -} - -#[derive(Debug, Clone, Deserialize)] -pub struct PathInfo { - pub path: String, - pub size: u64, - #[serde(rename = "xetHash")] - pub xet_hash: String, - #[serde(rename = "contentType")] - pub content_type: String, - pub mtime: i64, -} - -#[derive(Debug, Clone, Deserialize)] -pub struct TreeEntry { 
- #[serde(rename = "type")] - pub entry_type: EntryType, - pub path: String, - pub size: Option, - #[serde(rename = "xetHash")] - pub xet_hash: Option, - #[serde(rename = "contentType")] - pub content_type: Option, -} - -#[derive(Debug, Clone, Deserialize)] -#[serde(rename_all = "lowercase")] -pub enum EntryType { - File, - Directory, -} - -// --- Batch types --- - -#[derive(Debug, Clone, Serialize)] -#[serde(tag = "type")] -pub enum BatchOp { - #[serde(rename = "addFile")] - AddFile(AddFileOp), - #[serde(rename = "deleteFile")] - DeleteFile(DeleteFileOp), -} - -#[derive(Debug, Clone, Serialize)] -pub struct AddFileOp { - pub path: String, - #[serde(rename = "xetHash")] - pub xet_hash: String, - #[serde(rename = "contentType")] - pub content_type: String, - #[serde(skip_serializing_if = "Option::is_none")] - pub mtime: Option, -} - -#[derive(Debug, Clone, Serialize)] -pub struct DeleteFileOp { - pub path: String, -} - -#[derive(Debug, Clone, Deserialize)] -pub struct BatchResult { - pub success: bool, - pub processed: u32, - pub succeeded: u32, - pub failed: Vec, -} - -#[derive(Debug, Clone, Deserialize)] -pub struct BatchFailure { - pub path: String, - pub error: String, -} - -// --- resolve_file types --- - -#[derive(Debug, Clone)] -pub struct ResolvedFile { - pub url: String, - pub size: Option, - pub xet_hash: Option, - pub etag: Option, - pub last_modified: Option, - pub xet_auth_url: Option, - pub xet_reconstruction_url: Option, -} - -// --- xet_resolve_file type (feature = "xet") --- - -#[cfg(feature = "xet")] -#[derive(Debug, Clone, Deserialize)] -pub struct XetFileInfo { - pub hash: String, - #[serde(rename = "refreshUrl")] - pub refresh_url: String, - #[serde(rename = "reconstructionUrl")] - pub reconstruction_url: String, - pub etag: String, - pub size: u64, - #[serde(rename = "contentType")] - pub content_type: String, -} - -// --- Internal pagination helper (not public) --- - -#[derive(Deserialize)] -pub(crate) struct TreePage { - pub entries: Vec, - 
#[serde(rename = "nextCursor")] - pub next_cursor: Option, -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn bucket_info_deserializes() { - let json = r#"{ - "id": "my-bucket", - "name": "my-bucket", - "namespace": "myuser", - "private": false, - "usedStorage": 1024, - "totalFiles": 3, - "cdn": [], - "region": "us-east-1" - }"#; - let info: BucketInfo = serde_json::from_str(json).unwrap(); - assert_eq!(info.used_storage, 1024); - assert_eq!(info.total_files, 3); - } - - #[test] - fn bucket_overview_deserializes() { - let json = r#"{ - "_id": "66079f1a2e4b3c001a2b3c4d", - "id": "myuser/my-bucket", - "author": "myuser", - "private": false, - "repoType": "bucket", - "createdAt": "2024-03-30T12:00:00.000Z", - "updatedAt": "2024-03-31T08:30:00.000Z", - "size": 104857600, - "totalFiles": 42, - "cdnRegions": [{"provider": "gcp", "region": "us"}], - "resourceGroup": {"id": "abc", "name": "ml-team", "numUsers": 5} - }"#; - let overview: BucketOverview = serde_json::from_str(json).unwrap(); - assert_eq!(overview.id, "myuser/my-bucket"); - assert_eq!(overview.total_files, 42); - assert_eq!(overview.resource_group.unwrap().name, "ml-team"); - } - - #[test] - fn batch_op_serializes_with_type_tag() { - let op = BatchOp::AddFile(AddFileOp { - path: "data/train.parquet".to_string(), - xet_hash: "abc123".to_string(), - content_type: "application/octet-stream".to_string(), - mtime: Some(1711900000), - }); - let s = serde_json::to_string(&op).unwrap(); - assert!(s.contains(r#""type":"addFile""#)); - assert!(s.contains(r#""xetHash":"abc123""#)); - } - - #[test] - fn delete_op_serializes_with_type_tag() { - let op = BatchOp::DeleteFile(DeleteFileOp { - path: "old.parquet".to_string(), - }); - let s = serde_json::to_string(&op).unwrap(); - assert!(s.contains(r#""type":"deleteFile""#)); - } - - #[test] - fn tree_entry_deserializes_file() { - let json = r#"{ - "type": "file", - "path": "data/train.parquet", - "size": 52428800, - "xetHash": "abc123", - "contentType": 
"application/octet-stream" - }"#; - let entry: TreeEntry = serde_json::from_str(json).unwrap(); - assert!(matches!(entry.entry_type, EntryType::File)); - assert_eq!(entry.xet_hash.unwrap(), "abc123"); - } -} -``` - -- [ ] **Step 4: Wire into `types/mod.rs`** - -In `huggingface_hub/src/types/mod.rs`, add alongside the existing module declarations: - -```rust -pub mod buckets; -``` - -And add to the re-exports at the bottom: - -```rust -pub use buckets::*; -``` - -- [ ] **Step 5: Run the tests to confirm they pass** - -```bash -cargo test -p huggingface_hub bucket_info_deserializes batch_op_serializes tree_entry_deserializes bucket_overview_deserializes delete_op_serializes 2>&1 -``` - -Expected: all 5 tests pass. - -- [ ] **Step 6: Commit** - -```bash -git add huggingface_hub/src/types/buckets.rs huggingface_hub/src/types/mod.rs -git commit -m "feat(types): add bucket types (BucketInfo, BucketOverview, BatchOp, TreeEntry, etc.)" -``` - ---- - -## Task 3: Add `HFBucket` struct and wire up modules - -**Files:** -- Modify: `huggingface_hub/src/repository.rs` -- Create (skeleton): `huggingface_hub/src/api/buckets.rs` -- Modify: `huggingface_hub/src/api/mod.rs` -- Modify: `huggingface_hub/src/client.rs` -- Modify: `huggingface_hub/src/lib.rs` - -- [ ] **Step 1: Write a failing test for `HFClient::bucket()` constructor** - -In `huggingface_hub/src/api/buckets.rs` (new file, skeleton only for now): - -```rust -#[cfg(test)] -mod tests { - use crate::HFClientBuilder; - - #[test] - fn bucket_constructor_sets_namespace_and_repo() { - let client = HFClientBuilder::new().build().unwrap(); - let bucket = client.bucket("myuser", "my-bucket"); - assert_eq!(bucket.namespace, "myuser"); - assert_eq!(bucket.repo, "my-bucket"); - } -} -``` - -- [ ] **Step 2: Run the test to confirm it fails** - -```bash -cargo test -p huggingface_hub bucket_constructor_sets_namespace_and_repo 2>&1 -``` - -Expected: compile error — `client.bucket` does not exist. 
- -- [ ] **Step 3: Add `HFBucket` struct to `repository.rs`** - -In `huggingface_hub/src/repository.rs`, add after the `HFSpace` struct definition (or after `HFRepository`, following the same pattern): - -```rust -/// Handle for operations on a single HuggingFace Storage Bucket. -/// -/// Obtain via [`HFClient::bucket`]. Every method adds `Authorization: Bearer <token>` -/// using the token configured on the client. -#[derive(Clone)] -pub struct HFBucket { - pub(crate) client: crate::HFClient, - pub namespace: String, - pub repo: String, -} -``` - -- [ ] **Step 4: Add `HFClient::bucket()` to `client.rs`** - -In `huggingface_hub/src/client.rs`, add alongside the existing `model()`, `dataset()`, `space()` methods: - -```rust -/// Creates a handle for operations on a single Storage Bucket. -/// No I/O is performed. -pub fn bucket(&self, namespace: impl Into<String>, repo: impl Into<String>) -> crate::repository::HFBucket { - crate::repository::HFBucket { - client: self.clone(), - namespace: namespace.into(), - repo: repo.into(), - } -} -``` - -- [ ] **Step 5: Wire `api/buckets.rs` into `api/mod.rs`** - -In `huggingface_hub/src/api/mod.rs`, add: - -```rust -pub mod buckets; -``` - -- [ ] **Step 6: Export `HFBucket` from `lib.rs`** - -In `huggingface_hub/src/lib.rs`, ensure `HFBucket` is included in the `repository` re-export. It will be exported automatically if `lib.rs` already has `pub use repository::*;`. Verify this line exists; if not, add it. - -- [ ] **Step 7: Run the test to confirm it passes** - -```bash -cargo test -p huggingface_hub bucket_constructor_sets_namespace_and_repo 2>&1 -``` - -Expected: `test api::buckets::tests::bucket_constructor_sets_namespace_and_repo ... 
ok` - -- [ ] **Step 8: Commit** - -```bash -git add huggingface_hub/src/repository.rs huggingface_hub/src/api/buckets.rs \ - huggingface_hub/src/api/mod.rs huggingface_hub/src/client.rs \ - huggingface_hub/src/lib.rs -git commit -m "feat(bucket): add HFBucket struct and client.bucket() constructor" -``` - ---- - -## Task 4: Bucket CRUD — `get`, `delete`, `update_settings` - -**Files:** -- Modify: `huggingface_hub/src/api/buckets.rs` - -- [ ] **Step 1: Write failing tests** - -Add to the `tests` module in `huggingface_hub/src/api/buckets.rs`: - -```rust -#[cfg(test)] -mod tests { - use crate::HFClientBuilder; - - #[test] - fn bucket_constructor_sets_namespace_and_repo() { - let client = HFClientBuilder::new().build().unwrap(); - let bucket = client.bucket("myuser", "my-bucket"); - assert_eq!(bucket.namespace, "myuser"); - assert_eq!(bucket.repo, "my-bucket"); - } - - #[test] - fn get_bucket_url() { - let client = HFClientBuilder::new().build().unwrap(); - let bucket = client.bucket("myuser", "my-bucket"); - let url = format!( - "{}/api/buckets/{}/{}", - bucket.client.inner.endpoint, bucket.namespace, bucket.repo - ); - assert!(url.ends_with("/api/buckets/myuser/my-bucket")); - } - - #[test] - fn update_settings_url() { - let client = HFClientBuilder::new().build().unwrap(); - let bucket = client.bucket("myuser", "my-bucket"); - let url = format!( - "{}/api/buckets/{}/{}/settings", - bucket.client.inner.endpoint, bucket.namespace, bucket.repo - ); - assert!(url.ends_with("/api/buckets/myuser/my-bucket/settings")); - } -} -``` - -- [ ] **Step 2: Run the tests to confirm they fail** - -```bash -cargo test -p huggingface_hub get_bucket_url update_settings_url 2>&1 -``` - -Expected: compile error — `bucket.client.inner` not accessible. 
- -- [ ] **Step 3: Add `check_bucket_response` helper and implement CRUD methods** - -Replace `huggingface_hub/src/api/buckets.rs` with: - -```rust -use std::collections::VecDeque; - -use futures::{Stream, StreamExt}; - -use crate::error::{HFError, NotFoundContext}; -use crate::repository::HFBucket; -use crate::types::{ - BatchOp, BatchResult, BucketCreated, BucketInfo, BucketOverview, CreateBucketParams, - ListTreeParams, PathInfo, ResolvedFile, TreeEntry, TreePage, UpdateBucketParams, XetToken, -}; -use crate::{HFClient, Result}; - -/// Maps HTTP status codes to `HFError` variants for bucket API responses. -/// Bucket-level 404s map to `RepoNotFound`; file-level 404s map to `EntryNotFound`. -async fn check_bucket_response( - response: reqwest::Response, - repo_id: &str, - not_found_ctx: NotFoundContext, -) -> Result<reqwest::Response> { - if response.status().is_success() { - return Ok(response); - } - let status = response.status().as_u16(); - let url = response.url().to_string(); - let body = response.text().await.unwrap_or_default(); - Err(match status { - 401 => HFError::AuthRequired, - 403 => HFError::Forbidden, - 404 => match not_found_ctx { - NotFoundContext::Repo => HFError::RepoNotFound { - repo_id: repo_id.to_string(), - }, - NotFoundContext::Entry { path } => HFError::EntryNotFound { - path, - repo_id: repo_id.to_string(), - }, - _ => HFError::Http { status, url, body }, - }, - 409 => HFError::Conflict(body), - 429 => HFError::RateLimited, - 507 => HFError::QuotaExceeded, - _ => HFError::Http { status, url, body }, - }) -} - -impl HFBucket { - fn repo_id(&self) -> String { - format!("{}/{}", self.namespace, self.repo) - } - - fn bucket_url(&self) -> String { - format!( - "{}/api/buckets/{}/{}", - self.client.inner.endpoint, self.namespace, self.repo - ) - } - - /// Returns metadata about this bucket. 
- pub async fn get(&self) -> Result<BucketInfo> { - let resp = self - .client - .inner - .client - .get(self.bucket_url()) - .headers(self.client.auth_headers()) - .send() - .await - .map_err(HFError::Request)?; - let resp = check_bucket_response(resp, &self.repo_id(), NotFoundContext::Repo).await?; - resp.json().await.map_err(HFError::Json) - } - - /// Permanently deletes this bucket and all its files. - pub async fn delete(&self) -> Result<()> { - let resp = self - .client - .inner - .client - .delete(self.bucket_url()) - .headers(self.client.auth_headers()) - .send() - .await - .map_err(HFError::Request)?; - check_bucket_response(resp, &self.repo_id(), NotFoundContext::Repo).await?; - Ok(()) - } - - /// Updates visibility or CDN configuration for this bucket. - pub async fn update_settings(&self, params: UpdateBucketParams) -> Result<()> { - let resp = self - .client - .inner - .client - .put(format!("{}/settings", self.bucket_url())) - .headers(self.client.auth_headers()) - .json(&params) - .send() - .await - .map_err(HFError::Request)?; - check_bucket_response(resp, &self.repo_id(), NotFoundContext::Repo).await?; - Ok(()) - } -} - -#[cfg(test)] -mod tests { - use crate::HFClientBuilder; - - #[test] - fn bucket_constructor_sets_namespace_and_repo() { - let client = HFClientBuilder::new().build().unwrap(); - let bucket = client.bucket("myuser", "my-bucket"); - assert_eq!(bucket.namespace, "myuser"); - assert_eq!(bucket.repo, "my-bucket"); - } - - #[test] - fn get_bucket_url() { - let client = HFClientBuilder::new().build().unwrap(); - let bucket = client.bucket("myuser", "my-bucket"); - let url = format!( - "{}/api/buckets/{}/{}", - bucket.client.inner.endpoint, bucket.namespace, bucket.repo - ); - assert!(url.ends_with("/api/buckets/myuser/my-bucket")); - } - - #[test] - fn update_settings_url() { - let client = HFClientBuilder::new().build().unwrap(); - let bucket = client.bucket("myuser", "my-bucket"); - let url = format!( - "{}/api/buckets/{}/{}/settings", - 
bucket.client.inner.endpoint, bucket.namespace, bucket.repo - ); - assert!(url.ends_with("/api/buckets/myuser/my-bucket/settings")); - } -} -``` - -- [ ] **Step 4: Run the tests to confirm they pass** - -```bash -cargo test -p huggingface_hub get_bucket_url update_settings_url bucket_constructor 2>&1 -``` - -Expected: all 3 tests pass. - -- [ ] **Step 5: Commit** - -```bash -git add huggingface_hub/src/api/buckets.rs -git commit -m "feat(bucket): add get, delete, update_settings with check_bucket_response helper" -``` - ---- - -## Task 5: `HFClient::create_bucket` and `HFClient::list_buckets` - -**Files:** -- Modify: `huggingface_hub/src/api/buckets.rs` -- Modify: `huggingface_hub/src/client.rs` - -- [ ] **Step 1: Write failing tests** - -Add to the `tests` module in `api/buckets.rs`: - -```rust - #[test] - fn create_bucket_url() { - let client = HFClientBuilder::new().build().unwrap(); - let url = format!( - "{}/api/buckets/{}/{}", - client.inner.endpoint, "myuser", "new-bucket" - ); - assert!(url.ends_with("/api/buckets/myuser/new-bucket")); - } - - #[test] - fn list_buckets_url() { - let client = HFClientBuilder::new().build().unwrap(); - let url = format!("{}/api/buckets/{}", client.inner.endpoint, "myuser"); - assert!(url.ends_with("/api/buckets/myuser")); - } -``` - -- [ ] **Step 2: Run the tests to confirm they fail** - -```bash -cargo test -p huggingface_hub create_bucket_url list_buckets_url 2>&1 -``` - -Expected: compile error — `client.inner` not accessible from test or methods not found. - -- [ ] **Step 3: Add `create_bucket` and `list_buckets` to `client.rs`** - -In `huggingface_hub/src/client.rs`, add the following imports at the top if not already present: - -```rust -use url::Url; -``` - -Then add the new methods on `HFClient` (alongside `bucket()`): - -```rust -/// Creates a new bucket owned by `namespace`. 
-pub async fn create_bucket( - &self, - namespace: &str, - repo: &str, - params: crate::types::CreateBucketParams, -) -> crate::Result<crate::types::BucketCreated> { - let url = format!("{}/api/buckets/{}/{}", self.inner.endpoint, namespace, repo); - let resp = self - .inner - .client - .post(&url) - .headers(self.auth_headers()) - .json(&params) - .send() - .await - .map_err(crate::HFError::Request)?; - let repo_id = format!("{}/{}", namespace, repo); - let resp = crate::api::buckets::check_bucket_response( - resp, - &repo_id, - crate::error::NotFoundContext::Repo, - ) - .await?; - resp.json().await.map_err(crate::HFError::Json) -} - -/// Returns a paginated stream of all buckets owned by `namespace`. -/// Pagination is driven by `Link` response headers. -pub fn list_buckets( - &self, - namespace: &str, -) -> impl futures::Stream<Item = crate::Result<crate::types::BucketOverview>> + '_ { - let url = Url::parse(&format!("{}/api/buckets/{}", self.inner.endpoint, namespace)) - .expect("endpoint is a valid base URL"); - self.paginate(url, vec![], None) -} -``` - -Note: `check_bucket_response` needs to be `pub(crate)` in `api/buckets.rs`. Change its visibility there: - -```rust -pub(crate) async fn check_bucket_response( ... ) -``` - -- [ ] **Step 4: Run the tests to confirm they pass** - -```bash -cargo test -p huggingface_hub create_bucket_url list_buckets_url 2>&1 -``` - -Expected: both tests pass. 
- -- [ ] **Step 5: Commit** - -```bash -git add huggingface_hub/src/api/buckets.rs huggingface_hub/src/client.rs -git commit -m "feat(bucket): add HFClient::create_bucket and list_buckets" -``` - ---- - -## Task 6: `batch_files` — NDJSON serialization - -**Files:** -- Modify: `huggingface_hub/src/api/buckets.rs` - -- [ ] **Step 1: Write failing tests** - -Add to the `tests` module in `api/buckets.rs`: - -```rust - #[test] - fn batch_files_ndjson_adds_before_deletes() { - use crate::types::{AddFileOp, BatchOp, DeleteFileOp}; - - let ops = vec![ - BatchOp::DeleteFile(DeleteFileOp { path: "old.parquet".to_string() }), - BatchOp::AddFile(AddFileOp { - path: "new.parquet".to_string(), - xet_hash: "abc".to_string(), - content_type: "application/octet-stream".to_string(), - mtime: None, - }), - ]; - // Partition and serialize: adds must come first regardless of input order - let (adds, deletes): (Vec<_>, Vec<_>) = - ops.into_iter().partition(|op| matches!(op, BatchOp::AddFile(_))); - let ndjson: String = adds - .iter() - .chain(deletes.iter()) - .map(|op| serde_json::to_string(op).map(|s| s + "\n")) - .collect::>() - .unwrap(); - let lines: Vec<&str> = ndjson.lines().collect(); - assert_eq!(lines.len(), 2); - assert!(lines[0].contains("addFile"), "first line must be addFile, got: {}", lines[0]); - assert!(lines[1].contains("deleteFile"), "second line must be deleteFile"); - } - - #[test] - fn batch_files_each_line_ends_with_newline() { - use crate::types::{AddFileOp, BatchOp}; - let ops = vec![BatchOp::AddFile(AddFileOp { - path: "f.parquet".to_string(), - xet_hash: "h".to_string(), - content_type: "application/octet-stream".to_string(), - mtime: None, - })]; - let (adds, deletes): (Vec<_>, Vec<_>) = - ops.into_iter().partition(|op| matches!(op, BatchOp::AddFile(_))); - let ndjson: String = adds - .iter() - .chain(deletes.iter()) - .map(|op| serde_json::to_string(op).map(|s| s + "\n")) - .collect::>() - .unwrap(); - assert!(ndjson.ends_with('\n')); - } -``` - -- [ ] 
**Step 2: Run the tests to confirm they pass (logic is already testable)** - -```bash -cargo test -p huggingface_hub batch_files_ndjson batch_files_each_line 2>&1 -``` - -These tests only exercise the serialization logic which uses already-present types. They should compile and pass. If they don't compile, check that `BatchOp`, `AddFileOp`, `DeleteFileOp` are in scope. - -- [ ] **Step 3: Implement `batch_files` on `HFBucket`** - -Add the following method to the `impl HFBucket` block in `api/buckets.rs`: - -```rust - /// Adds and/or removes files in a single atomic operation. - /// - /// All `AddFile` operations are sent before `DeleteFile` operations, as required - /// by the batch protocol. The input order within each group is preserved. - pub async fn batch_files(&self, ops: Vec<BatchOp>) -> Result<BatchResult> { - let (adds, deletes): (Vec<_>, Vec<_>) = - ops.into_iter().partition(|op| matches!(op, BatchOp::AddFile(_))); - - let ndjson = adds - .iter() - .chain(deletes.iter()) - .map(|op| serde_json::to_string(op).map(|s| s + "\n")) - .collect::<Result<String, _>>() - .map_err(HFError::Json)?; - - let resp = self - .client - .inner - .client - .post(format!("{}/batch", self.bucket_url())) - .headers(self.client.auth_headers()) - .header("content-type", "application/x-ndjson") - .body(ndjson) - .send() - .await - .map_err(HFError::Request)?; - - let resp = - check_bucket_response(resp, &self.repo_id(), NotFoundContext::Repo).await?; - resp.json().await.map_err(HFError::Json) - } -``` - -Also add `use serde_json;` at the top of `api/buckets.rs` if not already present. - -- [ ] **Step 4: Run all bucket tests** - -```bash -cargo test -p huggingface_hub batch_files 2>&1 -``` - -Expected: both tests pass, no compile errors. 
- -- [ ] **Step 5: Commit** - -```bash -git add huggingface_hub/src/api/buckets.rs -git commit -m "feat(bucket): implement batch_files with NDJSON add-before-delete ordering" -``` - ---- - -## Task 7: `list_tree` — cursor-in-body streaming pagination - -**Files:** -- Modify: `huggingface_hub/src/api/buckets.rs` - -- [ ] **Step 1: Write a failing test for URL construction** - -Add to the `tests` module in `api/buckets.rs`: - -```rust - #[test] - fn list_tree_url_empty_path() { - let client = HFClientBuilder::new().build().unwrap(); - let bucket = client.bucket("myuser", "my-bucket"); - let url = if "".is_empty() { - format!( - "{}/api/buckets/{}/{}/tree", - bucket.client.inner.endpoint, bucket.namespace, bucket.repo - ) - } else { - format!( - "{}/api/buckets/{}/{}/tree/{}", - bucket.client.inner.endpoint, bucket.namespace, bucket.repo, "some/path" - ) - }; - assert!(url.ends_with("/api/buckets/myuser/my-bucket/tree")); - } - - #[test] - fn list_tree_url_with_path() { - let client = HFClientBuilder::new().build().unwrap(); - let bucket = client.bucket("myuser", "my-bucket"); - let path = "data/sub"; - let url = format!( - "{}/api/buckets/{}/{}/tree/{}", - bucket.client.inner.endpoint, bucket.namespace, bucket.repo, path - ); - assert!(url.ends_with("/api/buckets/myuser/my-bucket/tree/data/sub")); - } -``` - -- [ ] **Step 2: Run the tests to confirm they pass (URL logic is trivially testable)** - -```bash -cargo test -p huggingface_hub list_tree_url 2>&1 -``` - -Expected: both URL tests pass. - -- [ ] **Step 3: Implement `list_tree` on `HFBucket`** - -Add the following to the `impl HFBucket` block in `api/buckets.rs`. This uses `try_unfold` with a `VecDeque` buffer to yield one `TreeEntry` at a time while fetching pages lazily: - -```rust - /// Lists files and directories, yielding one entry at a time. - /// - /// Uses cursor-in-body pagination: the stream fetches the next page automatically - /// when the current page's entries are exhausted. 
No request is made until the - /// first item is polled. - pub fn list_tree( - &self, - path: &str, - params: ListTreeParams, - ) -> impl Stream<Item = Result<TreeEntry>> + '_ { - let base_url = if path.is_empty() { - format!( - "{}/api/buckets/{}/{}/tree", - self.client.inner.endpoint, self.namespace, self.repo - ) - } else { - format!( - "{}/api/buckets/{}/{}/tree/{}", - self.client.inner.endpoint, self.namespace, self.repo, path - ) - }; - let repo_id = self.repo_id(); - - // State: (buffered entries from current page, cursor for next page, whether we've fetched at all) - // cursor=None + fetched=false → fetch first page (no cursor param) - // cursor=Some(c) + fetched=_ → fetch next page with ?cursor=c - // cursor=None + fetched=true → no more pages, drain buffer then end - futures::stream::try_unfold( - (VecDeque::<TreeEntry>::new(), None::<String>, false), - move |(mut pending, cursor, fetched)| { - let client = self.client.clone(); - let repo_id = repo_id.clone(); - let base_url = base_url.clone(); - async move { - // Yield buffered items before fetching a new page - if let Some(entry) = pending.pop_front() { - return Ok(Some((entry, (pending, cursor, fetched)))); - } - // No buffered items. Are there more pages to fetch? 
- if fetched && cursor.is_none() { - return Ok(None); - } - // Fetch next (or first) page - let mut req = client - .inner - .client - .get(&base_url) - .headers(client.auth_headers()); - if let Some(ref c) = cursor { - req = req.query(&[("cursor", c.as_str())]); - } - if let Some(l) = params.limit { - req = req.query(&[("limit", l.to_string().as_str())]); - } - if params.recursive { - req = req.query(&[("recursive", "true")]); - } - let resp = req.send().await.map_err(HFError::Request)?; - let resp = - check_bucket_response(resp, &repo_id, NotFoundContext::Repo).await?; - let page: TreePage = resp.json().await.map_err(HFError::Json)?; - let next_cursor = page.next_cursor; - pending.extend(page.entries); - if let Some(entry) = pending.pop_front() { - Ok(Some((entry, (pending, next_cursor, true)))) - } else { - Ok(None) - } - } - }, - ) - } -``` - -Ensure `use std::collections::VecDeque;` is at the top of the file (it was included in Task 4). - -- [ ] **Step 4: Run all bucket tests** - -```bash -cargo test -p huggingface_hub list_tree 2>&1 -``` - -Expected: all `list_tree_url_*` tests pass, no compile errors. 
- -- [ ] **Step 5: Commit** - -```bash -git add huggingface_hub/src/api/buckets.rs -git commit -m "feat(bucket): implement list_tree with cursor-in-body streaming pagination" -``` - ---- - -## Task 8: `get_paths_info`, `get_xet_write_token`, `get_xet_read_token` - -**Files:** -- Modify: `huggingface_hub/src/api/buckets.rs` - -- [ ] **Step 1: Write failing tests** - -Add to the `tests` module in `api/buckets.rs`: - -```rust - #[test] - fn xet_token_urls() { - let client = HFClientBuilder::new().build().unwrap(); - let bucket = client.bucket("myuser", "my-bucket"); - let write_url = format!( - "{}/api/buckets/{}/{}/xet-write-token", - bucket.client.inner.endpoint, bucket.namespace, bucket.repo - ); - let read_url = format!( - "{}/api/buckets/{}/{}/xet-read-token", - bucket.client.inner.endpoint, bucket.namespace, bucket.repo - ); - assert!(write_url.ends_with("/xet-write-token")); - assert!(read_url.ends_with("/xet-read-token")); - } - - #[test] - fn paths_info_url() { - let client = HFClientBuilder::new().build().unwrap(); - let bucket = client.bucket("myuser", "my-bucket"); - let url = format!( - "{}/api/buckets/{}/{}/paths-info", - bucket.client.inner.endpoint, bucket.namespace, bucket.repo - ); - assert!(url.ends_with("/paths-info")); - } -``` - -- [ ] **Step 2: Run the tests to confirm they pass** - -```bash -cargo test -p huggingface_hub xet_token_urls paths_info_url 2>&1 -``` - -Expected: both tests pass (URL construction tests don't need the methods yet). - -- [ ] **Step 3: Implement the three methods on `HFBucket`** - -Add to the `impl HFBucket` block in `api/buckets.rs`: - -```rust - /// Returns metadata for a batch of file paths. 
- pub async fn get_paths_info(&self, paths: Vec<String>) -> Result<Vec<PathInfo>> { - #[derive(serde::Serialize)] - struct Body { - paths: Vec<String>, - } - - let resp = self - .client - .inner - .client - .post(format!("{}/paths-info", self.bucket_url())) - .headers(self.client.auth_headers()) - .json(&Body { paths }) - .send() - .await - .map_err(HFError::Request)?; - - let resp = check_bucket_response( - resp, - &self.repo_id(), - NotFoundContext::Entry { path: String::new() }, - ) - .await?; - resp.json().await.map_err(HFError::Json) - } - - /// Returns a short-lived JWT for uploading files to the Xet CAS. - /// Use the returned `cas_url` and `token` to push file bytes before calling `batch_files`. - pub async fn get_xet_write_token(&self) -> Result<XetToken> { - let resp = self - .client - .inner - .client - .get(format!("{}/xet-write-token", self.bucket_url())) - .headers(self.client.auth_headers()) - .send() - .await - .map_err(HFError::Request)?; - let resp = - check_bucket_response(resp, &self.repo_id(), NotFoundContext::Repo).await?; - resp.json().await.map_err(HFError::Json) - } - - /// Returns a short-lived JWT for downloading files from the Xet CAS directly. - pub async fn get_xet_read_token(&self) -> Result<XetToken> { - let resp = self - .client - .inner - .client - .get(format!("{}/xet-read-token", self.bucket_url())) - .headers(self.client.auth_headers()) - .send() - .await - .map_err(HFError::Request)?; - let resp = - check_bucket_response(resp, &self.repo_id(), NotFoundContext::Repo).await?; - resp.json().await.map_err(HFError::Json) - } -``` - -- [ ] **Step 4: Run all bucket tests** - -```bash -cargo test -p huggingface_hub -p huggingface_hub 2>&1 | grep "bucket\|FAILED\|ok" -``` - -Expected: all bucket tests pass, no new failures. 
- -- [ ] **Step 5: Commit** - -```bash -git add huggingface_hub/src/api/buckets.rs -git commit -m "feat(bucket): implement get_paths_info, get_xet_write_token, get_xet_read_token" -``` - ---- - -## Task 9: `resolve_file` — redirect capture with header extraction - -**Files:** -- Modify: `huggingface_hub/src/api/buckets.rs` - -- [ ] **Step 1: Write a failing test for `resolve_file` header parsing** - -Add to the `tests` module in `api/buckets.rs`: - -```rust - #[test] - fn resolve_file_parses_link_header() { - // Verify the Link header parsing logic for xet-auth and xet-reconstruction-info - let link = r#"<https://auth.example.com/token>; rel="xet-auth", <https://xet.example.com/reconstruct/abc>; rel="xet-reconstruction-info""#; - let mut xet_auth = None; - let mut xet_reconstruction = None; - for part in link.split(',') { - let part = part.trim(); - if let Some((url_part, rel_part)) = part.split_once(';') { - let url = url_part.trim().trim_start_matches('<').trim_end_matches('>').to_string(); - let rel = rel_part.trim(); - if rel.contains("xet-auth") { - xet_auth = Some(url); - } else if rel.contains("xet-reconstruction-info") { - xet_reconstruction = Some(url); - } - } - } - assert_eq!(xet_auth.unwrap(), "https://auth.example.com/token"); - assert_eq!( - xet_reconstruction.unwrap(), - "https://xet.example.com/reconstruct/abc" - ); - } - - #[test] - fn resolve_file_url() { - let client = HFClientBuilder::new().build().unwrap(); - let bucket = client.bucket("myuser", "my-bucket"); - // Note: no /api/ prefix for resolve - let url = format!( - "{}/buckets/{}/{}/resolve/{}", - bucket.client.inner.endpoint, bucket.namespace, bucket.repo, "data/train.parquet" - ); - assert!(url.contains("/buckets/myuser/my-bucket/resolve/data/train.parquet")); - assert!(!url.contains("/api/")); - } -``` - -- [ ] **Step 2: Run the tests to confirm the link parsing test passes** - -```bash -cargo test -p huggingface_hub resolve_file_parses_link resolve_file_url 2>&1 -``` - -Expected: both tests pass (they test pure logic, no network). 
- -- [ ] **Step 3: Implement `resolve_file` on `HFBucket`** - -Add to the `impl HFBucket` block in `api/buckets.rs`: - -```rust - /// Resolves a file path to a direct download URL. - /// - /// Uses the no-redirect client to capture the 302 `Location` header rather than - /// following it. Metadata is extracted from response headers: - /// `X-Linked-Size`, `X-XET-Hash`, `X-Linked-ETag`, `Last-Modified`, and `Link`. - pub async fn resolve_file(&self, path: &str) -> Result { - // Note: no /api/ prefix — this is the file-serving route, not the metadata API. - let url = format!( - "{}/buckets/{}/{}/resolve/{}", - self.client.inner.endpoint, self.namespace, self.repo, path - ); - let resp = self - .client - .inner - .no_redirect_client - .get(&url) - .headers(self.client.auth_headers()) - .send() - .await - .map_err(HFError::Request)?; - - if !resp.status().is_redirection() { - return Err( - check_bucket_response( - resp, - &self.repo_id(), - NotFoundContext::Entry { path: path.to_string() }, - ) - .await - .unwrap_err(), - ); - } - - let headers = resp.headers(); - - let location = headers - .get("location") - .and_then(|v| v.to_str().ok()) - .map(str::to_owned) - .ok_or_else(|| HFError::Http { - status: resp.status().as_u16(), - url: url.clone(), - body: "missing Location header".to_string(), - })?; - - let size = headers - .get("x-linked-size") - .and_then(|v| v.to_str().ok()) - .and_then(|s| s.parse::().ok()); - - let xet_hash = headers - .get("x-xet-hash") - .and_then(|v| v.to_str().ok()) - .map(str::to_owned); - - let etag = headers - .get("x-linked-etag") - .and_then(|v| v.to_str().ok()) - .map(str::to_owned); - - let last_modified = headers - .get("last-modified") - .and_then(|v| v.to_str().ok()) - .map(str::to_owned); - - // Parse Link header: ; rel="xet-auth", ; rel="xet-reconstruction-info" - let mut xet_auth_url = None; - let mut xet_reconstruction_url = None; - if let Some(link) = headers.get("link").and_then(|v| v.to_str().ok()) { - for part in 
link.split(',') { - let part = part.trim(); - if let Some((url_part, rel_part)) = part.split_once(';') { - let u = url_part.trim().trim_start_matches('<').trim_end_matches('>').to_string(); - if rel_part.contains("xet-auth") { - xet_auth_url = Some(u); - } else if rel_part.contains("xet-reconstruction-info") { - xet_reconstruction_url = Some(u); - } - } - } - } - - Ok(ResolvedFile { - url: location, - size, - xet_hash, - etag, - last_modified, - xet_auth_url, - xet_reconstruction_url, - }) - } -``` - -- [ ] **Step 4: Run all bucket tests** - -```bash -cargo test -p huggingface_hub 2>&1 | grep -E "bucket|resolve|FAILED|error" -``` - -Expected: all existing tests still pass, no compile errors. - -- [ ] **Step 5: Commit** - -```bash -git add huggingface_hub/src/api/buckets.rs -git commit -m "feat(bucket): implement resolve_file with redirect capture and header extraction" -``` - ---- - -## Task 10: `xet_resolve_file` (feature = `"xet"`) - -**Files:** -- Modify: `huggingface_hub/src/api/buckets.rs` - -- [ ] **Step 1: Write a failing test** - -Add to the `tests` module in `api/buckets.rs`: - -```rust - #[cfg(feature = "xet")] - #[test] - fn xet_resolve_file_url() { - let client = HFClientBuilder::new().build().unwrap(); - let bucket = client.bucket("myuser", "my-bucket"); - // Same URL as resolve_file — Accept header determines the response format - let url = format!( - "{}/buckets/{}/{}/resolve/{}", - bucket.client.inner.endpoint, bucket.namespace, bucket.repo, "data/train.parquet" - ); - assert!(url.contains("/buckets/myuser/my-bucket/resolve/data/train.parquet")); - } -``` - -- [ ] **Step 2: Run the test under the xet feature** - -```bash -cargo test -p huggingface_hub --features xet xet_resolve_file_url 2>&1 -``` - -Expected: compile error — `xet_resolve_file` method not found (the test will compile but the URL test itself may pass; confirm `XetFileInfo` is missing). 
- -- [ ] **Step 3: Implement `xet_resolve_file` on `HFBucket`** - -Add to the `impl HFBucket` block in `api/buckets.rs`: - -```rust - /// Resolves a file path and returns Xet reconstruction metadata. - /// - /// Sends `Accept: application/vnd.xet-fileinfo+json` to request the JSON response - /// instead of a redirect. Use the returned `reconstruction_url` to fetch chunk data - /// from the Xet CAS directly. - #[cfg(feature = "xet")] - pub async fn xet_resolve_file(&self, path: &str) -> Result { - let url = format!( - "{}/buckets/{}/{}/resolve/{}", - self.client.inner.endpoint, self.namespace, self.repo, path - ); - let resp = self - .client - .inner - .client - .get(&url) - .headers(self.client.auth_headers()) - .header("accept", "application/vnd.xet-fileinfo+json") - .send() - .await - .map_err(HFError::Request)?; - let resp = check_bucket_response( - resp, - &self.repo_id(), - NotFoundContext::Entry { path: path.to_string() }, - ) - .await?; - resp.json().await.map_err(HFError::Json) - } -``` - -- [ ] **Step 4: Run tests with the xet feature** - -```bash -cargo test -p huggingface_hub --features xet xet_resolve_file_url 2>&1 -``` - -Expected: test passes. - -- [ ] **Step 5: Confirm the build still works without the xet feature** - -```bash -cargo build -p huggingface_hub 2>&1 -``` - -Expected: compiles cleanly (no xet feature). 
- -- [ ] **Step 6: Commit** - -```bash -git add huggingface_hub/src/api/buckets.rs -git commit -m "feat(bucket): implement xet_resolve_file (feature = xet)" -``` - ---- - -## Task 11: Blocking wrappers (`HFBucketSync`) - -**Files:** -- Modify: `huggingface_hub/src/blocking.rs` -- Modify: `huggingface_hub/src/lib.rs` - -- [ ] **Step 1: Write a failing test** - -Add to `huggingface_hub/src/blocking.rs` (inside `#[cfg(test)]` if one exists, or add a new one): - -```rust -#[cfg(test)] -mod bucket_tests { - #[cfg(feature = "blocking")] - #[test] - fn bucket_sync_constructor() { - use crate::HFClientBuilder; - let client = crate::blocking::HFClientSync::from(HFClientBuilder::new().build().unwrap()); - let bucket = client.bucket("myuser", "my-bucket"); - assert_eq!(bucket.inner.namespace, "myuser"); - assert_eq!(bucket.inner.repo, "my-bucket"); - } -} -``` - -- [ ] **Step 2: Run the test to confirm it fails** - -```bash -cargo test -p huggingface_hub --features blocking bucket_sync_constructor 2>&1 -``` - -Expected: compile error — `HFClientSync::bucket` does not exist. - -- [ ] **Step 3: Add `HFBucketSync` struct to `blocking.rs`** - -In `huggingface_hub/src/blocking.rs`, add alongside `HFRepositorySync` and `HFSpaceSync`: - -```rust -/// Synchronous handle for Storage Bucket operations. -/// -/// Obtain via [`HFClientSync::bucket`]. All methods block the current thread. -#[cfg(feature = "blocking")] -#[derive(Clone)] -pub struct HFBucketSync { - pub(crate) inner: crate::repository::HFBucket, - pub(crate) runtime: std::sync::Arc, -} -``` - -- [ ] **Step 4: Add `HFClientSync::bucket()` in `blocking.rs`** - -In `blocking.rs`, find the `impl HFClientSync` block and add: - -```rust - /// Creates a synchronous bucket handle. 
- pub fn bucket( - &self, - namespace: impl Into<String>, - repo: impl Into<String>, - ) -> HFBucketSync { - HFBucketSync { - inner: self.inner.bucket(namespace, repo), - runtime: self.runtime.clone(), - } - } -``` - -- [ ] **Step 5: Add blocking methods to `HFBucketSync` in `blocking.rs`** - -Add an `impl HFBucketSync` block: - -```rust -#[cfg(feature = "blocking")] -impl HFBucketSync { - pub fn get(&self) -> crate::Result<crate::types::BucketInfo> { - self.runtime.block_on(self.inner.get()) - } - - pub fn delete(&self) -> crate::Result<()> { - self.runtime.block_on(self.inner.delete()) - } - - pub fn update_settings( - &self, - params: crate::types::UpdateBucketParams, - ) -> crate::Result<()> { - self.runtime.block_on(self.inner.update_settings(params)) - } - - pub fn batch_files( - &self, - ops: Vec<crate::types::BatchOp>, - ) -> crate::Result<crate::types::BatchResult> { - self.runtime.block_on(self.inner.batch_files(ops)) - } - - pub fn list_tree( - &self, - path: &str, - params: crate::types::ListTreeParams, - ) -> crate::Result<Vec<crate::types::TreeEntry>> { - use futures::StreamExt; - self.runtime.block_on(async { - let stream = self.inner.list_tree(path, params); - futures::pin_mut!(stream); - let mut items = Vec::new(); - while let Some(item) = stream.next().await { - items.push(item?); - } - Ok(items) - }) - } - - pub fn get_paths_info( - &self, - paths: Vec<String>, - ) -> crate::Result<Vec<crate::types::PathInfo>> { - self.runtime.block_on(self.inner.get_paths_info(paths)) - } - - pub fn get_xet_write_token(&self) -> crate::Result<crate::types::XetToken> { - self.runtime.block_on(self.inner.get_xet_write_token()) - } - - pub fn get_xet_read_token(&self) -> crate::Result<crate::types::XetToken> { - self.runtime.block_on(self.inner.get_xet_read_token()) - } - - pub fn resolve_file(&self, path: &str) -> crate::Result<crate::types::ResolvedFile> { - self.runtime.block_on(self.inner.resolve_file(path)) - } - - #[cfg(feature = "xet")] - pub fn xet_resolve_file(&self, path: &str) -> crate::Result<crate::types::XetFileInfo> { - self.runtime.block_on(self.inner.xet_resolve_file(path)) - } -} -``` - -Also add a `list_buckets` blocking method on `HFClientSync`. 
Find the `impl HFClientSync` block and add: - -```rust - pub fn list_buckets( - &self, - namespace: &str, - ) -> crate::Result> { - use futures::StreamExt; - self.runtime.block_on(async { - let stream = self.inner.list_buckets(namespace); - futures::pin_mut!(stream); - let mut items = Vec::new(); - while let Some(item) = stream.next().await { - items.push(item?); - } - Ok(items) - }) - } - - pub fn create_bucket( - &self, - namespace: &str, - repo: &str, - params: crate::types::CreateBucketParams, - ) -> crate::Result { - self.runtime.block_on(self.inner.create_bucket(namespace, repo, params)) - } -``` - -- [ ] **Step 6: Export `HFBucketSync` from `lib.rs`** - -In `huggingface_hub/src/lib.rs`, find the blocking re-export line: - -```rust -#[cfg(feature = "blocking")] -pub use blocking::{HFClientSync, HFRepoSync, HFRepositorySync, HFSpaceSync}; -``` - -Add `HFBucketSync` to this list: - -```rust -#[cfg(feature = "blocking")] -pub use blocking::{HFBucketSync, HFClientSync, HFRepoSync, HFRepositorySync, HFSpaceSync}; -``` - -- [ ] **Step 7: Run the test to confirm it passes** - -```bash -cargo test -p huggingface_hub --features blocking bucket_sync_constructor 2>&1 -``` - -Expected: test passes. - -- [ ] **Step 8: Run the full test suite to check for regressions** - -```bash -cargo test -p huggingface_hub --features blocking 2>&1 | grep -E "FAILED|error\[" | head -20 -``` - -Expected: no failures. - -- [ ] **Step 9: Commit** - -```bash -git add huggingface_hub/src/blocking.rs huggingface_hub/src/lib.rs -git commit -m "feat(bucket): add HFBucketSync blocking wrappers" -``` - ---- - -## Task 12: Integration tests - -**Files:** -- Modify: `huggingface_hub/tests/integration_test.rs` - -- [ ] **Step 1: Write the integration tests (they will be skipped without credentials)** - -Add the following to `huggingface_hub/tests/integration_test.rs`. 
The `api()` and `write_enabled()` helpers are already defined in the file; add only the new test functions: - -```rust -// ---- HFBucket integration tests ---- - -/// Helper: creates a unique test bucket name to avoid collisions between runs. -fn test_bucket_name() -> String { - format!( - "test-bucket-{}", - std::time::SystemTime::now() - .duration_since(std::time::UNIX_EPOCH) - .unwrap() - .as_millis() - ) -} - -#[tokio::test] -async fn test_list_buckets() { - let Some(api) = api() else { return }; - let username = cached_username().await; - // list_buckets is a read operation — no HF_TEST_WRITE required - let buckets: Vec<_> = api - .list_buckets(username) - .collect::>() - .await - .into_iter() - .collect::>>() - .expect("list_buckets failed"); - // Simply assert the call succeeds; the user may have zero buckets - let _ = buckets; -} - -#[tokio::test] -async fn test_create_and_delete_bucket() { - let Some(api) = api() else { return }; - if !write_enabled() { - return; - } - let username = cached_username().await; - let name = test_bucket_name(); - - // Create - let created = api - .create_bucket( - username, - &name, - huggingface_hub::CreateBucketParams::builder().private(true).build(), - ) - .await - .expect("create_bucket failed"); - assert!(created.id.contains(&name)); - - // Get - let bucket = api.bucket(username, &name); - let info = bucket.get().await.expect("get failed"); - assert_eq!(info.name, name); - assert!(info.private); - - // Update settings - bucket - .update_settings( - huggingface_hub::UpdateBucketParams::builder().private(false).build(), - ) - .await - .expect("update_settings failed"); - - let info = bucket.get().await.unwrap(); - assert!(!info.private); - - // Delete - bucket.delete().await.expect("delete failed"); - - // Confirm gone - assert!(matches!( - bucket.get().await, - Err(huggingface_hub::HFError::RepoNotFound { .. 
}) - )); -} - -#[tokio::test] -async fn test_bucket_list_tree_empty() { - let Some(api) = api() else { return }; - if !write_enabled() { - return; - } - let username = cached_username().await; - let name = test_bucket_name(); - - api.create_bucket( - username, - &name, - huggingface_hub::CreateBucketParams::builder().build(), - ) - .await - .expect("create_bucket failed"); - - let bucket = api.bucket(username, &name); - - let entries: Vec<_> = bucket - .list_tree("", huggingface_hub::ListTreeParams::builder().build()) - .collect::>() - .await - .into_iter() - .collect::>>() - .expect("list_tree failed"); - - assert!(entries.is_empty(), "new bucket should have no files"); - - bucket.delete().await.unwrap(); -} - -#[tokio::test] -async fn test_get_xet_write_and_read_token() { - let Some(api) = api() else { return }; - if !write_enabled() { - return; - } - let username = cached_username().await; - let name = test_bucket_name(); - - api.create_bucket( - username, - &name, - huggingface_hub::CreateBucketParams::builder().build(), - ) - .await - .unwrap(); - - let bucket = api.bucket(username, &name); - - let write_tok = bucket.get_xet_write_token().await.expect("xet write token failed"); - assert!(!write_tok.token.is_empty()); - assert!(!write_tok.cas_url.is_empty()); - - let read_tok = bucket.get_xet_read_token().await.expect("xet read token failed"); - assert!(!read_tok.token.is_empty()); - - bucket.delete().await.unwrap(); -} -``` - -- [ ] **Step 2: Run the integration tests without credentials (they should be skipped)** - -```bash -cargo test -p huggingface_hub --test integration_test test_list_buckets test_create_and_delete_bucket test_bucket_list_tree test_get_xet 2>&1 -``` - -Expected: all 4 tests report "ok" (they exit early due to missing `HF_TOKEN`). - -- [ ] **Step 3: Run the full library test suite to check for regressions** - -```bash -cargo test -p huggingface_hub 2>&1 | grep -E "FAILED|error\[" | head -20 -``` - -Expected: no failures. 
- -- [ ] **Step 4: Commit** - -```bash -git add huggingface_hub/tests/integration_test.rs -git commit -m "test(bucket): add integration tests for create, get, update, delete, list_tree, xet tokens" -``` - ---- - -## Self-Review - -After all tasks are complete, run the full suite one final time: - -```bash -cargo test -p huggingface_hub 2>&1 -cargo test -p huggingface_hub --features blocking 2>&1 -cargo test -p huggingface_hub --features xet 2>&1 -cargo clippy -p huggingface_hub -- -D warnings 2>&1 -``` - -All expected clean. diff --git a/docs/specs/2026-04-08-hf-bucket-rust-client-design.md b/docs/specs/2026-04-08-hf-bucket-rust-client-design.md deleted file mode 100644 index a5da653..0000000 --- a/docs/specs/2026-04-08-hf-bucket-rust-client-design.md +++ /dev/null @@ -1,437 +0,0 @@ -# HFBucket Rust Client Design - -**Date:** 2026-04-08 -**Repo:** `huggingface/huggingface_hub_rust` -**Scope:** New `HFBucket` type + `HFClient` extensions + supporting types and error variants - -## Overview - -Add a `HFBucket` type to `huggingface_hub_rust` that exposes the HuggingFace Storage Buckets API (moon-landing). Buckets use content-addressable Xet storage rather than Git, making `HFRepository` the wrong abstraction — `HFBucket` is a separate handle type following the `HFSpace` precedent. - -This spec covers the raw API surface only (option A). Higher-level upload abstractions (wrapping the Xet write token + batch commit flow into a single `upload_file` call) are deferred to a follow-up. - -**Reference implementation:** `s3-gateway/src/hub_client/` in `huggingface/xet-catalogue`. - ---- - -## Module Structure - -Two new files, wired into their respective `mod.rs` files: - -``` -huggingface_hub/src/ -├── api/ -│ └── buckets.rs — HFBucket impl, HFClient::bucket / create_bucket / list_buckets -├── types/ -│ └── buckets.rs — all request/response types -``` - -`lib.rs` exports `HFBucket` at the crate root. No feature flag — buckets are part of the default library surface. 
- ---- - -## `HFBucket` Type - -```rust -pub struct HFBucket { - pub(crate) inner: Arc, - pub namespace: String, - pub repo: String, -} -``` - -Constructed via `HFClient::bucket()` — no I/O, no allocation beyond the string copies. - -### `HFClient` extensions - -```rust -// Constructs a bucket handle -pub fn bucket(&self, namespace: impl Into, repo: impl Into) -> HFBucket - -// POST /api/buckets/:ns/:repo -pub async fn create_bucket( - &self, - namespace: &str, - repo: &str, - params: CreateBucketParams, -) -> Result - -// GET /api/buckets/:ns — Link-header paginated stream -pub fn list_buckets(&self, namespace: &str) -> impl Stream> -``` - -### `HFBucket` methods - -```rust -// GET /api/buckets/:ns/:repo -pub async fn get(&self) -> Result - -// DELETE /api/buckets/:ns/:repo -pub async fn delete(&self) -> Result<()> - -// PUT /api/buckets/:ns/:repo/settings -pub async fn update_settings(&self, params: UpdateBucketParams) -> Result<()> - -// POST /api/buckets/:ns/:repo/batch (NDJSON) -pub async fn batch_files(&self, ops: Vec) -> Result - -// GET /api/buckets/:ns/:repo/tree[/:path] — cursor-from-body paginated stream -pub fn list_tree(&self, path: &str, params: ListTreeParams) -> impl Stream> - -// POST /api/buckets/:ns/:repo/paths-info -pub async fn get_paths_info(&self, paths: Vec) -> Result> - -// GET /api/buckets/:ns/:repo/xet-write-token -pub async fn get_xet_write_token(&self) -> Result - -// GET /api/buckets/:ns/:repo/xet-read-token -pub async fn get_xet_read_token(&self) -> Result - -// GET /buckets/:ns/:repo/resolve/:path (no /api/ prefix) -pub async fn resolve_file(&self, path: &str) -> Result - -// GET /buckets/:ns/:repo/resolve/:path with Xet Accept header -#[cfg(feature = "xet")] -pub async fn xet_resolve_file(&self, path: &str) -> Result -``` - ---- - -## Types (`src/types/buckets.rs`) - -### Parameter types - -All use `TypedBuilder`. `cursor` is omitted from list params — streaming handles pagination internally. 
- -```rust -#[derive(TypedBuilder, Serialize)] -pub struct CreateBucketParams { - #[builder(default, setter(strip_option))] - pub private: Option, - #[builder(default, setter(strip_option, into))] - #[serde(rename = "resourceGroupId", skip_serializing_if = "Option::is_none")] - pub resource_group_id: Option, - #[builder(default)] - #[serde(skip_serializing_if = "Vec::is_empty")] - pub cdn: Vec, -} - -#[derive(TypedBuilder, Serialize)] -pub struct UpdateBucketParams { - #[builder(default, setter(strip_option))] - #[serde(skip_serializing_if = "Option::is_none")] - pub private: Option, - #[builder(default, setter(strip_option))] - #[serde(rename = "cdnRegions", skip_serializing_if = "Option::is_none")] - pub cdn_regions: Option>, -} - -#[derive(TypedBuilder)] -pub struct ListTreeParams { - #[builder(default, setter(strip_option))] - pub limit: Option, - #[builder(default)] - pub recursive: bool, -} -``` - -### Response types - -```rust -#[derive(Debug, Deserialize)] -pub struct BucketCreated { - pub url: String, - pub name: String, - pub id: String, -} - -#[derive(Debug, Deserialize)] -pub struct BucketInfo { - pub id: String, - pub name: String, - pub namespace: String, - pub private: bool, - #[serde(rename = "usedStorage")] - pub used_storage: u64, - #[serde(rename = "totalFiles")] - pub total_files: u64, - pub cdn: Vec, - pub region: String, -} - -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct CdnRegion { - pub provider: String, - pub region: String, -} - -#[derive(Debug, Deserialize)] -pub struct XetToken { - pub token: String, - #[serde(rename = "casUrl")] - pub cas_url: String, - #[serde(rename = "expiresAt")] - pub expires_at: String, -} - -#[derive(Debug, Deserialize)] -pub struct PathInfo { - pub path: String, - pub size: u64, - #[serde(rename = "xetHash")] - pub xet_hash: String, - #[serde(rename = "contentType")] - pub content_type: String, - pub mtime: i64, -} - -#[derive(Debug, Deserialize)] -pub struct TreeEntry { - #[serde(rename = 
"type")] - pub entry_type: EntryType, - pub path: String, - pub size: Option, - #[serde(rename = "xetHash")] - pub xet_hash: Option, - #[serde(rename = "contentType")] - pub content_type: Option, -} - -#[derive(Debug, Deserialize)] -#[serde(rename_all = "lowercase")] -pub enum EntryType { File, Directory } -``` - -### `BucketOverview` (returned by `list_buckets`) - -`list_buckets` yields `BucketOverview`, which is distinct from `BucketInfo` returned by `get()`. The `id` field is the full `"namespace/repo"` string. - -```rust -#[derive(Debug, Deserialize)] -pub struct BucketOverview { - #[serde(rename = "_id")] - pub mongo_id: String, - pub id: String, // "namespace/repo" - pub author: String, - pub private: Option, // nullable - #[serde(rename = "repoType")] - pub repo_type: String, // always "bucket" - #[serde(rename = "createdAt")] - pub created_at: String, - #[serde(rename = "updatedAt")] - pub updated_at: String, - pub size: u64, - #[serde(rename = "totalFiles")] - pub total_files: u64, - #[serde(rename = "cdnRegions")] - pub cdn_regions: Vec, - #[serde(rename = "resourceGroup")] - pub resource_group: Option, -} - -#[derive(Debug, Deserialize)] -pub struct ResourceGroup { - pub id: String, - pub name: String, - #[serde(rename = "numUsers")] - pub num_users: Option, -} -``` - -An internal `TreePage` struct (not public) is used for `list_tree` pagination: - -```rust -#[derive(Deserialize)] -struct TreePage { - entries: Vec, - #[serde(rename = "nextCursor")] - next_cursor: Option, -} -``` - -### Batch types - -The protocol requires all `addFile` entries to precede any `deleteFile` entries. Enforced in `batch_files` via partition before serialization. 
- -```rust -#[derive(Debug, Serialize)] -#[serde(tag = "type")] -pub enum BatchOp { - #[serde(rename = "addFile")] - AddFile(AddFileOp), - #[serde(rename = "deleteFile")] - DeleteFile(DeleteFileOp), -} - -#[derive(Debug, Serialize)] -pub struct AddFileOp { - pub path: String, - #[serde(rename = "xetHash")] - pub xet_hash: String, - #[serde(rename = "contentType")] - pub content_type: String, - #[serde(skip_serializing_if = "Option::is_none")] - pub mtime: Option, -} - -#[derive(Debug, Serialize)] -pub struct DeleteFileOp { - pub path: String, -} - -#[derive(Debug, Deserialize)] -pub struct BatchResult { - pub success: bool, - pub processed: u32, - pub succeeded: u32, - pub failed: Vec, -} - -#[derive(Debug, Deserialize)] -pub struct BatchFailure { - pub path: String, - pub error: String, -} -``` - -### `ResolvedFile` - -Constructed from redirect response headers, not a JSON body. The `Link` header contains two entries identified by `rel` name: - -``` -Link: ; rel="xet-auth", ; rel="xet-reconstruction-info" -``` - -```rust -#[derive(Debug)] -pub struct ResolvedFile { - pub url: String, // Location header - pub size: Option, // X-Linked-Size - pub xet_hash: Option, // X-XET-Hash - pub etag: Option, // X-Linked-ETag - pub last_modified: Option, // Last-Modified - pub xet_auth_url: Option, // Link rel="xet-auth" - pub xet_reconstruction_url: Option, // Link rel="xet-reconstruction-info" -} -``` - -### `XetFileInfo` (feature = `"xet"`) - -```rust -#[derive(Debug, Deserialize)] -pub struct XetFileInfo { - pub hash: String, - #[serde(rename = "refreshUrl")] - pub refresh_url: String, - #[serde(rename = "reconstructionUrl")] - pub reconstruction_url: String, - pub etag: String, - pub size: u64, - #[serde(rename = "contentType")] - pub content_type: String, -} -``` - ---- - -## Pagination - -**`list_buckets`** returns a JSON array with pagination via `Link` response headers (`rel="next"`), identical to the model/dataset list endpoints. 
The existing `paginate()` helper can be reused directly. - -**`list_tree`** returns a JSON object `{ entries, nextCursor }` with cursor-in-body pagination. It uses `futures::stream::try_unfold` over cursor-from-body pagination, the same pattern as `pagination.rs` but reading `next_cursor` from the deserialized body rather than a `Link` header. - -`list_tree` query params: `limit` (if set), `recursive` (if true), `cursor` (if continuing). Path suffix appended only when non-empty: - -``` -/api/buckets/:ns/:repo/tree (empty path) -/api/buckets/:ns/:repo/tree/:path (non-empty path) -``` - -Both streams are lazy — no HTTP request is made until the caller polls the first item. - ---- - -## `resolve_file` and `xet_resolve_file` - -**`resolve_file`** uses `inner.no_redirect_client` (already on `HFClientInner`) to prevent automatic redirect following. Expects a 3xx response. Reads `Location` and the metadata headers to populate `ResolvedFile`. Any non-redirect response (including 200) is passed to the standard error handler. - -**`xet_resolve_file`** (feature = `"xet"`) uses the regular client. Sends `Accept: application/vnd.xet-fileinfo+json`. Expects a 200 JSON body deserializing to `XetFileInfo`. - ---- - -## `batch_files` NDJSON - -``` -POST /api/buckets/:ns/:repo/batch -Content-Type: application/x-ndjson -``` - -Implementation: -1. Partition `ops` into adds and deletes (preserving within-group order). -2. Serialize each op with `serde_json::to_string`, append `\n`. -3. Concatenate into a single string body — adds first, then deletes. -4. Serialization errors surface as `HFError::Json`. - ---- - -## Error Handling - -Four new variants added to `HFError`. They will only be emitted by bucket API methods in this PR; updating existing non-bucket methods to use them is out of scope. - -```rust -pub enum HFError { - // ... existing variants ... 
- #[error("forbidden")] - Forbidden, - #[error("conflict: {0}")] - Conflict(String), // carries response body - #[error("rate limited")] - RateLimited, - #[error("quota exceeded")] - QuotaExceeded, -} -``` - -Full status mapping for bucket methods: - -| Status | `HFError` variant | -|--------|-------------------| -| 401 | `AuthRequired` (existing) | -| 403 | `Forbidden` (new) | -| 404 on bucket | `RepoNotFound { repo_id: "ns/repo" }` (existing) | -| 404 on file | `EntryNotFound { path, repo_id: "ns/repo" }` (existing) | -| 409 | `Conflict(body)` (new) | -| 429 | `RateLimited` (new) | -| 507 | `QuotaExceeded` (new) | -| other | `Http { status, url, body }` (existing) | - -The 404 distinction is made at the call site: bucket-level methods (`get`, `delete`, `update_settings`) use `RepoNotFound`; file-level methods (`get_paths_info`, `resolve_file`) use `EntryNotFound`. - ---- - -## Blocking API - -All non-streaming `HFBucket` methods and `HFClient::create_bucket` get sync wrappers via the existing `sync_api!` macro. Streaming methods (`list_buckets`, `list_tree`) use `sync_api_stream!`, which wraps the async stream in a blocking iterator. - -`HFClientSync::bucket()` returns `HFBucketSync`. - ---- - -## Testing - -**Unit tests** in `#[cfg(test)]` within `api/buckets.rs`: -- URL construction for each endpoint (including the path-suffix logic in `list_tree`) -- `resolve_file` header parsing (Location, X-Linked-Size, X-XET-Hash, Link) -- `batch_files` NDJSON ordering (adds before deletes) - -**Integration tests** in `tests/integration_test.rs`, following existing patterns: -- Skip if `HF_TOKEN` absent -- Write operations (`create_bucket`, `batch_files`, `delete`) behind `HF_TEST_WRITE=1` -- Tests create and tear down their own bucket — no dependency on pre-existing test fixtures -- One integration test per public method; `list_buckets` and `list_tree` collect the stream into a `Vec` and assert on shape/contents - ---- - -## Open Items - -None. 
From e36b9872d670fdb097fd82d72a3ee8bba524c5da Mon Sep 17 00:00:00 2001 From: Joseph Godlewski Date: Thu, 9 Apr 2026 10:54:20 -0700 Subject: [PATCH 4/5] fix: refactoring and adding comments --- huggingface_hub/src/api/buckets.rs | 80 ++++++++++----- huggingface_hub/src/blocking.rs | 58 +---------- huggingface_hub/src/buckets.rs | 40 ++++++++ huggingface_hub/src/lib.rs | 2 + huggingface_hub/src/repository.rs | 21 ---- huggingface_hub/src/types/buckets.rs | 116 +++++++++++++++++++--- huggingface_hub/tests/integration_test.rs | 29 +++--- 7 files changed, 216 insertions(+), 130 deletions(-) create mode 100644 huggingface_hub/src/buckets.rs diff --git a/huggingface_hub/src/api/buckets.rs b/huggingface_hub/src/api/buckets.rs index ea2bcfd..ec5877c 100644 --- a/huggingface_hub/src/api/buckets.rs +++ b/huggingface_hub/src/api/buckets.rs @@ -3,12 +3,12 @@ use std::collections::VecDeque; use futures::Stream; use url::Url; +use crate::buckets::HFBucket; use crate::error::{HFError, NotFoundContext}; use crate::pagination::parse_link_header_next; -use crate::repository::HFBucket; use crate::types::{ - BatchOp, BatchResult, BucketCreated, BucketOverview, CreateBucketParams, ListTreeParams, PathInfo, ResolvedFile, - TreeEntry, UpdateBucketParams, XetToken, + BatchOp, BatchResult, BucketCreated, BucketOverview, CreateBucketParams, ListTreeParams, ResolvedFile, TreeEntry, + UpdateBucketParams, XetToken, }; use crate::{HFClient, Result}; @@ -54,7 +54,7 @@ impl HFBucket { } /// Returns metadata about this bucket. - pub async fn get(&self) -> Result { + pub async fn info(&self) -> Result { let resp = self .client .inner @@ -67,20 +67,6 @@ impl HFBucket { Ok(resp.json().await?) } - /// Permanently deletes this bucket and all its files. 
- pub async fn delete(&self) -> Result<()> { - let resp = self - .client - .inner - .client - .delete(self.bucket_url()) - .headers(self.client.auth_headers()) - .send() - .await?; - check_bucket_response(resp, &self.repo_id(), NotFoundContext::Repo).await?; - Ok(()) - } - /// Updates visibility or CDN configuration for this bucket. pub async fn update_settings(&self, params: UpdateBucketParams) -> Result<()> { let resp = self @@ -183,7 +169,9 @@ impl HFBucket { } /// Returns metadata for a batch of file paths. - pub async fn get_paths_info(&self, paths: Vec) -> Result> { + /// + /// Paths that do not exist in the bucket are silently omitted from the result. + pub async fn get_paths_info(&self, paths: Vec) -> Result> { #[derive(serde::Serialize)] struct Body { paths: Vec, @@ -332,6 +320,15 @@ impl HFBucket { } impl HFClient { + /// Permanently deletes a bucket and all of its files. + pub async fn delete_bucket(&self, namespace: &str, repo: &str) -> Result<()> { + let url = format!("{}/api/buckets/{}/{}", self.inner.endpoint, namespace, repo); + let repo_id = format!("{}/{}", namespace, repo); + let resp = self.inner.client.delete(&url).headers(self.auth_headers()).send().await?; + check_bucket_response(resp, &repo_id, NotFoundContext::Repo).await?; + Ok(()) + } + /// Creates a new bucket owned by `namespace`. pub async fn create_bucket( &self, @@ -355,10 +352,47 @@ impl HFClient { /// Returns a paginated stream of all buckets owned by `namespace`. /// Pagination is driven by `Link` response headers. - pub fn list_buckets(&self, namespace: &str) -> impl Stream> + '_ { - let url = Url::parse(&format!("{}/api/buckets/{}", self.inner.endpoint, namespace)) - .expect("endpoint is a valid base URL"); - self.paginate(url, vec![], None) + pub fn list_buckets(&self, namespace: &str) -> Result> + '_> { + let url = Url::parse(&format!("{}/api/buckets/{}", self.inner.endpoint, namespace))?; + Ok(self.paginate(url, vec![], None)) + } +} + +sync_api! 
{ + impl HFBucket -> HFBucketSync { + fn info(&self) -> Result; + fn update_settings(&self, params: UpdateBucketParams) -> Result<()>; + fn batch_files(&self, ops: Vec) -> Result; + fn get_paths_info(&self, paths: Vec) -> Result>; + fn get_xet_write_token(&self) -> Result; + fn get_xet_read_token(&self) -> Result; + fn resolve_file(&self, path: &str) -> Result; + } +} + +sync_api_stream! { + impl HFBucket -> HFBucketSync { + fn list_tree(&self, path: &str, params: ListTreeParams) -> TreeEntry; + } +} + +sync_api! { + #[cfg(feature = "xet")] + impl HFBucket -> HFBucketSync { + fn xet_resolve_file(&self, path: &str) -> Result; + } +} + +sync_api! { + impl HFClient -> HFClientSync { + fn delete_bucket(&self, namespace: &str, repo: &str) -> Result<()>; + fn create_bucket(&self, namespace: &str, repo: &str, params: CreateBucketParams) -> Result; + } +} + +sync_api_stream! { + impl HFClient -> HFClientSync { + fn list_buckets(&self, namespace: &str) -> BucketOverview; } } diff --git a/huggingface_hub/src/blocking.rs b/huggingface_hub/src/blocking.rs index 31d2f8e..a2c62ff 100644 --- a/huggingface_hub/src/blocking.rs +++ b/huggingface_hub/src/blocking.rs @@ -53,7 +53,7 @@ pub struct HFSpaceSync { /// Obtain via [`HFClientSync::bucket`]. All methods block the current thread. 
#[derive(Clone)] pub struct HFBucketSync { - pub(crate) inner: crate::repository::HFBucket, + pub(crate) inner: crate::buckets::HFBucket, pub(crate) runtime: Arc, } @@ -137,19 +137,6 @@ impl HFClientSync { runtime: self.runtime.clone(), } } - - pub fn create_bucket( - &self, - namespace: &str, - repo: &str, - params: crate::types::CreateBucketParams, - ) -> Result { - self.runtime.block_on(self.inner.create_bucket(namespace, repo, params)) - } - - pub fn list_buckets(&self, namespace: &str) -> Result> { - collect_stream(self.runtime.as_ref(), self.inner.list_buckets(namespace)) - } } impl Deref for HFClientSync { @@ -183,49 +170,6 @@ impl Deref for HFRepositorySync { } } -impl HFBucketSync { - pub fn get(&self) -> Result { - self.runtime.block_on(self.inner.get()) - } - - pub fn delete(&self) -> Result<()> { - self.runtime.block_on(self.inner.delete()) - } - - pub fn update_settings(&self, params: crate::types::UpdateBucketParams) -> Result<()> { - self.runtime.block_on(self.inner.update_settings(params)) - } - - pub fn batch_files(&self, ops: Vec) -> Result { - self.runtime.block_on(self.inner.batch_files(ops)) - } - - pub fn list_tree(&self, path: &str, params: crate::types::ListTreeParams) -> Result> { - collect_stream(self.runtime.as_ref(), self.inner.list_tree(path, params)?) - } - - pub fn get_paths_info(&self, paths: Vec) -> Result> { - self.runtime.block_on(self.inner.get_paths_info(paths)) - } - - pub fn get_xet_write_token(&self) -> Result { - self.runtime.block_on(self.inner.get_xet_write_token()) - } - - pub fn get_xet_read_token(&self) -> Result { - self.runtime.block_on(self.inner.get_xet_read_token()) - } - - pub fn resolve_file(&self, path: &str) -> Result { - self.runtime.block_on(self.inner.resolve_file(path)) - } - - #[cfg(feature = "xet")] - pub fn xet_resolve_file(&self, path: &str) -> Result { - self.runtime.block_on(self.inner.xet_resolve_file(path)) - } -} - impl HFSpaceSync { /// Creates a blocking space handle for the given owner and name. 
pub fn new(client: HFClientSync, owner: impl Into, name: impl Into) -> Self { diff --git a/huggingface_hub/src/buckets.rs b/huggingface_hub/src/buckets.rs new file mode 100644 index 0000000..f4597d2 --- /dev/null +++ b/huggingface_hub/src/buckets.rs @@ -0,0 +1,40 @@ +use crate::HFClient; + +/// Handle for operations on a single HuggingFace Storage Bucket. +/// +/// Obtain via [`HFClient::bucket`]. Every method adds `Authorization: Bearer ` +/// using the token configured on the client. +/// +/// # Example +/// +/// ```rust,no_run +/// # #[tokio::main] +/// # async fn main() -> huggingface_hub::Result<()> { +/// let client = huggingface_hub::HFClient::new()?; +/// let bucket = client.bucket("my-org", "my-bucket"); +/// let overview = bucket.info().await?; +/// println!("Bucket size: {} bytes", overview.size); +/// # Ok(()) +/// # } +/// ``` +#[derive(Clone)] +pub struct HFBucket { + pub(crate) client: HFClient, + /// The namespace (user or organization) that owns the bucket. + pub namespace: String, + /// The bucket name within the namespace. + pub repo: String, +} + +impl HFClient { + /// Creates a handle for operations on a single Storage Bucket. + /// + /// No I/O is performed — this simply binds the namespace and name to the client. 
+ pub fn bucket(&self, namespace: impl Into, repo: impl Into) -> HFBucket { + HFBucket { + client: self.clone(), + namespace: namespace.into(), + repo: repo.into(), + } + } +} diff --git a/huggingface_hub/src/lib.rs b/huggingface_hub/src/lib.rs index da8d650..2710207 100644 --- a/huggingface_hub/src/lib.rs +++ b/huggingface_hub/src/lib.rs @@ -21,6 +21,7 @@ mod macros; pub mod api; #[cfg(feature = "blocking")] pub mod blocking; +pub mod buckets; pub mod cache; pub mod client; pub(crate) mod constants; @@ -34,6 +35,7 @@ pub mod xet; #[cfg(feature = "blocking")] pub use blocking::{HFBucketSync, HFClientSync, HFRepoSync, HFRepositorySync, HFSpaceSync}; +pub use buckets::HFBucket; pub use client::{HFClient, HFClientBuilder}; #[cfg(feature = "cli")] #[doc(hidden)] diff --git a/huggingface_hub/src/repository.rs b/huggingface_hub/src/repository.rs index 1a79fb8..f573d19 100644 --- a/huggingface_hub/src/repository.rs +++ b/huggingface_hub/src/repository.rs @@ -462,33 +462,12 @@ pub struct SpaceVariableDeleteParams { pub key: String, } -/// Handle for operations on a single HuggingFace Storage Bucket. -/// -/// Obtain via [`HFClient::bucket`]. Every method adds `Authorization: Bearer ` -/// using the token configured on the client. -#[derive(Clone)] -pub struct HFBucket { - pub(crate) client: crate::HFClient, - pub namespace: String, - pub repo: String, -} - impl HFClient { /// Create an [`HFRepository`] handle for any repo type. pub fn repo(&self, repo_type: RepoType, owner: impl Into, name: impl Into) -> HFRepository { HFRepository::new(self.clone(), repo_type, owner, name) } - /// Creates a handle for operations on a single Storage Bucket. - /// No I/O is performed. - pub fn bucket(&self, namespace: impl Into, repo: impl Into) -> crate::repository::HFBucket { - crate::repository::HFBucket { - client: self.clone(), - namespace: namespace.into(), - repo: repo.into(), - } - } - /// Create an [`HFRepository`] handle for a model repository. 
pub fn model(&self, owner: impl Into, name: impl Into) -> HFRepository { self.repo(RepoType::Model, owner, name) diff --git a/huggingface_hub/src/types/buckets.rs b/huggingface_hub/src/types/buckets.rs index 4506176..6072c01 100644 --- a/huggingface_hub/src/types/buckets.rs +++ b/huggingface_hub/src/types/buckets.rs @@ -3,194 +3,278 @@ use typed_builder::TypedBuilder; // --- Parameter types --- +/// Parameters for [`HFClient::create_bucket`]. #[derive(Debug, Clone, TypedBuilder, Serialize)] pub struct CreateBucketParams { + /// Whether the bucket should be private. Defaults to public when omitted. #[builder(default, setter(strip_option))] #[serde(skip_serializing_if = "Option::is_none")] pub private: Option, + /// Resource group to assign the bucket to. #[builder(default, setter(strip_option, into))] #[serde(rename = "resourceGroupId", skip_serializing_if = "Option::is_none")] pub resource_group_id: Option, + /// CDN regions to enable for this bucket at creation time. #[builder(default)] #[serde(skip_serializing_if = "Vec::is_empty")] pub cdn: Vec, } +/// Parameters for [`HFBucket::update_settings`]. #[derive(Debug, Clone, TypedBuilder, Serialize)] pub struct UpdateBucketParams { + /// Change the bucket's visibility. Pass `true` to make it private, `false` for public. #[builder(default, setter(strip_option))] #[serde(skip_serializing_if = "Option::is_none")] pub private: Option, + /// Replace the full set of CDN regions. Pass an empty vec to remove all CDN regions. #[builder(default, setter(strip_option))] #[serde(rename = "cdnRegions", skip_serializing_if = "Option::is_none")] pub cdn_regions: Option>, } +/// Parameters for [`HFBucket::list_tree`]. #[derive(Debug, Clone, TypedBuilder)] pub struct ListTreeParams { + /// Maximum number of entries to return per page. The server default is 1000; maximum is 10 000. #[builder(default, setter(strip_option))] pub limit: Option, + /// When `true`, return all entries under the prefix recursively. 
+ /// When `false` (the default), only top-level entries are returned and sub-directories + /// are collapsed into a single [`EntryType::Directory`] entry. #[builder(default)] pub recursive: bool, } // --- Response types --- +/// Returned by [`HFClient::create_bucket`] on success. #[derive(Debug, Clone, Deserialize)] pub struct BucketCreated { + /// Full URL of the newly created bucket (e.g. `https://huggingface.co/buckets/my-org/my-bucket`). pub url: String, + /// Bucket identifier in `namespace/name` format. pub name: String, + /// Opaque server-side ID for the bucket. pub id: String, } +/// A CDN region specifying a cloud provider and geographic region. +/// +/// Used in [`CreateBucketParams`], [`UpdateBucketParams`], and [`BucketOverview`]. #[derive(Debug, Clone, Serialize, Deserialize)] pub struct CdnRegion { + /// Cloud provider (e.g. `"gcp"` or `"aws"`). pub provider: String, + /// Geographic region identifier (e.g. `"us"` or `"eu"`). pub region: String, } -#[derive(Debug, Clone, Deserialize)] +/// Metadata about a Storage Bucket, as returned by [`HFBucket::info`] and [`HFClient::list_buckets`]. +#[derive(Debug, Clone, Serialize, Deserialize)] pub struct BucketOverview { + /// Internal MongoDB document ID. #[serde(rename = "_id")] pub mongo_id: String, + /// Bucket identifier in `namespace/name` format. pub id: String, + /// Namespace (user or organization) that owns the bucket. pub author: String, + /// Whether the bucket is private. `None` means the server did not specify. pub private: Option, + /// Repository type tag — always `"bucket"`. #[serde(rename = "repoType")] pub repo_type: String, + /// ISO 8601 creation timestamp. #[serde(rename = "createdAt")] pub created_at: String, + /// ISO 8601 last-updated timestamp. #[serde(rename = "updatedAt")] pub updated_at: String, + /// Total storage used by the bucket, in bytes. pub size: u64, + /// Number of files currently stored in the bucket. 
#[serde(rename = "totalFiles")] pub total_files: u64, + /// CDN regions configured for this bucket. #[serde(rename = "cdnRegions")] pub cdn_regions: Vec, + /// Resource group this bucket belongs to, if any. #[serde(rename = "resourceGroup")] pub resource_group: Option, } -#[derive(Debug, Clone, Deserialize)] +/// A resource group that a bucket can be associated with. +#[derive(Debug, Clone, Serialize, Deserialize)] pub struct ResourceGroup { + /// Opaque resource group ID. pub id: String, + /// Human-readable resource group name. pub name: String, + /// Number of members in the resource group, if returned by the server. #[serde(rename = "numUsers")] pub num_users: Option, } +/// A short-lived token for authenticating directly against the Xet CAS (content-addressable storage). +/// +/// Returned by [`HFBucket::get_xet_write_token`] and [`HFBucket::get_xet_read_token`]. #[derive(Debug, Clone, Deserialize)] pub struct XetToken { + /// Bearer token to include in requests to the Xet CAS. #[serde(rename = "accessToken")] pub access_token: String, + /// Base URL of the Xet CAS server. #[serde(rename = "casUrl")] pub cas_url: String, - /// Epoch time (s) + /// Token expiry as a Unix epoch timestamp (seconds), following the standard JWT `exp` convention. #[serde(rename = "exp")] pub expires_at: u64, } -#[derive(Debug, Clone, Deserialize)] -pub struct PathInfo { - pub path: String, - pub size: u64, - #[serde(rename = "xetHash")] - pub xet_hash: String, - #[serde(rename = "contentType")] - pub content_type: String, - pub mtime: i64, -} - +/// A single entry returned by [`HFBucket::list_tree`] and [`HFBucket::get_paths_info`]. +/// +/// Can represent either a file or a directory. File-only fields (`size`, `xet_hash`, +/// `content_type`, `mtime`) are `None` for directory entries. #[derive(Debug, Clone, Deserialize)] pub struct TreeEntry { + /// Whether this entry is a file or a directory. 
#[serde(rename = "type")] pub entry_type: EntryType, + /// Path of the entry relative to the bucket root. pub path: String, - /// ISO 8601 Datetime + /// ISO 8601 timestamp of when this entry was added to the bucket. #[serde(rename = "uploadedAt")] pub uploaded_at: String, - /// ISO 8601 Datetime + /// Original file modification time as an ISO 8601 timestamp, if preserved at upload. pub mtime: Option, + /// File size in bytes. `None` for directory entries. pub size: Option, + /// Content-addressable Xet hash of the file. `None` for directory entries. #[serde(rename = "xetHash")] pub xet_hash: Option, + /// MIME content type of the file. `None` for directory entries. #[serde(rename = "contentType")] pub content_type: Option, } +/// Whether a [`TreeEntry`] is a file or a directory. #[derive(Debug, Clone, Deserialize)] #[serde(rename_all = "lowercase")] pub enum EntryType { + /// A regular file stored in the bucket. File, + /// A virtual directory prefix (only appears when `recursive` is `false`). Directory, } // --- Batch types --- +/// A single operation in a [`HFBucket::batch_files`] call. +/// +/// All [`BatchOp::AddFile`] operations must precede all [`BatchOp::DeleteFile`] operations — +/// the client enforces this automatically. #[derive(Debug, Clone, Serialize)] #[serde(tag = "type")] pub enum BatchOp { + /// Add or overwrite a file at the given path. + /// + /// The file contents must already be present in the Xet CAS; obtain a write token + /// via [`HFBucket::get_xet_write_token`] before uploading. #[serde(rename = "addFile")] AddFile(AddFileOp), + /// Remove a file from the bucket by path. #[serde(rename = "deleteFile")] DeleteFile(DeleteFileOp), } +/// Payload for a [`BatchOp::AddFile`] operation. #[derive(Debug, Clone, Serialize)] pub struct AddFileOp { + /// Destination path within the bucket. pub path: String, + /// Content-addressable Xet hash of the file, obtained after uploading to the CAS. 
#[serde(rename = "xetHash")] pub xet_hash: String, + /// MIME content type of the file (e.g. `"application/octet-stream"`). #[serde(rename = "contentType")] pub content_type: String, + /// Original file modification time as a Unix timestamp in milliseconds, if known. #[serde(skip_serializing_if = "Option::is_none")] pub mtime: Option, } +/// Payload for a [`BatchOp::DeleteFile`] operation. #[derive(Debug, Clone, Serialize)] pub struct DeleteFileOp { + /// Path of the file to remove from the bucket. pub path: String, } +/// Result of a [`HFBucket::batch_files`] call. #[derive(Debug, Clone, Deserialize)] pub struct BatchResult { + /// `true` if every operation in the batch succeeded. pub success: bool, + /// Total number of operations attempted. pub processed: u32, + /// Number of operations that completed successfully. pub succeeded: u32, + /// Details of any operations that failed. pub failed: Vec, } +/// A single failed operation within a [`BatchResult`]. #[derive(Debug, Clone, Deserialize)] pub struct BatchFailure { + /// Path of the file whose operation failed. pub path: String, + /// Server-provided error message. pub error: String, } // --- resolve_file types --- +/// A resolved direct download URL for a bucket file, returned by [`HFBucket::resolve_file`]. #[derive(Debug, Clone)] pub struct ResolvedFile { + /// Direct download URL (the `Location` from the server's 302 redirect). pub url: String, + /// File size in bytes, if provided by the server. pub size: Option, + /// Content-addressable Xet hash of the file, if provided. pub xet_hash: Option, + /// ETag of the file, if provided. pub etag: Option, + /// `Last-Modified` header value, if provided. pub last_modified: Option, + /// URL to obtain a fresh Xet read token for this file, if provided. pub xet_auth_url: Option, + /// URL pointing to the Xet CAS reconstruction manifest for this file, if provided. 
pub xet_reconstruction_url: Option, } // --- xet_resolve_file type (feature = "xet") --- +/// Xet reconstruction metadata for a bucket file, returned by [`HFBucket::xet_resolve_file`]. +/// +/// Only available with the `xet` feature enabled. #[cfg(feature = "xet")] #[derive(Debug, Clone, Deserialize)] pub struct XetFileInfo { + /// Content-addressable Xet hash of the file. pub hash: String, + /// URL to obtain a fresh Xet read token. #[serde(rename = "refreshUrl")] pub refresh_url: String, + /// URL pointing to the Xet CAS reconstruction manifest. #[serde(rename = "reconstructionUrl")] pub reconstruction_url: String, + /// ETag of the file. pub etag: String, + /// File size in bytes. pub size: u64, + /// MIME content type of the file. #[serde(rename = "contentType")] pub content_type: String, } diff --git a/huggingface_hub/tests/integration_test.rs b/huggingface_hub/tests/integration_test.rs index a4c8575..5cc72ae 100644 --- a/huggingface_hub/tests/integration_test.rs +++ b/huggingface_hub/tests/integration_test.rs @@ -902,13 +902,16 @@ fn test_bucket_name() -> String { async fn test_list_buckets() { let Some(api) = api() else { return }; let username = cached_username().await; - let buckets: Vec<_> = api - .list_buckets(username) - .collect::>() - .await - .into_iter() - .collect::>>() - .expect("list_buckets failed"); + let stream = api.list_buckets(username).expect("list_buckets failed"); + let buckets: Vec<_> = { + use futures::StreamExt; + futures::pin_mut!(stream); + let mut items = Vec::new(); + while let Some(item) = stream.next().await { + items.push(item.expect("list_buckets item failed")); + } + items + }; let _ = buckets; } @@ -928,7 +931,7 @@ async fn test_create_and_delete_bucket() { assert!(created.name.contains(&name)); let bucket = api.bucket(username, &name); - let info = bucket.get().await.expect("get failed"); + let info = bucket.info().await.expect("info failed"); assert_eq!(info.id, format!("{username}/{name}")); 
assert!(info.private.unwrap()); @@ -937,12 +940,12 @@ async fn test_create_and_delete_bucket() { .await .expect("update_settings failed"); - let info = bucket.get().await.unwrap(); + let info = bucket.info().await.unwrap(); assert!(!info.private.unwrap()); - bucket.delete().await.expect("delete failed"); + api.delete_bucket(username, &name).await.expect("delete_bucket failed"); - assert!(matches!(bucket.get().await, Err(huggingface_hub::HFError::RepoNotFound { .. }))); + assert!(matches!(bucket.info().await, Err(huggingface_hub::HFError::RepoNotFound { .. }))); } #[tokio::test] @@ -971,7 +974,7 @@ async fn test_bucket_list_tree_empty() { assert!(entries.is_empty(), "new bucket should have no files"); - bucket.delete().await.unwrap(); + api.delete_bucket(username, &name).await.unwrap(); } #[tokio::test] @@ -996,5 +999,5 @@ async fn test_get_xet_write_and_read_token() { let read_tok = bucket.get_xet_read_token().await.expect("xet read token failed"); assert!(!read_tok.access_token.is_empty()); - bucket.delete().await.unwrap(); + api.delete_bucket(username, &name).await.unwrap(); } From f8beef436f033441b4e079fd8b3cb468e4e57cdf Mon Sep 17 00:00:00 2001 From: Joseph Godlewski Date: Thu, 9 Apr 2026 14:42:05 -0700 Subject: [PATCH 5/5] fix: rename repo->bucket --- huggingface_hub/src/api/buckets.rs | 130 +++++++++++----------- huggingface_hub/src/bin/hfrs/main.rs | 3 + huggingface_hub/src/buckets.rs | 4 +- huggingface_hub/src/client.rs | 2 +- huggingface_hub/src/error.rs | 5 + huggingface_hub/tests/integration_test.rs | 17 ++- 6 files changed, 90 insertions(+), 71 deletions(-) diff --git a/huggingface_hub/src/api/buckets.rs b/huggingface_hub/src/api/buckets.rs index ec5877c..ed722a8 100644 --- a/huggingface_hub/src/api/buckets.rs +++ b/huggingface_hub/src/api/buckets.rs @@ -1,6 +1,7 @@ use std::collections::VecDeque; use futures::Stream; +use reqwest::header::CONTENT_TYPE; use url::Url; use crate::buckets::HFBucket; @@ -13,10 +14,10 @@ use crate::types::{ use 
crate::{HFClient, Result}; /// Maps HTTP status codes to `HFError` variants for bucket API responses. -/// Bucket-level 404s map to `RepoNotFound`; file-level 404s map to `EntryNotFound`. +/// Bucket-level 404s map to `BucketNotFound`; file-level 404s map to `EntryNotFound`. pub(crate) async fn check_bucket_response( response: reqwest::Response, - repo_id: &str, + bucket_name: &str, not_found_ctx: NotFoundContext, ) -> Result { if response.status().is_success() { @@ -29,12 +30,12 @@ pub(crate) async fn check_bucket_response( 401 => HFError::AuthRequired, 403 => HFError::Forbidden, 404 => match not_found_ctx { - NotFoundContext::Repo => HFError::RepoNotFound { - repo_id: repo_id.to_string(), + NotFoundContext::Bucket => HFError::BucketNotFound { + bucket_name: bucket_name.to_string(), }, NotFoundContext::Entry { path } => HFError::EntryNotFound { path, - repo_id: repo_id.to_string(), + repo_id: bucket_name.to_string(), }, _ => HFError::Http { status, url, body }, }, @@ -45,12 +46,12 @@ pub(crate) async fn check_bucket_response( } impl HFBucket { - fn repo_id(&self) -> String { - format!("{}/{}", self.namespace, self.repo) + fn bucket_name(&self) -> String { + format!("{}/{}", self.namespace, self.bucket) } fn bucket_url(&self) -> String { - format!("{}/api/buckets/{}/{}", self.client.inner.endpoint, self.namespace, self.repo) + format!("{}/api/buckets/{}/{}", self.client.inner.endpoint, self.namespace, self.bucket) } /// Returns metadata about this bucket. @@ -63,22 +64,22 @@ impl HFBucket { .headers(self.client.auth_headers()) .send() .await?; - let resp = check_bucket_response(resp, &self.repo_id(), NotFoundContext::Repo).await?; + let resp = check_bucket_response(resp, &self.bucket_name(), NotFoundContext::Bucket).await?; Ok(resp.json().await?) } /// Updates visibility or CDN configuration for this bucket. 
- pub async fn update_settings(&self, params: UpdateBucketParams) -> Result<()> { + pub async fn update_settings(&self, params: &UpdateBucketParams) -> Result<()> { let resp = self .client .inner .client .put(format!("{}/settings", self.bucket_url())) .headers(self.client.auth_headers()) - .json(¶ms) + .json(params) .send() .await?; - check_bucket_response(resp, &self.repo_id(), NotFoundContext::Repo).await?; + check_bucket_response(resp, &self.bucket_name(), NotFoundContext::Bucket).await?; Ok(()) } @@ -86,8 +87,8 @@ impl HFBucket { /// /// All `AddFile` operations are sent before `DeleteFile` operations, as required /// by the batch protocol. The input order within each group is preserved. - pub async fn batch_files(&self, ops: Vec) -> Result { - let (adds, deletes): (Vec<_>, Vec<_>) = ops.into_iter().partition(|op| matches!(op, BatchOp::AddFile(_))); + pub async fn batch_files(&self, ops: &[BatchOp]) -> Result { + let (adds, deletes): (Vec<_>, Vec<_>) = ops.iter().partition(|op| matches!(op, BatchOp::AddFile(_))); let ndjson = adds .iter() @@ -101,12 +102,12 @@ impl HFBucket { .client .post(format!("{}/batch", self.bucket_url())) .headers(self.client.auth_headers()) - .header("content-type", "application/x-ndjson") + .header(CONTENT_TYPE, "application/x-ndjson") .body(ndjson) .send() .await?; - let resp = check_bucket_response(resp, &self.repo_id(), NotFoundContext::Repo).await?; + let resp = check_bucket_response(resp, &self.bucket_name(), NotFoundContext::Bucket).await?; Ok(resp.json().await?) } @@ -115,13 +116,13 @@ impl HFBucket { /// Uses cursor-in-body pagination: the stream fetches the next page automatically /// when the current page's entries are exhausted. No request is made until the /// first item is polled. 
- pub fn list_tree(&self, path: &str, params: ListTreeParams) -> Result> + '_> { + pub fn list_tree(&self, path: &str, params: &ListTreeParams) -> Result> + '_> { let base_url = if path.is_empty() { - format!("{}/api/buckets/{}/{}/tree", self.client.inner.endpoint, self.namespace, self.repo) + format!("{}/api/buckets/{}/{}/tree", self.client.inner.endpoint, self.namespace, self.bucket) } else { - format!("{}/api/buckets/{}/{}/tree/{}", self.client.inner.endpoint, self.namespace, self.repo, path) + format!("{}/api/buckets/{}/{}/tree/{}", self.client.inner.endpoint, self.namespace, self.bucket, path) }; - let repo_id = self.repo_id(); + let bucket_name = self.bucket_name(); let mut initial_url = Url::parse(&base_url)?; { let mut qp = initial_url.query_pairs_mut(); @@ -138,7 +139,7 @@ impl HFBucket { (VecDeque::::new(), Some(initial_url), false), move |(mut pending, next_url, fetched)| { let client = self.client.clone(); - let repo_id = repo_id.clone(); + let bucket_name = bucket_name.clone(); async move { if let Some(entry) = pending.pop_front() { return Ok(Some((entry, (pending, next_url, fetched)))); @@ -153,7 +154,7 @@ impl HFBucket { }; let req = client.inner.client.get(url).headers(client.auth_headers()); let resp = req.send().await?; - let resp = check_bucket_response(resp, &repo_id, NotFoundContext::Repo).await?; + let resp = check_bucket_response(resp, &bucket_name, NotFoundContext::Bucket).await?; let next_cursor = parse_link_header_next(resp.headers()); let entries: Vec = resp.json().await?; @@ -171,10 +172,10 @@ impl HFBucket { /// Returns metadata for a batch of file paths. /// /// Paths that do not exist in the bucket are silently omitted from the result. 
- pub async fn get_paths_info(&self, paths: Vec) -> Result> { + pub async fn get_paths_info(&self, paths: &[String]) -> Result> { #[derive(serde::Serialize)] - struct Body { - paths: Vec, + struct Body<'a> { + paths: &'a [String], } let resp = self @@ -187,7 +188,7 @@ impl HFBucket { .send() .await?; - let resp = check_bucket_response(resp, &self.repo_id(), NotFoundContext::Entry { path: String::new() }).await?; + let resp = check_bucket_response(resp, &self.bucket_name(), NotFoundContext::Bucket).await?; Ok(resp.json().await?) } @@ -202,7 +203,7 @@ impl HFBucket { .headers(self.client.auth_headers()) .send() .await?; - let resp = check_bucket_response(resp, &self.repo_id(), NotFoundContext::Repo).await?; + let resp = check_bucket_response(resp, &self.bucket_name(), NotFoundContext::Bucket).await?; Ok(resp.json().await?) } @@ -216,7 +217,7 @@ impl HFBucket { .headers(self.client.auth_headers()) .send() .await?; - let resp = check_bucket_response(resp, &self.repo_id(), NotFoundContext::Repo).await?; + let resp = check_bucket_response(resp, &self.bucket_name(), NotFoundContext::Bucket).await?; Ok(resp.json().await?) } @@ -226,7 +227,7 @@ impl HFBucket { /// following it. Metadata is extracted from response headers: /// `X-Linked-Size`, `X-XET-Hash`, `X-Linked-ETag`, `Last-Modified`, and `Link`. pub async fn resolve_file(&self, path: &str) -> Result { - let url = format!("{}/buckets/{}/{}/resolve/{}", self.client.inner.endpoint, self.namespace, self.repo, path); + let url = format!("{}/buckets/{}/{}/resolve/{}", self.client.inner.endpoint, self.namespace, self.bucket, path); let resp = self .client .inner @@ -239,7 +240,7 @@ impl HFBucket { if !resp.status().is_redirection() { return Err(check_bucket_response( resp, - &self.repo_id(), + &self.bucket_name(), NotFoundContext::Entry { path: path.to_string() }, ) .await @@ -303,7 +304,7 @@ impl HFBucket { /// from the Xet CAS directly. 
#[cfg(feature = "xet")] pub async fn xet_resolve_file(&self, path: &str) -> Result { - let url = format!("{}/buckets/{}/{}/resolve/{}", self.client.inner.endpoint, self.namespace, self.repo, path); + let url = format!("{}/buckets/{}/{}/resolve/{}", self.client.inner.endpoint, self.namespace, self.bucket, path); let resp = self .client .inner @@ -314,18 +315,18 @@ impl HFBucket { .send() .await?; let resp = - check_bucket_response(resp, &self.repo_id(), NotFoundContext::Entry { path: path.to_string() }).await?; + check_bucket_response(resp, &self.bucket_name(), NotFoundContext::Entry { path: path.to_string() }).await?; Ok(resp.json().await?) } } impl HFClient { /// Permanently deletes a bucket and all of its files. - pub async fn delete_bucket(&self, namespace: &str, repo: &str) -> Result<()> { - let url = format!("{}/api/buckets/{}/{}", self.inner.endpoint, namespace, repo); - let repo_id = format!("{}/{}", namespace, repo); + pub async fn delete_bucket(&self, namespace: &str, bucket: &str) -> Result<()> { + let url = format!("{}/api/buckets/{}/{}", self.inner.endpoint, namespace, bucket); + let bucket_id = format!("{}/{}", namespace, bucket); let resp = self.inner.client.delete(&url).headers(self.auth_headers()).send().await?; - check_bucket_response(resp, &repo_id, NotFoundContext::Repo).await?; + check_bucket_response(resp, &bucket_id, NotFoundContext::Bucket).await?; Ok(()) } @@ -333,20 +334,20 @@ impl HFClient { pub async fn create_bucket( &self, namespace: &str, - repo: &str, - params: CreateBucketParams, + bucket: &str, + params: &CreateBucketParams, ) -> Result { - let url = format!("{}/api/buckets/{}/{}", self.inner.endpoint, namespace, repo); + let url = format!("{}/api/buckets/{}/{}", self.inner.endpoint, namespace, bucket); let resp = self .inner .client .post(&url) .headers(self.auth_headers()) - .json(¶ms) + .json(params) .send() .await?; - let repo_id = format!("{}/{}", namespace, repo); - let resp = check_bucket_response(resp, &repo_id, 
NotFoundContext::Repo).await?; + let bucket_id = format!("{}/{}", namespace, bucket); + let resp = check_bucket_response(resp, &bucket_id, NotFoundContext::Bucket).await?; Ok(resp.json().await?) } @@ -361,9 +362,9 @@ impl HFClient { sync_api! { impl HFBucket -> HFBucketSync { fn info(&self) -> Result; - fn update_settings(&self, params: UpdateBucketParams) -> Result<()>; - fn batch_files(&self, ops: Vec) -> Result; - fn get_paths_info(&self, paths: Vec) -> Result>; + fn update_settings(&self, params: &UpdateBucketParams) -> Result<()>; + fn batch_files(&self, ops: &[BatchOp]) -> Result; + fn get_paths_info(&self, paths: &[String]) -> Result>; fn get_xet_write_token(&self) -> Result; fn get_xet_read_token(&self) -> Result; fn resolve_file(&self, path: &str) -> Result; @@ -372,7 +373,7 @@ sync_api! { sync_api_stream! { impl HFBucket -> HFBucketSync { - fn list_tree(&self, path: &str, params: ListTreeParams) -> TreeEntry; + fn list_tree(&self, path: &str, params: &ListTreeParams) -> TreeEntry; } } @@ -385,8 +386,8 @@ sync_api! { sync_api! 
{ impl HFClient -> HFClientSync { - fn delete_bucket(&self, namespace: &str, repo: &str) -> Result<()>; - fn create_bucket(&self, namespace: &str, repo: &str, params: CreateBucketParams) -> Result; + fn delete_bucket(&self, namespace: &str, bucket: &str) -> Result<()>; + fn create_bucket(&self, namespace: &str, bucket: &str, params: &CreateBucketParams) -> Result; } } @@ -401,18 +402,18 @@ mod tests { use crate::HFClientBuilder; #[test] - fn bucket_constructor_sets_namespace_and_repo() { + fn bucket_constructor_sets_namespace_and_bucket() { let client = HFClientBuilder::new().build().unwrap(); let bucket = client.bucket("myuser", "my-bucket"); assert_eq!(bucket.namespace, "myuser"); - assert_eq!(bucket.repo, "my-bucket"); + assert_eq!(bucket.bucket, "my-bucket"); } #[test] fn get_bucket_url() { let client = HFClientBuilder::new().build().unwrap(); let bucket = client.bucket("myuser", "my-bucket"); - let url = format!("{}/api/buckets/{}/{}", bucket.client.inner.endpoint, bucket.namespace, bucket.repo); + let url = format!("{}/api/buckets/{}/{}", bucket.client.inner.endpoint, bucket.namespace, bucket.bucket); assert!(url.ends_with("/api/buckets/myuser/my-bucket")); } @@ -420,7 +421,8 @@ mod tests { fn update_settings_url() { let client = HFClientBuilder::new().build().unwrap(); let bucket = client.bucket("myuser", "my-bucket"); - let url = format!("{}/api/buckets/{}/{}/settings", bucket.client.inner.endpoint, bucket.namespace, bucket.repo); + let url = + format!("{}/api/buckets/{}/{}/settings", bucket.client.inner.endpoint, bucket.namespace, bucket.bucket); assert!(url.ends_with("/api/buckets/myuser/my-bucket/settings")); } @@ -490,11 +492,11 @@ mod tests { let client = HFClientBuilder::new().build().unwrap(); let bucket = client.bucket("myuser", "my-bucket"); let url = if "".is_empty() { - format!("{}/api/buckets/{}/{}/tree", bucket.client.inner.endpoint, bucket.namespace, bucket.repo) + format!("{}/api/buckets/{}/{}/tree", bucket.client.inner.endpoint, 
bucket.namespace, bucket.bucket) } else { format!( "{}/api/buckets/{}/{}/tree/{}", - bucket.client.inner.endpoint, bucket.namespace, bucket.repo, "some/path" + bucket.client.inner.endpoint, bucket.namespace, bucket.bucket, "some/path" ) }; assert!(url.ends_with("/api/buckets/myuser/my-bucket/tree")); @@ -505,8 +507,10 @@ mod tests { let client = HFClientBuilder::new().build().unwrap(); let bucket = client.bucket("myuser", "my-bucket"); let path = "data/sub"; - let url = - format!("{}/api/buckets/{}/{}/tree/{}", bucket.client.inner.endpoint, bucket.namespace, bucket.repo, path); + let url = format!( + "{}/api/buckets/{}/{}/tree/{}", + bucket.client.inner.endpoint, bucket.namespace, bucket.bucket, path + ); assert!(url.ends_with("/api/buckets/myuser/my-bucket/tree/data/sub")); } @@ -516,10 +520,12 @@ mod tests { let bucket = client.bucket("myuser", "my-bucket"); let write_url = format!( "{}/api/buckets/{}/{}/xet-write-token", - bucket.client.inner.endpoint, bucket.namespace, bucket.repo + bucket.client.inner.endpoint, bucket.namespace, bucket.bucket + ); + let read_url = format!( + "{}/api/buckets/{}/{}/xet-read-token", + bucket.client.inner.endpoint, bucket.namespace, bucket.bucket ); - let read_url = - format!("{}/api/buckets/{}/{}/xet-read-token", bucket.client.inner.endpoint, bucket.namespace, bucket.repo); assert!(write_url.ends_with("/xet-write-token")); assert!(read_url.ends_with("/xet-read-token")); } @@ -529,7 +535,7 @@ mod tests { let client = HFClientBuilder::new().build().unwrap(); let bucket = client.bucket("myuser", "my-bucket"); let url = - format!("{}/api/buckets/{}/{}/paths-info", bucket.client.inner.endpoint, bucket.namespace, bucket.repo); + format!("{}/api/buckets/{}/{}/paths-info", bucket.client.inner.endpoint, bucket.namespace, bucket.bucket); assert!(url.ends_with("/paths-info")); } @@ -560,7 +566,7 @@ mod tests { let bucket = client.bucket("myuser", "my-bucket"); let url = format!( "{}/buckets/{}/{}/resolve/{}", - bucket.client.inner.endpoint, 
bucket.namespace, bucket.repo, "data/train.parquet" + bucket.client.inner.endpoint, bucket.namespace, bucket.bucket, "data/train.parquet" ); assert!(url.contains("/buckets/myuser/my-bucket/resolve/data/train.parquet")); assert!(!url.contains("/api/")); @@ -573,7 +579,7 @@ mod tests { let bucket = client.bucket("myuser", "my-bucket"); let url = format!( "{}/buckets/{}/{}/resolve/{}", - bucket.client.inner.endpoint, bucket.namespace, bucket.repo, "data/train.parquet" + bucket.client.inner.endpoint, bucket.namespace, bucket.bucket, "data/train.parquet" ); assert!(url.contains("/buckets/myuser/my-bucket/resolve/data/train.parquet")); } diff --git a/huggingface_hub/src/bin/hfrs/main.rs b/huggingface_hub/src/bin/hfrs/main.rs index 39ba605..a91f815 100644 --- a/huggingface_hub/src/bin/hfrs/main.rs +++ b/huggingface_hub/src/bin/hfrs/main.rs @@ -139,6 +139,9 @@ fn format_hf_error(err: &HFError) -> String { HFError::RepoNotFound { repo_id } => { format!("Repository '{repo_id}' not found. If the repo is private, make sure you are authenticated.") }, + HFError::BucketNotFound { bucket_name } => { + format!("Bucket '{bucket_name}' not found. If the bucket is private, make sure you are authenticated.") + }, HFError::EntryNotFound { path, repo_id } => { format!("File '{path}' not found in repository '{repo_id}'.") }, diff --git a/huggingface_hub/src/buckets.rs b/huggingface_hub/src/buckets.rs index f4597d2..532f8de 100644 --- a/huggingface_hub/src/buckets.rs +++ b/huggingface_hub/src/buckets.rs @@ -23,7 +23,7 @@ pub struct HFBucket { /// The namespace (user or organization) that owns the bucket. pub namespace: String, /// The bucket name within the namespace. 
- pub repo: String, + pub bucket: String, } impl HFClient { @@ -34,7 +34,7 @@ HFBucket { client: self.clone(), namespace: namespace.into(), - repo: repo.into(), + bucket: repo.into(), } } } diff --git a/huggingface_hub/src/client.rs b/huggingface_hub/src/client.rs index 891ce5d..f390c88 100644 --- a/huggingface_hub/src/client.rs +++ b/huggingface_hub/src/client.rs @@ -293,7 +293,7 @@ impl HFClient { revision, repo_id: repo_id_str, }), - crate::error::NotFoundContext::Generic => Err(HFError::Http { status, url, body }), + _ => Err(HFError::Http { status, url, body }), }, _ => Err(HFError::Http { status, url, body }), } diff --git a/huggingface_hub/src/error.rs b/huggingface_hub/src/error.rs index 297e085..9a708e3 100644 --- a/huggingface_hub/src/error.rs +++ b/huggingface_hub/src/error.rs @@ -15,6 +15,9 @@ pub enum HFError { #[error("Repository not found: {repo_id}")] RepoNotFound { repo_id: String }, + #[error("Bucket not found: {bucket_name}")] + BucketNotFound { bucket_name: String }, + #[error("Revision not found: {revision} in {repo_id}")] RevisionNotFound { repo_id: String, revision: String }, @@ -115,6 +118,8 @@ pub type Result = std::result::Result; pub(crate) enum NotFoundContext { /// 404 means the repository does not exist Repo, + /// 404 means the bucket does not exist + Bucket, /// 404 means a file/path does not exist within the repo Entry { path: String }, /// 404 means the revision does not exist diff --git a/huggingface_hub/tests/integration_test.rs b/huggingface_hub/tests/integration_test.rs index 5cc72ae..1df0afa 100644 --- a/huggingface_hub/tests/integration_test.rs +++ b/huggingface_hub/tests/integration_test.rs @@ -9,7 +9,10 @@ //! Feature-gated tests: enable with --features, e.g.: //! 
HF_TOKEN=hf_xxx cargo test -p huggingface-hub --all-features --test integration_test -use futures::StreamExt; +use std::path::PathBuf; +use std::time::{SystemTime, UNIX_EPOCH}; + +use futures::{StreamExt, TryStreamExt}; use huggingface_hub::repository::{ HFRepository, RepoCreateBranchParams, RepoCreateCommitParams, RepoCreateTagParams, RepoDeleteBranchParams, RepoDeleteFileParams, RepoDeleteFolderParams, RepoDeleteTagParams, RepoDownloadFileParams, RepoFileExistsParams, @@ -21,6 +24,8 @@ use huggingface_hub::types::*; use huggingface_hub::{HFClient, HFClientBuilder}; #[cfg(feature = "spaces")] use huggingface_hub::{SpaceSecretDeleteParams, SpaceSecretParams, SpaceVariableDeleteParams, SpaceVariableParams}; +#[cfg(feature = "xet")] +use xet::xet_session::Sha256Policy; fn api() -> Option { if std::env::var("HF_TOKEN").is_err() { @@ -925,7 +930,7 @@ async fn test_create_and_delete_bucket() { let name = test_bucket_name(); let created = api - .create_bucket(username, &name, CreateBucketParams::builder().private(true).build()) + .create_bucket(username, &name, &CreateBucketParams::builder().private(true).build()) .await .expect("create_bucket failed"); assert!(created.name.contains(&name)); @@ -936,7 +941,7 @@ async fn test_create_and_delete_bucket() { assert!(info.private.unwrap()); bucket - .update_settings(UpdateBucketParams::builder().private(false).build()) + .update_settings(&UpdateBucketParams::builder().private(false).build()) .await .expect("update_settings failed"); @@ -957,14 +962,14 @@ async fn test_bucket_list_tree_empty() { let username = cached_username().await; let name = test_bucket_name(); - api.create_bucket(username, &name, CreateBucketParams::builder().build()) + api.create_bucket(username, &name, &CreateBucketParams::builder().build()) .await .expect("create_bucket failed"); let bucket = api.bucket(username, &name); let entries: Vec<_> = bucket - .list_tree("", ListTreeParams::builder().build()) + .list_tree("", &ListTreeParams::builder().build()) 
.unwrap() .collect::>() .await @@ -986,7 +991,7 @@ async fn test_get_xet_write_and_read_token() { let username = cached_username().await; let name = test_bucket_name(); - api.create_bucket(username, &name, CreateBucketParams::builder().build()) + api.create_bucket(username, &name, &CreateBucketParams::builder().build()) .await .unwrap();