diff --git a/turbopack/crates/turbo-tasks-backend/src/backend/counter_map.rs b/turbopack/crates/turbo-tasks-backend/src/backend/counter_map.rs
index 7a5b8c09216f..a12a17b45fbc 100644
--- a/turbopack/crates/turbo-tasks-backend/src/backend/counter_map.rs
+++ b/turbopack/crates/turbo-tasks-backend/src/backend/counter_map.rs
@@ -12,34 +12,37 @@ use bincode::{
 };
 use rustc_hash::FxHasher;
 
-type InnerMap<K, V> = AutoMap<K, V, BuildHasherDefault<FxHasher>, 1>;
+type InnerMap<K, V, const I: usize> = AutoMap<K, V, BuildHasherDefault<FxHasher>, I>;
 
 /// A map optimized for reference counting, backed by AutoMap.
 ///
 /// Entries are automatically removed when their count reaches zero.
 /// This provides memory-efficient storage for sparse counter data.
+///
+/// The `I` const generic is the inline capacity forwarded to the backing `AutoMap`.
+/// Each `CounterMap` field in the task storage schema chooses its own value of `I`.
 #[derive(Debug, Clone)]
-pub struct CounterMap<K, V>(InnerMap<K, V>);
+pub struct CounterMap<K, V, const I: usize>(InnerMap<K, V, I>);
 
-impl<K, V> Default for CounterMap<K, V> {
+impl<K, V, const I: usize> Default for CounterMap<K, V, I> {
     fn default() -> Self {
         Self(InnerMap::default())
     }
 }
 
-impl<K: Eq + Hash, V: Eq> PartialEq for CounterMap<K, V> {
+impl<K: Eq + Hash, V: Eq, const I: usize> PartialEq for CounterMap<K, V, I> {
     fn eq(&self, other: &Self) -> bool {
         self.0 == other.0
     }
 }
 
-impl<K: Encode, V: Encode> Encode for CounterMap<K, V> {
+impl<K: Encode, V: Encode, const I: usize> Encode for CounterMap<K, V, I> {
     fn encode<E: Encoder>(&self, encoder: &mut E) -> Result<(), EncodeError> {
         self.0.encode(encoder)
     }
 }
 
-impl<Context, K, V> Decode<Context> for CounterMap<K, V>
+impl<Context, K, V, const I: usize> Decode<Context> for CounterMap<K, V, I>
 where
     K: Decode<Context> + Eq + Hash,
     V: Decode<Context>,
@@ -80,7 +83,7 @@ impl CounterValue for i32 {
     }
 }
 
-impl<K, V> CounterMap<K, V> {
+impl<K, V, const I: usize> CounterMap<K, V, I> {
     pub fn new() -> Self {
         Self(AutoMap::default())
     }
@@ -138,16 +141,16 @@ impl<K, V> CounterMap<K, V> {
     }
 }
 
-impl<K, V> IntoIterator for CounterMap<K, V> {
+impl<K, V, const I: usize> IntoIterator for CounterMap<K, V, I> {
     type Item = (K, V);
-    type IntoIter = <InnerMap<K, V> as IntoIterator>::IntoIter;
+    type IntoIter = <InnerMap<K, V, I> as IntoIterator>::IntoIter;
 
     fn into_iter(self) -> Self::IntoIter {
         self.0.into_iter()
     }
 }
 
-impl<K: Hash + Eq, V: CounterValue> CounterMap<K, V> {
+impl<K: Hash + Eq, V: CounterValue, const I: usize> CounterMap<K, V, I> {
     /// Insert a key-value pair. Panics if value is zero (invariant: zero values are not stored).
     pub fn insert(&mut self, key: K, value: V) -> Option<V> {
         debug_assert!(
@@ -297,7 +300,7 @@ mod tests {
 
     #[test]
     fn test_update_count_new_entry() {
-        let mut map: CounterMap<u32, u32> = CounterMap::new();
+        let mut map: CounterMap<u32, u32, 1> = CounterMap::new();
         // Adding new entry crosses zero (from nothing to something)
         assert!(map.update_count(1, 5));
         assert_eq!(map.get(&1), Some(&5));
@@ -305,7 +308,7 @@ mod tests {
 
     #[test]
     fn test_update_count_increment() {
-        let mut map: CounterMap<u32, u32> = CounterMap::new();
+        let mut map: CounterMap<u32, u32, 1> = CounterMap::new();
         map.update_count(1, 5);
         // Incrementing existing entry doesn't cross zero
         assert!(!map.update_count(1, 3));
@@ -314,7 +317,7 @@ mod tests {
 
     #[test]
     fn test_update_count_removal_on_zero() {
-        let mut map: CounterMap<u32, i32> = CounterMap::new();
+        let mut map: CounterMap<u32, i32, 1> = CounterMap::new();
         map.update_count(1, 5);
         // Subtracting to zero removes entry and crosses zero
         assert!(map.update_count(1, -5));
@@ -324,7 +327,7 @@ mod tests {
 
     #[test]
     fn test_update_count_zero_delta_on_empty() {
-        let mut map: CounterMap<u32, u32> = CounterMap::new();
+        let mut map: CounterMap<u32, u32, 1> = CounterMap::new();
         // Adding zero to non-existent entry doesn't create it
         assert!(!map.update_count(1, 0));
         assert!(map.is_empty());
@@ -332,14 +335,14 @@ mod tests {
 
     #[test]
     fn test_update_and_get_new_entry() {
-        let mut map: CounterMap<u32, u32> = CounterMap::new();
+        let mut map: CounterMap<u32, u32, 1> = CounterMap::new();
         assert_eq!(map.update_and_get(1, 5), 5);
         assert_eq!(map.get(&1), Some(&5));
     }
 
     #[test]
     fn test_update_and_get_increment() {
-        let mut map: CounterMap<u32, u32> = CounterMap::new();
+        let mut map: CounterMap<u32, u32, 1> = CounterMap::new();
         map.update_and_get(1, 5);
         assert_eq!(map.update_and_get(1, 3), 8);
         assert_eq!(map.get(&1), Some(&8));
@@ -347,7 +350,7 @@ mod tests {
 
     #[test]
     fn test_update_and_get_removal() {
-        let mut map: CounterMap<u32, i32> = CounterMap::new();
+        let mut map: CounterMap<u32, i32, 1> = CounterMap::new();
         map.update_and_get(1, 5);
         assert_eq!(map.update_and_get(1, -5), 0);
         assert!(map.is_empty());
@@ -355,7 +358,7 @@ mod tests {
 
     #[test]
     fn test_add_entry() {
-        let mut map: CounterMap<u32, u32> = CounterMap::new();
+        let mut map: CounterMap<u32, u32, 1> = CounterMap::new();
         map.add_entry(1, 10);
         assert_eq!(map.get(&1), Some(&10));
     }
@@ -363,14 +366,14 @@ mod tests {
     #[test]
     #[should_panic(expected = "Entry already exists")]
     fn test_add_entry_panics_on_duplicate() {
-        let mut map: CounterMap<u32, u32> = CounterMap::new();
+        let mut map: CounterMap<u32, u32, 1> = CounterMap::new();
         map.add_entry(1, 10);
         map.add_entry(1, 20); // Should panic
     }
 
     #[test]
     fn test_update_positive_crossing_new_positive() {
-        let mut map: CounterMap<u32, i32> = CounterMap::new();
+        let mut map: CounterMap<u32, i32, 1> = CounterMap::new();
         // From nothing to positive - crosses positive boundary
         assert!(map.update_positive_crossing(1, 5));
         assert_eq!(map.get(&1), Some(&5));
@@ -378,7 +381,7 @@ mod tests {
 
     #[test]
     fn test_update_positive_crossing_new_negative() {
-        let mut map: CounterMap<u32, i32> = CounterMap::new();
+        let mut map: CounterMap<u32, i32, 1> = CounterMap::new();
         // From nothing to negative - doesn't cross positive boundary
         assert!(!map.update_positive_crossing(1, -5));
         assert_eq!(map.get(&1), Some(&-5));
@@ -386,7 +389,7 @@ mod tests {
 
     #[test]
     fn test_update_positive_crossing_stay_positive() {
-        let mut map: CounterMap<u32, i32> = CounterMap::new();
+        let mut map: CounterMap<u32, i32, 1> = CounterMap::new();
         map.update_positive_crossing(1, 5);
         // Staying positive doesn't cross boundary
         assert!(!map.update_positive_crossing(1, 3));
@@ -395,7 +398,7 @@ mod tests {
 
     #[test]
     fn test_update_positive_crossing_to_non_positive() {
-        let mut map: CounterMap<u32, i32> = CounterMap::new();
+        let mut map: CounterMap<u32, i32, 1> = CounterMap::new();
         map.update_positive_crossing(1, 5);
         // Crossing to non-positive
         assert!(map.update_positive_crossing(1, -8));
@@ -404,7 +407,7 @@ mod tests {
 
     #[test]
     fn test_update_positive_crossing_to_zero_removes() {
-        let mut map: CounterMap<u32, i32> = CounterMap::new();
+        let mut map: CounterMap<u32, i32, 1> = CounterMap::new();
         map.update_positive_crossing(1, 5);
         // Crossing to zero removes and crosses boundary
         assert!(map.update_positive_crossing(1, -5));
@@ -413,14 +416,14 @@ mod tests {
 
     #[test]
     fn test_update_with_create() {
-        let mut map: CounterMap<u32, u32> = CounterMap::new();
+        let mut map: CounterMap<u32, u32, 1> = CounterMap::new();
         map.update_with(1, |_| Some(10));
         assert_eq!(map.get(&1), Some(&10));
     }
 
     #[test]
     fn test_update_with_modify() {
-        let mut map: CounterMap<u32, u32> = CounterMap::new();
+        let mut map: CounterMap<u32, u32, 1> = CounterMap::new();
         map.update_with(1, |_| Some(10));
         map.update_with(1, |v| v.map(|x| x + 5));
         assert_eq!(map.get(&1), Some(&15));
@@ -428,7 +431,7 @@ mod tests {
 
     #[test]
     fn test_update_with_remove() {
-        let mut map: CounterMap<u32, u32> = CounterMap::new();
+        let mut map: CounterMap<u32, u32, 1> = CounterMap::new();
         map.update_with(1, |_| Some(10));
         map.update_with(1, |_| None);
         assert!(map.is_empty());
@@ -436,14 +439,14 @@ mod tests {
 
     #[test]
     fn test_update_with_no_op() {
-        let mut map: CounterMap<u32, u32> = CounterMap::new();
+        let mut map: CounterMap<u32, u32, 1> = CounterMap::new();
         map.update_with(1, |_| None);
         assert!(map.is_empty());
     }
 
     #[test]
     fn test_len_and_is_empty() {
-        let mut map: CounterMap<u32, u32> = CounterMap::new();
+        let mut map: CounterMap<u32, u32, 1> = CounterMap::new();
         assert!(map.is_empty());
         assert_eq!(map.len(), 0);
 
@@ -457,7 +460,7 @@ mod tests {
 
     #[test]
     fn test_iter() {
-        let mut map: CounterMap<u32, u32> = CounterMap::new();
+        let mut map: CounterMap<u32, u32, 1> = CounterMap::new();
         map.update_count(1, 5);
         map.update_count(2, 10);
 
diff --git a/turbopack/crates/turbo-tasks-backend/src/backend/mod.rs b/turbopack/crates/turbo-tasks-backend/src/backend/mod.rs
index c7f547215bcc..bd19c567b092 100644
--- a/turbopack/crates/turbo-tasks-backend/src/backend/mod.rs
+++ b/turbopack/crates/turbo-tasks-backend/src/backend/mod.rs
@@ -34,8 +34,8 @@ use turbo_tasks::{
     TaskId, TaskPersistence, TaskPriority, TraitTypeId, TurboTasksBackendApi, TurboTasksPanic,
     ValueTypeId,
     backend::{
-        Backend, CachedTaskType, CellContent, CellHash, TaskExecutionSpec, TransientTaskType,
-        TurboTaskContextError, TurboTaskLocalContextError, TurboTasksError,
+        Backend, CachedTaskType, CachedTaskTypeArc, CellContent, CellHash, TaskExecutionSpec,
+        TransientTaskType, TurboTaskContextError, TurboTaskLocalContextError, TurboTasksError,
         TurboTasksExecutionError, TurboTasksExecutionErrorMessage, TypedCellContent,
         VerificationMode,
     },
@@ -70,8 +70,8 @@ use crate::{
     },
     backing_storage::{BackingStorage, SnapshotItem, compute_task_type_hash},
     data::{
-        ActivenessState, CellRef, CollectibleRef, CollectiblesRef, Dirtyness, InProgressCellState,
-        InProgressState, InProgressStateInner, OutputValue, TransientTask,
+        ActivenessState, CellDependency, CellRef, CollectibleRef, CollectiblesRef, Dirtyness,
+        InProgressCellState, InProgressState, InProgressStateInner, OutputValue, TransientTask,
     },
     error::TaskError,
     utils::{
@@ -785,7 +785,8 @@ impl<B: BackingStorage> TurboTasksBackendInner<B> {
                 && (!task.immutable() || cfg!(feature = "verify_immutable"))
             {
                 let reader = reader.unwrap();
-                let _ = task.add_cell_dependents((cell, key, reader));
+                let _ = task
+                    .add_cell_dependents(CellDependency::new(CellRef { task: reader, cell }, key));
                 drop(task);
 
                 // Note: We use `task_pair` earlier to lock the task and its reader at the same
@@ -797,8 +798,9 @@ impl<B: BackingStorage> TurboTasksBackendInner<B> {
                     task: task_id,
                     cell,
                 };
-                if !reader_task.remove_outdated_cell_dependencies(&(target, key)) {
-                    let _ = reader_task.add_cell_dependencies((target, key));
+                let dep = CellDependency::new(target, key);
+                if !reader_task.remove_outdated_cell_dependencies(&dep) {
+                    let _ = reader_task.add_cell_dependencies(dep);
                 }
                 drop(reader_task);
             }
@@ -1526,7 +1528,7 @@ impl<B: BackingStorage> TurboTasksBackendInner<B> {
                     // Only now do we force the allocation.
                     // NOTE: if our caller had to perform resolution, then this will have already
                     // been boxed and take_box just takes it.
-                    let task_type = Arc::new(CachedTaskType {
+                    let task_type = CachedTaskTypeArc::new(CachedTaskType {
                         native_fn,
                         this,
                         arg: arg.take_box(),
@@ -1757,7 +1759,7 @@ impl<B: BackingStorage> TurboTasksBackendInner<B> {
         }
     }
 
-    fn debug_get_cached_task_type(&self, task_id: TaskId) -> Option<Arc<CachedTaskType>> {
+    fn debug_get_cached_task_type(&self, task_id: TaskId) -> Option<CachedTaskTypeArc> {
         let task = self.storage.access_mut(task_id);
         task.get_persistent_task_type().cloned()
     }
@@ -2197,7 +2199,7 @@ impl<B: BackingStorage> TurboTasksBackendInner<B> {
             Some(
                 // Collect all dependencies on tasks to check if all dependencies are immutable
                 task.iter_output_dependencies()
-                    .chain(task.iter_cell_dependencies().map(|(target, _key)| target.task))
+                    .chain(task.iter_cell_dependencies().map(|dep| dep.cell_ref().task))
                     .collect::<FxHashSet<_>>(),
             )
         } else {
@@ -2236,7 +2238,7 @@ impl<B: BackingStorage> TurboTasksBackendInner<B> {
             // breaking dependency tracking.
             old_edges.extend(
                 task.iter_outdated_cell_dependencies()
-                    .map(|(target, key)| OutdatedEdge::CellDependency(target, key)),
+                    .map(OutdatedEdge::CellDependency),
             );
             old_edges.extend(
                 task.iter_outdated_output_dependencies()
diff --git a/turbopack/crates/turbo-tasks-backend/src/backend/operation/cleanup_old_edges.rs b/turbopack/crates/turbo-tasks-backend/src/backend/operation/cleanup_old_edges.rs
index aa544b632789..699391e0b93e 100644
--- a/turbopack/crates/turbo-tasks-backend/src/backend/operation/cleanup_old_edges.rs
+++ b/turbopack/crates/turbo-tasks-backend/src/backend/operation/cleanup_old_edges.rs
@@ -17,7 +17,7 @@ use crate::{
         },
         storage_schema::TaskStorageAccessors,
     },
-    data::{CellRef, CollectibleRef, CollectiblesRef},
+    data::{CellDependency, CellRef, CollectibleRef, CollectiblesRef},
 };
 
 #[derive(Encode, Decode, Clone)]
@@ -48,7 +48,7 @@ impl Default for CleanupOldEdgesOperation {
 pub enum OutdatedEdge {
     Child(TaskId),
     Collectible(CollectibleRef, i32),
-    CellDependency(CellRef, Option<u64>),
+    CellDependency(CellDependency),
     OutputDependency(TaskId),
     CollectiblesDependency(CollectiblesRef),
 }
@@ -166,27 +166,28 @@ impl CleanupOldEdgesOperation {
                                     AggregatedDataUpdate::new().collectibles_update(collectibles),
                                 ));
                             }
-                            OutdatedEdge::CellDependency(
-                                CellRef {
-                                    task: cell_task_id,
-                                    cell,
-                                },
-                                key,
-                            ) => {
+                            OutdatedEdge::CellDependency(dep) => {
+                                let (
+                                    CellRef {
+                                        task: cell_task_id,
+                                        cell,
+                                    },
+                                    key,
+                                ) = dep.into_parts();
                                 {
                                     let mut task = ctx.task(cell_task_id, TaskDataCategory::Data);
-                                    task.remove_cell_dependents(&(cell, key, task_id));
-                                }
-                                {
-                                    let mut task = ctx.task(task_id, TaskDataCategory::Data);
-                                    task.remove_cell_dependencies(&(
+                                    task.remove_cell_dependents(&CellDependency::new(
                                         CellRef {
-                                            task: cell_task_id,
+                                            task: task_id,
                                             cell,
                                         },
                                         key,
                                     ));
                                 }
+                                {
+                                    let mut task = ctx.task(task_id, TaskDataCategory::Data);
+                                    task.remove_cell_dependencies(&dep);
+                                }
                             }
                             OutdatedEdge::OutputDependency(output_task_id) => {
                                 #[cfg(feature = "trace_task_output_dependencies")]
diff --git a/turbopack/crates/turbo-tasks-backend/src/backend/operation/mod.rs b/turbopack/crates/turbo-tasks-backend/src/backend/operation/mod.rs
index 9f46eec53cb8..faab5a8f560d 100644
--- a/turbopack/crates/turbo-tasks-backend/src/backend/operation/mod.rs
+++ b/turbopack/crates/turbo-tasks-backend/src/backend/operation/mod.rs
@@ -19,7 +19,7 @@ use tracing::info_span;
 use tracing::trace_span;
 use turbo_tasks::{
     CellId, DynTaskInputs, FxIndexMap, RawVc, SharedReference, TaskExecutionReason, TaskId,
-    TaskPriority, TurboTasksBackendApi, TurboTasksCallApi, backend::CachedTaskType,
+    TaskPriority, TurboTasksBackendApi, TurboTasksCallApi, backend::CachedTaskTypeArc,
     macro_helpers::NativeFunction,
 };
 
@@ -103,7 +103,8 @@ pub trait ExecuteContext<'e>: Sized {
     /// Uses hash-based lookup which may return multiple candidates due to hash collisions,
     /// then verifies each candidate by comparing the stored `persistent_task_type`.
     /// Returns `Some((task_id, task_type))` if a matching task is found, where `task_type` is
-    /// the existing `Arc<CachedTaskType>` from storage (avoiding a duplicate allocation).
+    /// the existing `CachedTaskTypeArc` from storage (avoiding a duplicate
+    /// allocation).
     ///
     /// Accepts exploded components so the caller does not need to box the argument before calling.
     fn task_by_type(
@@ -111,7 +112,7 @@ pub trait ExecuteContext<'e>: Sized {
         native_fn: &'static NativeFunction,
         this: Option<RawVc>,
         arg: &dyn DynTaskInputs,
-    ) -> Option<(TaskId, Arc<CachedTaskType>)>;
+    ) -> Option<(TaskId, CachedTaskTypeArc)>;
     fn debug_get_task_description(&self, task_id: TaskId) -> String;
 }
 
@@ -606,7 +607,7 @@ struct TaskRestoreEntry {
     /// Another thread claimed the meta restore; we must wait in Phase 3.
     wait_meta: bool,
     /// Task type discovered during Phase 1c data restore (used to update task cache in Phase 2).
-    task_type: Option<Arc<CachedTaskType>>,
+    task_type: Option<CachedTaskTypeArc>,
     /// This thread performed the restore for at least one category (set in Phase 1c).
     self_restored: bool,
 }
@@ -985,7 +986,7 @@ impl<'e, B: BackingStorage> ExecuteContext<'e> for ExecuteContextImpl<'e, B> {
         native_fn: &'static NativeFunction,
         this: Option<RawVc>,
         arg: &dyn DynTaskInputs,
-    ) -> Option<(TaskId, Arc<CachedTaskType>)> {
+    ) -> Option<(TaskId, CachedTaskTypeArc)> {
         if !self.backend.should_restore() {
             return None;
         }
@@ -1032,14 +1033,14 @@ impl<'e, B: BackingStorage> ChildExecuteContext<'e> for ChildExecuteContextImpl<
 }
 
 pub enum TaskTypeRef<'l> {
-    Cached(&'l Arc<CachedTaskType>),
+    Cached(&'l CachedTaskTypeArc),
     Transient(&'l Arc<TransientTask>),
 }
 
 impl TaskTypeRef<'_> {
     pub fn to_owned(&self) -> TaskType {
         match self {
-            TaskTypeRef::Cached(ty) => TaskType::Cached(Arc::clone(ty)),
+            TaskTypeRef::Cached(ty) => TaskType::Cached((*ty).clone()),
             TaskTypeRef::Transient(ty) => TaskType::Transient(Arc::clone(ty)),
         }
     }
@@ -1056,7 +1057,7 @@ impl Display for TaskTypeRef<'_> {
 
 #[derive(Debug)]
 pub enum TaskType {
-    Cached(Arc<CachedTaskType>),
+    Cached(CachedTaskTypeArc),
     Transient(Arc<TransientTask>),
 }
 
@@ -1398,7 +1399,7 @@ impl TaskGuard for TaskGuardImpl<'_> {
             .map(|target| (target, TaskDataCategory::Meta))
             .chain(
                 self.iter_cell_dependencies()
-                    .map(|(target, _key)| (target.task, TaskDataCategory::All)),
+                    .map(|dep| (dep.cell_ref().task, TaskDataCategory::All)),
             )
             .chain(
                 self.iter_collectibles_dependencies()
diff --git a/turbopack/crates/turbo-tasks-backend/src/backend/operation/update_cell.rs b/turbopack/crates/turbo-tasks-backend/src/backend/operation/update_cell.rs
index 22ba80932aec..74765a978a2a 100644
--- a/turbopack/crates/turbo-tasks-backend/src/backend/operation/update_cell.rs
+++ b/turbopack/crates/turbo-tasks-backend/src/backend/operation/update_cell.rs
@@ -20,7 +20,7 @@ use crate::{
         },
         storage_schema::TaskStorageAccessors,
     },
-    data::CellRef,
+    data::{CellDependency, CellRef},
 };
 
 #[derive(Encode, Decode, Clone, Default)]
@@ -137,17 +137,22 @@ impl UpdateCellOperation {
             let mut dependent_tasks: FxIndexMap<TaskId, SmallVec<[Option<u64>; 2]>> =
                 FxIndexMap::default();
             if !skip_invalidation {
-                let tasks_with_keys =
-                    task.iter_cell_dependents()
-                        .filter_map(|(dependent_cell, key, task)| {
-                            (dependent_cell == cell
-                                && key.is_none_or(|key_hash| {
-                                    updated_key_hashes_set
-                                        .as_ref()
-                                        .is_none_or(|set| set.contains(&key_hash))
-                                }))
-                            .then_some((task, key))
-                        });
+                let tasks_with_keys = task.iter_cell_dependents().filter_map(|dep| {
+                    let (
+                        CellRef {
+                            task: dependent_task,
+                            cell: dependent_cell,
+                        },
+                        key,
+                    ) = dep.into_parts();
+                    (dependent_cell == cell
+                        && key.is_none_or(|key_hash| {
+                            updated_key_hashes_set
+                                .as_ref()
+                                .is_none_or(|set| set.contains(&key_hash))
+                        }))
+                    .then_some((dependent_task, key))
+                });
                 for (task, key) in tasks_with_keys {
                     dependent_tasks.entry(task).or_default().push(key);
                 }
@@ -276,14 +281,15 @@ impl Operation for UpdateCellOperation {
                         let mut make_stale = false;
                         let dependent = ctx.task(dependent_task_id, TaskDataCategory::All);
                         for key in keys.iter().copied() {
-                            if dependent.outdated_cell_dependencies_contains(&(cell_ref, key)) {
+                            let dep = CellDependency::new(cell_ref, key);
+                            if dependent.outdated_cell_dependencies_contains(&dep) {
                                 // cell dependency is outdated, so it hasn't read the cell yet
                                 // and doesn't need to be invalidated.
                                 // We do not need to make the task stale in this case.
                                 // But importantly we still need to make the task dirty as it should
                                 // no longer be considered as
                                 // "recomputation".
-                            } else if !dependent.cell_dependencies_contains(&(cell_ref, key)) {
+                            } else if !dependent.cell_dependencies_contains(&dep) {
                                 // cell dependency has been removed, so the task doesn't depend on
                                 // the cell anymore and doesn't need
                                 // to be invalidated
diff --git a/turbopack/crates/turbo-tasks-backend/src/backend/storage.rs b/turbopack/crates/turbo-tasks-backend/src/backend/storage.rs
index 3120b8947c7c..0e5b13518e14 100644
--- a/turbopack/crates/turbo-tasks-backend/src/backend/storage.rs
+++ b/turbopack/crates/turbo-tasks-backend/src/backend/storage.rs
@@ -12,7 +12,7 @@ use std::{
 use thread_local::ThreadLocal;
 use tracing::span::Id;
 use turbo_bincode::TurboBincodeBuffer;
-use turbo_tasks::{FxDashMap, TaskId, backend::CachedTaskType, event::Event, parallel};
+use turbo_tasks::{FxDashMap, TaskId, backend::CachedTaskTypeArc, event::Event, parallel};
 
 use crate::{
     backend::storage_schema::{
@@ -183,7 +183,7 @@ pub struct Storage {
     /// This is backed by the TaskCache table in the database.
     ///
     /// LockOrdering: See the comments on [map].
-    pub task_cache: FxDashMap<Arc<CachedTaskType>, TaskId>,
+    pub task_cache: FxDashMap<CachedTaskTypeArc, TaskId>,
 }
 
 impl Storage {
@@ -253,7 +253,7 @@ impl Storage {
     /// Mark a newly allocated task as restored (skip DB queries) and new (include in persistence
     /// snapshots). Optionally sets the `persistent_task_type` eagerly so it's available for
     /// persistence snapshots without needing to propagate it through `connect_child`.
-    pub fn initialize_new_task(&self, task_id: TaskId, task_type: Option<Arc<CachedTaskType>>) {
+    pub fn initialize_new_task(&self, task_id: TaskId, task_type: Option<CachedTaskTypeArc>) {
         let mut task = self.access_mut(task_id);
         task.flags.set_restored(TaskDataCategory::All);
         task.flags.set_new_task(true);
@@ -516,7 +516,7 @@ impl Storage {
             // was contended. We defer them until after the map shard lock is released to
             // avoid a lock cycle with get_or_create_persistent_task, which takes task_cache
             // before map. Allocated lazily on first conflict.
-            let mut deferred_task_cache_removals: Vec<Arc<CachedTaskType>> = Vec::new();
+            let mut deferred_task_cache_removals: Vec<CachedTaskTypeArc> = Vec::new();
             // SAFETY: We hold the write lock for the duration of iteration.
             for bucket in unsafe { shard.iter() } {
                 // SAFETY: The write lock guard outlives the bucket reference.
diff --git a/turbopack/crates/turbo-tasks-backend/src/backend/storage_schema.rs b/turbopack/crates/turbo-tasks-backend/src/backend/storage_schema.rs
index 85db526d4d0c..288d1600092c 100644
--- a/turbopack/crates/turbo-tasks-backend/src/backend/storage_schema.rs
+++ b/turbopack/crates/turbo-tasks-backend/src/backend/storage_schema.rs
@@ -21,8 +21,8 @@ use std::{hash::Hash, sync::Arc};
 
 use parking_lot::Mutex;
 use turbo_tasks::{
-    CellId, SharedReference, TaskExecutionReason, TaskId, TraitTypeId, ValueTypeId,
-    backend::{CachedTaskType, CellHash, TransientTaskType},
+    CellId, SharedReference, TaskExecutionReason, TaskId, TinyVec, TraitTypeId, ValueTypeId,
+    backend::{CachedTaskTypeArc, CellHash, TransientTaskType},
     event::Event,
     task_storage,
 };
@@ -30,18 +30,26 @@ use turbo_tasks::{
 use crate::{
     backend::{cell_data::CellData, counter_map::CounterMap},
     data::{
-        ActivenessState, AggregationNumber, CellRef, CollectibleRef, CollectiblesRef, Dirtyness,
-        InProgressCellState, InProgressState, LeafDistance, OutputValue, RootType, TransientTask,
+        ActivenessState, AggregationNumber, CellDependency, CollectibleRef, CollectiblesRef,
+        Dirtyness, InProgressCellState, InProgressState, LeafDistance, OutputValue, RootType,
+        TransientTask,
     },
 };
 
 /// Auto-set storage for small sets of keys with unit values.
-/// Optimized for small collections (< 8 items use SmallVec inline).
-type AutoSet<K> = auto_hash_map::AutoSet<K, std::hash::BuildHasherDefault<rustc_hash::FxHasher>, 1>;
+///
+/// The `I` const generic is the inline capacity: a set holding more than `I`
+/// entries spills to a `HashSet`. Each field below picks its own `I`, either to
+/// use up the available padding in `LazyField` or, for inline fields, to stay
+/// within the 16-byte `SmallVec` free zone; see the field comments for why.
+type AutoSet<K, const I: usize> =
+    auto_hash_map::AutoSet<K, std::hash::BuildHasherDefault<rustc_hash::FxHasher>, I>;
 
 /// Auto-map storage for key-value pairs.
-type AutoMap<K, V> =
-    auto_hash_map::AutoMap<K, V, std::hash::BuildHasherDefault<rustc_hash::FxHasher>, 1>;
+///
+/// See [`AutoSet`] for the meaning of `I`.
+type AutoMap<K, V, const I: usize> =
+    auto_hash_map::AutoMap<K, V, std::hash::BuildHasherDefault<rustc_hash::FxHasher>, I>;
 
 /// The complete task storage schema.
 ///
@@ -52,7 +60,7 @@ type AutoMap<K, V> =
 /// - `TaskFlags` bitfield for boolean flags
 /// - Accessor methods and traits
 ///
-/// Fields are stored lazily in `Vec<LazyField>` by default for memory efficiency.
+/// Fields are stored lazily in `TinyVec<LazyField>` by default for memory efficiency.
 /// Fields with `inline` are stored directly on TaskStorage (for hot-path access).
 ///
 /// Note: This struct is consumed by the macro and does not appear in the output.
@@ -80,7 +88,7 @@ struct TaskStorageSchema {
         filter_transient,
         drop_on_completion_if_immutable
     )]
-    output_dependent: AutoSet<TaskId>,
+    output_dependent: AutoSet<TaskId, 4>,
 
     /// The task's output value.
     /// Filtered during serialization to skip transient outputs (referencing transient tasks).
@@ -89,7 +97,7 @@ struct TaskStorageSchema {
 
     /// Upper nodes in the aggregation tree (reference counted).
     #[field(storage = "counter_map", category = "meta", inline, filter_transient)]
-    upper: CounterMap<TaskId, u32>,
+    upper: CounterMap<TaskId, u32, 2>,
 
     // =========================================================================
     // COLLECTIBLES (meta)
@@ -101,15 +109,15 @@ struct TaskStorageSchema {
         filter_transient,
         shrink_on_completion
     )]
-    collectibles: CounterMap<CollectibleRef, i32>,
+    collectibles: CounterMap<CollectibleRef, i32, 1>,
 
     /// Aggregated collectibles from the subgraph.
     #[field(storage = "counter_map", category = "meta", filter_transient)]
-    aggregated_collectibles: CounterMap<CollectibleRef, i32>,
+    aggregated_collectibles: CounterMap<CollectibleRef, i32, 1>,
 
     /// Outdated collectibles to be cleaned up (transient).
     #[field(storage = "counter_map", category = "transient", shrink_on_completion)]
-    outdated_collectibles: CounterMap<CollectibleRef, i32>,
+    outdated_collectibles: CounterMap<CollectibleRef, i32, 1>,
 
     // =========================================================================
     // STATE FIELDS (meta)
@@ -127,7 +135,7 @@ struct TaskStorageSchema {
 
     /// Individual dirty containers in the aggregated subgraph.
     #[field(storage = "counter_map", category = "meta", filter_transient)]
-    aggregated_dirty_containers: CounterMap<TaskId, i32>,
+    aggregated_dirty_containers: CounterMap<TaskId, i32, 3>,
 
     /// Count of clean containers in current session (transient).
     /// Absent = 0, present = actual count.
@@ -136,7 +144,7 @@ struct TaskStorageSchema {
 
     /// Individual clean containers in current session (transient).
     #[field(storage = "counter_map", category = "transient")]
-    aggregated_current_session_clean_containers: CounterMap<TaskId, i32>,
+    aggregated_current_session_clean_containers: CounterMap<TaskId, i32, 3>,
 
     // =========================================================================
     // FLAGS (meta) - Boolean flags stored in TaskFlags bitfield
@@ -223,11 +231,11 @@ struct TaskStorageSchema {
         filter_transient,
         shrink_on_completion
     )]
-    children: AutoSet<TaskId>,
+    children: AutoSet<TaskId, 6>,
 
     /// Follower nodes in the aggregation tree (reference counted).
     #[field(storage = "counter_map", category = "meta", filter_transient)]
-    followers: CounterMap<TaskId, u32>,
+    followers: CounterMap<TaskId, u32, 3>,
 
     // =========================================================================
     // DEPENDENCIES (data)
@@ -239,7 +247,7 @@ struct TaskStorageSchema {
         shrink_on_completion,
         drop_on_completion_if_immutable
     )]
-    output_dependencies: AutoSet<TaskId>,
+    output_dependencies: AutoSet<TaskId, 6>,
 
     /// Cells this task depends on.
     #[field(
@@ -249,7 +257,7 @@ struct TaskStorageSchema {
         shrink_on_completion,
         drop_on_completion_if_immutable
     )]
-    cell_dependencies: AutoSet<(CellRef, Option<u64>)>,
+    cell_dependencies: AutoSet<CellDependency, 1>,
 
     /// Collectibles this task depends on.
     #[field(
@@ -259,19 +267,19 @@ struct TaskStorageSchema {
         shrink_on_completion,
         drop_on_completion_if_immutable
     )]
-    collectibles_dependencies: AutoSet<CollectiblesRef>,
+    collectibles_dependencies: AutoSet<CollectiblesRef, 3>,
 
     /// Outdated output dependencies to be cleaned up (transient).
     #[field(storage = "auto_set", category = "transient", shrink_on_completion)]
-    outdated_output_dependencies: AutoSet<TaskId>,
+    outdated_output_dependencies: AutoSet<TaskId, 6>,
 
     /// Outdated cell dependencies to be cleaned up (transient).
     #[field(storage = "auto_set", category = "transient", shrink_on_completion)]
-    outdated_cell_dependencies: AutoSet<(CellRef, Option<u64>)>,
+    outdated_cell_dependencies: AutoSet<CellDependency, 1>,
 
     /// Outdated collectibles dependencies to be cleaned up (transient).
     #[field(storage = "auto_set", category = "transient", shrink_on_completion)]
-    outdated_collectibles_dependencies: AutoSet<CollectiblesRef>,
+    outdated_collectibles_dependencies: AutoSet<CollectiblesRef, 3>,
 
     // =========================================================================
     // DEPENDENTS - Tasks that depend on this task's cells
@@ -282,20 +290,20 @@ struct TaskStorageSchema {
         filter_transient,
         drop_on_completion_if_immutable
     )]
-    cell_dependents: AutoSet<(CellId, Option<u64>, TaskId)>,
+    cell_dependents: AutoSet<CellDependency, 1>,
 
     /// Tasks that depend on collectibles of a specific type from this task.
     /// Maps TraitTypeId -> Set<TaskId>
 
     #[field(storage = "auto_set", category = "meta", filter_transient)]
-    collectibles_dependents: AutoSet<(TraitTypeId, TaskId)>,
+    collectibles_dependents: AutoSet<(TraitTypeId, TaskId), 3>,
 
     #[field(
         storage = "auto_map",
         category = "data",
         shrink_on_completion,
         custom_drop_partial,
-        as_type = "AutoMap<CellId, SharedReference>"
+        as_type = "AutoMap<CellId, SharedReference, 1>"
     )]
     cell_data: CellData,
 
@@ -307,11 +315,11 @@ struct TaskStorageSchema {
     /// enum; a bare `u128` would grow the enum from 56 to 64 bytes due to its 16-byte
     /// alignment requirement.
     #[field(storage = "auto_map", category = "data", shrink_on_completion)]
-    cell_data_hash: AutoMap<CellId, CellHash>,
+    cell_data_hash: AutoMap<CellId, CellHash, 1>,
 
     /// Maximum cell index per cell type.
     #[field(storage = "auto_map", category = "data", shrink_on_completion)]
-    cell_type_max_index: AutoMap<ValueTypeId, u32>,
+    cell_type_max_index: AutoMap<ValueTypeId, u32, 3>,
 
     // =========================================================================
     // TRANSIENT EXECUTION STATE (transient)
@@ -326,10 +334,10 @@ struct TaskStorageSchema {
 
     /// In-progress cell state for cells being computed (transient).
     #[field(storage = "auto_map", category = "transient", shrink_on_completion)]
-    in_progress_cells: AutoMap<CellId, InProgressCellState>,
+    in_progress_cells: AutoMap<CellId, InProgressCellState, 1>,
 
     #[field(storage = "direct", category = "data", inline)]
-    pub persistent_task_type: Option<Arc<CachedTaskType>>,
+    pub persistent_task_type: Option<CachedTaskTypeArc>,
 
     #[field(storage = "direct", category = "transient")]
     pub transient_task_type: Arc<TransientTask>,
@@ -522,7 +530,7 @@ impl TaskStorage {
                 None => KeyEvictability::Unevictable,
                 // strong_count == 1: only this TaskStorage holds this Arc, so no task_cache entry
                 // references it. It must have been already evicted on a prior cycle.
-                Some(arc) if Arc::strong_count(arc) == 1 => KeyEvictability::AlreadyEvicted,
+                Some(arc) if arc.count() == 1 => KeyEvictability::AlreadyEvicted,
                 Some(_) => KeyEvictability::Evictable,
             }
         };
@@ -844,14 +852,9 @@ impl IsTransient for (TraitTypeId, TaskId) {
         self.1.is_transient()
     }
 }
-impl IsTransient for (CellId, Option<u64>, TaskId) {
-    fn is_transient(&self) -> bool {
-        self.2.is_transient()
-    }
-}
-impl IsTransient for (CellRef, Option<u64>) {
+impl IsTransient for CellDependency {
     fn is_transient(&self) -> bool {
-        self.0.task.is_transient()
+        CellDependency::is_transient(self)
     }
 }
 
@@ -863,7 +866,7 @@ pub(crate) trait MergeRestore {
     fn merge_restore(&mut self, items: impl IntoIterator<Item = Self::Item>);
 }
 
-impl<K, V> MergeRestore for CounterMap<K, V>
+impl<K, V, const I: usize> MergeRestore for CounterMap<K, V, I>
 where
     K: Eq + Hash,
 {
@@ -872,7 +875,7 @@ where
         self.extend(items)
     }
 }
-impl<V> MergeRestore for AutoSet<V>
+impl<V, const I: usize> MergeRestore for AutoSet<V, I>
 where
     V: Eq + Hash,
 {
@@ -915,7 +918,7 @@ impl<T: IsTransient> DropPartial for Option<T> {
     }
 }
 
-impl<T: IsTransient + Hash + Eq> DropPartial for AutoSet<T> {
+impl<T: IsTransient + Hash + Eq, const I: usize> DropPartial for AutoSet<T, I> {
     fn drop_partial(&mut self) -> DropPartialOutcome {
         self.retain(|t| t.is_transient());
         if self.is_empty() {
@@ -927,7 +930,7 @@ impl<T: IsTransient + Hash + Eq> DropPartial for AutoSet<T> {
     }
 }
 
-impl<K: IsTransient + Hash + Eq, V: Eq> DropPartial for CounterMap<K, V> {
+impl<K: IsTransient + Hash + Eq, V: Eq, const I: usize> DropPartial for CounterMap<K, V, I> {
     fn drop_partial(&mut self) -> DropPartialOutcome {
         self.retain(|k, _v| k.is_transient());
         if self.is_empty() {
@@ -938,7 +941,7 @@ impl<K: IsTransient + Hash + Eq, V: Eq> DropPartial for CounterMap<K, V> {
         }
     }
 }
-impl<K: IsTransient + Hash + Eq, V: IsTransient> DropPartial for AutoMap<K, V> {
+impl<K: IsTransient + Hash + Eq, V: IsTransient, const I: usize> DropPartial for AutoMap<K, V, I> {
     fn drop_partial(&mut self) -> DropPartialOutcome {
         self.retain(|k, v| k.is_transient() || v.is_transient());
         if self.is_empty() {
@@ -956,7 +959,7 @@ mod tests {
     use turbo_tasks::{CellId, TaskId};
 
     use super::*;
-    use crate::data::{AggregationNumber, CellRef, Dirtyness, OutputValue};
+    use crate::data::{AggregationNumber, CellDependency, CellRef, Dirtyness, OutputValue};
 
     #[test]
     fn test_accessors() {
@@ -1242,16 +1245,15 @@ mod tests {
         original
             .output_dependencies_mut()
             .insert(TaskId::new(200).unwrap());
-        original.cell_dependencies_mut().insert((
-            CellRef {
+        original
+            .cell_dependencies_mut()
+            .insert(CellDependency::All(CellRef {
                 task: TaskId::new(1).unwrap(),
                 cell: CellId {
                     type_id: unsafe { turbo_tasks::ValueTypeId::new_unchecked(1) },
                     index: 0,
                 },
-            },
-            None,
-        ));
+            }));
 
         // Set lazy data transient field (should NOT be serialized)
         original
@@ -1390,16 +1392,15 @@ mod tests {
         storage.output_dependent_mut().insert(transient_task(3));
 
         // Lazy filter_transient data field.
-        storage.cell_dependencies_mut().insert((
-            CellRef {
+        storage
+            .cell_dependencies_mut()
+            .insert(CellDependency::All(CellRef {
                 task: persistent_task(10),
                 cell: CellId {
                     type_id: unsafe { turbo_tasks::ValueTypeId::new_unchecked(1) },
                     index: 0,
                 },
-            },
-            None,
-        ));
+            }));
 
         // Mark as restored so the task is eligible for dropping.
         storage.flags.set_data_restored(true);
@@ -1705,13 +1706,14 @@ mod tests {
     fn test_schema_size() {
         assert_eq!(
             size_of::<TaskStorage>(),
-            136,
-            "TaskStorage size changed! If this is intentional, update this test."
+            128,
+            "TaskStorage size changed! Run print_schema_sizes and update this test."
         );
+        // `LazyField` is 48 B = 40 B largest payload + 8 B discriminant.
         assert_eq!(
             size_of::<LazyField>(),
-            56,
-            "LazyField size changed! If this is intentional, update this test."
+            48,
+            "LazyField size changed! Run print_schema_sizes and update this test."
         );
     }
 }
diff --git a/turbopack/crates/turbo-tasks-backend/src/data.rs b/turbopack/crates/turbo-tasks-backend/src/data.rs
index 0332e0e559d5..334f87ab0cf8 100644
--- a/turbopack/crates/turbo-tasks-backend/src/data.rs
+++ b/turbopack/crates/turbo-tasks-backend/src/data.rs
@@ -77,6 +77,60 @@ impl CollectiblesRef {
     }
 }
 
+/// A dependency edge on a cell: a [`CellRef`] plus an optional `u64` hash key that narrows it.
+///
+/// The same type is stored for both edge directions, with a different meaning for `task`:
+/// - In `cell_dependencies`, the [`CellRef`] is the cell another task owns that this task depends
+///   on.
+/// - In `cell_dependents`, the [`CellRef`]'s `task` is the dependent task and `cell` is the cell of
+///   the storing task; the `task` field is reused as the dependent's id rather than the cell's
+///   owning task. The fields encode the same bits either way.
+#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq, Encode, Decode)]
+pub enum CellDependency {
+    /// Depend on the cell as a whole (the counterpart of a `None` key).
+    All(CellRef),
+    /// Depend only on the sub-value identified by this hash key (the counterpart of `Some(key)`).
+    Hash(CellRef, u64),
+}
+
+impl CellDependency {
+    /// Returns the [`CellRef`] stored in either variant.
+    pub fn cell_ref(&self) -> CellRef {
+        let (Self::All(cell_ref) | Self::Hash(cell_ref, _)) = *self;
+        cell_ref
+    }
+
+    /// Returns the hash key of a `Hash` edge, or `None` for an `All` edge.
+    pub fn key(&self) -> Option<u64> {
+        match *self {
+            Self::Hash(_, key) => Some(key),
+            Self::All(_) => None,
+        }
+    }
+
+    /// Splits the edge into its `(CellRef, Option<u64>)` parts with a single match.
+    ///
+    /// Prefer this over back-to-back `cell_ref()` + `key()` calls: the discriminant is checked
+    /// once, which matters in hot loops over `iter_cell_dependents` / `iter_cell_dependencies`.
+    pub fn into_parts(self) -> (CellRef, Option<u64>) {
+        match self {
+            Self::Hash(cell_ref, key) => (cell_ref, Some(key)),
+            Self::All(cell_ref) => (cell_ref, None),
+        }
+    }
+
+    /// Inverse of [`Self::into_parts`]: `None` builds an `All` edge, `Some(key)` a `Hash`
+    /// edge.
+    pub fn new(cell_ref: CellRef, key: Option<u64>) -> Self {
+        key.map_or(Self::All(cell_ref), |key| Self::Hash(cell_ref, key))
+    }
+
+    /// Forwards to `is_transient` on the stored [`CellRef`].
+    pub fn is_transient(&self) -> bool {
+        self.cell_ref().is_transient()
+    }
+}
+
 #[derive(Debug, Clone, PartialEq, Eq, Encode, Decode)]
 pub enum OutputValue {
     Cell(CellRef),
diff --git a/turbopack/crates/turbo-tasks-macros/src/derive/task_storage_macro.rs b/turbopack/crates/turbo-tasks-macros/src/derive/task_storage_macro.rs
index 799ed85c7627..27268b60f830 100644
--- a/turbopack/crates/turbo-tasks-macros/src/derive/task_storage_macro.rs
+++ b/turbopack/crates/turbo-tasks-macros/src/derive/task_storage_macro.rs
@@ -1296,12 +1296,21 @@ fn generate_typed_storage_struct(grouped_fields: &GroupedFields) -> TokenStream
         quote! {}
     };
 
-    // Add lazy vec field if needed (pub(crate) - used by helper methods)
-    // Note: Serialization is handled manually via encode_data/encode_meta methods
     let lazy_field = if has_lazy {
+        // `TinyVec`'s `MAX` const generic is set to the exact number of lazy fields declared
+        // in the schema. This caps growth at the smallest power-of-two-or-MAX boundary
+        // (e.g. with 24 variants we end at cap=24 instead of cap=32), saving a few slots
+        // per fully-populated task. It also makes "push past MAX" a compile-time-bounded
+        // contract instead of relying on `u8::MAX`.
+        //
+        // Checked conversion: an `as u8` cast would silently wrap for > 255 lazy fields (e.g.
+        // 257 -> 1), and `TinyVec`'s `MAX > 0` guard only catches exact multiples of 256.
+        // Panicking here turns an oversized schema into a compile error at macro expansion.
+        let max_lazy = u8::try_from(grouped_fields.all_lazy().count())
+            .expect("#[task_storage] supports at most 255 lazy fields (TinyVec MAX is a u8)");
         quote! {
-            #[doc = "Lazily-allocated fields stored in a single Vec for memory efficiency"]
-            lazy: Vec<LazyField>,
+            #[doc = "Lazily-allocated fields stored in a compact TinyVec for memory efficiency"]
+            lazy: TinyVec<LazyField, #max_lazy>,
         }
     } else {
         quote! {}
@@ -3686,10 +3695,10 @@ fn generate_snapshot_restore_methods(grouped_fields: &GroupedFields) -> TokenStr
 
                 #clone_all_flags
 
-                // Pre-allocate lazy vec (upper bound - some may be transient and skipped)
-                snapshot.lazy.reserve(self.lazy.len());
-
-                // Clone all persistent lazy fields (both meta and data)
+                // Clone all persistent lazy fields (both meta and data).
+                // (No pre-`reserve`: the schema has ≤24 lazy fields, so at most 4 grows
+                // (0→4→8→16→24) total — cheaper than complicating the public API surface
+                // of `TinyVec`.)
                 for field in &self.lazy {
                     match field {
                         #(#clone_data_lazy_arms)*
@@ -3746,7 +3755,7 @@ fn generate_snapshot_restore_methods(grouped_fields: &GroupedFields) -> TokenStr
                 // and merge each source variant in O(1).
                 let (any_meta, _any_data, index) = Self::build_lazy_index(&self.lazy);
                 if !any_meta {
-                    self.lazy.extend(source.lazy);
+                    self.lazy.extend_exact(source.lazy);
                 } else {
                     for field in source.lazy {
                         debug_assert!(field.is_persistent() && field.is_meta());
@@ -3770,7 +3779,7 @@ fn generate_snapshot_restore_methods(grouped_fields: &GroupedFields) -> TokenStr
                 // in `self.lazy` is never a collision risk.
                 let (_any_meta, any_data, index) = Self::build_lazy_index(&self.lazy);
                 if !any_data {
-                    self.lazy.extend(source.lazy);
+                    self.lazy.extend_exact(source.lazy);
                 } else {
                     for field in source.lazy {
                         debug_assert!(field.is_persistent() && field.is_data());
diff --git a/turbopack/crates/turbo-tasks/Cargo.toml b/turbopack/crates/turbo-tasks/Cargo.toml
index 6f1733067135..619576daf694 100644
--- a/turbopack/crates/turbo-tasks/Cargo.toml
+++ b/turbopack/crates/turbo-tasks/Cargo.toml
@@ -65,3 +65,7 @@ criterion = { workspace = true, features = ["async_tokio"] }
 [[bench]]
 name = "mod"
 harness = false
+
+[[bench]]
+name = "tiny_vec"
+harness = false
diff --git a/turbopack/crates/turbo-tasks/benches/tiny_vec.rs b/turbopack/crates/turbo-tasks/benches/tiny_vec.rs
new file mode 100644
index 000000000000..02124e141168
--- /dev/null
+++ b/turbopack/crates/turbo-tasks/benches/tiny_vec.rs
@@ -0,0 +1,122 @@
+//! Direct comparison between `TinyVec<T>` and the standard `Vec<T>` on the
+//! operations that `TaskStorage::lazy` actually exercises:
+//!
+//!   * `push` — appending one element at a time, growing through capacity boundaries.
+//!   * `iter` — linear scan (this is how `find_lazy(id)` works under the hood).
+//!
+//! These are micro-benchmarks: the values pushed are small 16 B `Item`s (two `u64`s)
+//! standing in for the 48 B `LazyField` without dragging in the entire schema. The
+//! goal is to validate that switching `lazy` from `Vec` to `TinyVec` doesn't
+//! cost throughput at the API level, since `Vec::push` is heavily optimized
+//! and our hand-rolled `TinyVec::push` is not.
+
+use std::hint::black_box;
+
+use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main};
+use turbo_tasks::TinyVec;
+
+#[global_allocator]
+static ALLOC: turbo_tasks_malloc::TurboMalloc = turbo_tasks_malloc::TurboMalloc;
+
+/// A stand-in for `LazyField`: two `u64`s (16 B), so pushes and scans move real data
+/// without needing the whole schema. The real `LazyField` is larger (48 B).
+#[derive(Clone, Copy)]
+#[allow(dead_code)]
+struct Item {
+    tag: u64,
+    payload: u64,
+}
+
+/// Builds the `i`-th item; `payload` is `i` scrambled by a wrapping multiplication.
+fn make_item(i: u64) -> Item {
+    let tag = i;
+    let payload = i.wrapping_mul(0x9E37_79B9_7F4A_7C15);
+    Item { tag, payload }
+}
+
+/// Appends `n` items one `push` at a time to an initially empty `Vec`, growing as needed.
+/// The filled `Vec` is returned so its deallocation happens wherever the caller drops it.
+fn push_vec(n: usize) -> Vec<Item> {
+    let mut items = Vec::<Item>::new();
+    for index in 0..n as u64 {
+        items.push(make_item(index));
+    }
+    items
+}
+
+fn push_tinyvec(n: usize) -> TinyVec<Item> {
+    let mut items: TinyVec<Item> = TinyVec::default(); // default `MAX` (255)
+    for index in 0..n as u64 {
+        items.push(make_item(index));
+    }
+    items
+}
+
+/// Linear scan: folds every item's `tag` and `payload` into one wrapping `u64` sum.
+#[allow(clippy::ptr_arg)] // for clarity
+fn iter_vec(v: &Vec<Item>) -> u64 {
+    let mut sum = 0u64;
+    for item in v {
+        sum = sum.wrapping_add(item.tag).wrapping_add(item.payload);
+    }
+    sum
+}
+
+fn iter_tinyvec(v: &TinyVec<Item>) -> u64 {
+    let mut sum = 0u64; // same wrapping fold as `iter_vec`
+    for item in v.iter() {
+        sum = sum.wrapping_add(item.tag).wrapping_add(item.payload);
+    }
+    sum
+}
+
+pub fn bench(c: &mut Criterion) {
+    // Sizes chosen to cover the realistic `TaskStorage::lazy` range:
+    //   0   — empty (steady state for many tasks)
+    //   1   — single field set (very common)
+    //   4   — exactly fills `TinyVec`'s first allocation (it grows 0 -> 4 -> 8 -> 16 ...)
+    //   8   — exactly fills the second growth step (8 x 16 B items = 128 B)
+    //   16  — exactly fills the third growth step
+    //   24  — close to the realistic max (~25 lazy fields in the schema)
+    let sizes = [0usize, 1, 4, 8, 16, 24];
+
+    // --- push (container drop happens inside the timed closure) ----------------
+
+    let mut group = c.benchmark_group("tiny_vec/push");
+    group.sample_size(200);
+    for &n in &sizes {
+        group.bench_with_input(BenchmarkId::new("Vec", n), &n, |b, &n| {
+            b.iter(|| {
+                let v = push_vec(black_box(n));
+                black_box(v);
+            });
+        });
+        group.bench_with_input(BenchmarkId::new("TinyVec", n), &n, |b, &n| {
+            b.iter(|| {
+                let v = push_tinyvec(black_box(n));
+                black_box(v);
+            });
+        });
+    }
+    group.finish();
+
+    // --- iter (read-only scan over pre-filled containers) -----------------------
+
+    let mut group = c.benchmark_group("tiny_vec/iter");
+    group.sample_size(200);
+    for &n in &sizes {
+        // Pre-fill once outside the timed region.
+        let v: Vec<Item> = push_vec(n);
+        let tv: TinyVec<Item> = push_tinyvec(n);
+        group.bench_with_input(BenchmarkId::new("Vec", n), &n, |b, _| {
+            b.iter(|| black_box(iter_vec(black_box(&v))));
+        });
+        group.bench_with_input(BenchmarkId::new("TinyVec", n), &n, |b, _| {
+            b.iter(|| black_box(iter_tinyvec(black_box(&tv))));
+        });
+    }
+    group.finish();
+}
+
+criterion_group!(tiny_vec_benches, bench);
+criterion_main!(tiny_vec_benches);
diff --git a/turbopack/crates/turbo-tasks/src/backend.rs b/turbopack/crates/turbo-tasks/src/backend.rs
index d9a1e12325c8..88e1cc8d4180 100644
--- a/turbopack/crates/turbo-tasks/src/backend.rs
+++ b/turbopack/crates/turbo-tasks/src/backend.rs
@@ -1,9 +1,10 @@
 use std::{
-    borrow::Cow,
+    borrow::{Borrow, Cow},
     error::Error,
     fmt::{self, Debug, Display},
     future::Future,
     hash::{BuildHasher, BuildHasherDefault, Hash},
+    ops::Deref,
     pin::Pin,
     sync::Arc,
 };
@@ -131,6 +132,76 @@ impl_encode_for_turbo_bincode_encode!(CachedTaskType);
 impl_decode_for_turbo_bincode_decode!(CachedTaskType);
 impl_borrow_decode!(CachedTaskType);
 
+/// A reference-counted pointer to a [`CachedTaskType`], backed by `triomphe::Arc`.
+///
+/// `triomphe::Arc` has no weak count, which saves one `usize` per allocation and the
+/// weak-count CAS in `drop_slow` compared to `std::sync::Arc`. We never need
+/// `Weak<CachedTaskType>`, so the trade-off is favorable.
+#[derive(Clone, Debug, Hash, PartialEq, Eq)]
+pub struct CachedTaskTypeArc(pub triomphe::Arc<CachedTaskType>);
+
+impl CachedTaskTypeArc {
+    /// Moves `value` into a newly allocated `triomphe::Arc`.
+    pub fn new(value: CachedTaskType) -> Self {
+        Self(triomphe::Arc::new(value))
+    }
+
+    /// Returns `triomphe::Arc::count` for the wrapped pointer (`1` means sole owner).
+    pub fn count(&self) -> usize {
+        triomphe::Arc::count(&self.0)
+    }
+}
+
+impl AsRef<CachedTaskType> for CachedTaskTypeArc {
+    fn as_ref(&self) -> &CachedTaskType {
+        &**self
+    }
+}
+
+impl Deref for CachedTaskTypeArc {
+    type Target = CachedTaskType;
+    #[inline]
+    fn deref(&self) -> &CachedTaskType {
+        &*self.0
+    }
+}
+
+impl Borrow<CachedTaskType> for CachedTaskTypeArc {
+    #[inline]
+    fn borrow(&self) -> &CachedTaskType {
+        &**self
+    }
+}
+
+impl Display for CachedTaskTypeArc {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        <CachedTaskType as Display>::fmt(self, f)
+    }
+}
+
+// Encoded exactly like the inner `CachedTaskType`; the `Arc` layer itself is not serialized.
+impl Encode for CachedTaskTypeArc {
+    fn encode<E: Encoder>(&self, encoder: &mut E) -> Result<(), EncodeError> {
+        Encode::encode(&**self, encoder)
+    }
+}
+
+// Decoding wraps every decoded value in a fresh `Arc` via `Self::new`.
+impl<Context> Decode<Context> for CachedTaskTypeArc {
+    fn decode<D: Decoder<Context = Context>>(decoder: &mut D) -> Result<Self, DecodeError> {
+        <CachedTaskType as Decode<Context>>::decode(decoder).map(Self::new)
+    }
+}
+
+impl<'de, Context> bincode::BorrowDecode<'de, Context> for CachedTaskTypeArc {
+    fn borrow_decode<D: bincode::de::BorrowDecoder<'de, Context = Context>>(
+        decoder: &mut D,
+    ) -> Result<Self, DecodeError> {
+        <CachedTaskType as bincode::BorrowDecode<'de, Context>>::borrow_decode(decoder)
+            .map(Self::new)
+    }
+}
+
 // Manual implementation is needed because of a borrow issue with `Box<dyn Trait>`:
 // https://github.com/rust-lang/rust/issues/31740
 impl PartialEq for CachedTaskType {
diff --git a/turbopack/crates/turbo-tasks/src/lib.rs b/turbopack/crates/turbo-tasks/src/lib.rs
index a84f4c038924..4bbef82bef5d 100644
--- a/turbopack/crates/turbo-tasks/src/lib.rs
+++ b/turbopack/crates/turbo-tasks/src/lib.rs
@@ -55,6 +55,7 @@ mod state;
 pub mod task;
 mod task_execution_reason;
 pub mod task_statistics;
+mod tiny_vec;
 pub mod trace;
 mod trait_ref;
 mod triomphe_utils;
@@ -108,6 +109,7 @@ pub use crate::{
         task_input::{EitherTaskInput, TaskInput},
     },
     task_execution_reason::TaskExecutionReason,
+    tiny_vec::TinyVec,
     trait_ref::TraitRef,
     value::{TransientInstance, TransientValue},
     value_type::{Evictability, TraitMethod, TraitType, ValueType, ValueTypePersistence},
diff --git a/turbopack/crates/turbo-tasks/src/tiny_vec.rs b/turbopack/crates/turbo-tasks/src/tiny_vec.rs
new file mode 100644
index 000000000000..7f30bc7ade8a
--- /dev/null
+++ b/turbopack/crates/turbo-tasks/src/tiny_vec.rs
@@ -0,0 +1,671 @@
+//! A `Vec`-shaped container with `u8` length and capacity, sized 16 B on 64-bit instead of 24 B.
+//!
+//! Used by `#[task_storage]` for `TaskStorage`'s lazy-fields field, which holds at most ~25
+//! elements (one per declared lazy field in the schema). With several million task storages
+//! live during a typical Next.js build, the 8 B saved per task adds up to dozens of MB of
+//! resident memory.
+//!
+//! The API is intentionally close to a subset of `Vec`, covering only what the task-storage
+//! callers and the `#[task_storage]` macro emit need: `len`, `iter`, `iter_mut`, `push`,
+//! `swap_remove`, `last_mut`, `index`, `index_mut`, `extend`, `extend_exact`, `retain_mut`,
+//! `Default`, `Debug`, `ShrinkToFit` (`reserve` exists but is private). No `Clone` or
+//! `PartialEq` — `TaskStorage` doesn't derive them.
+//!
+//! ## Capacity
+//!
+//! `TinyVec<T, MAX>` is statically capped at `MAX <= 255` elements. Pushing past `MAX`
+//! panics. Growth doubles until it would exceed `MAX`, then caps at exactly `MAX`. The
+//! default `MAX = 255` covers any container that fits the type's `u8` cap.
+//!
+//! For `TaskStorage::lazy` the schema emits `TinyVec<LazyField, 25>`, which tightens the
+//! steady-state allocation: a fully-populated lazy vec ends at cap=25 instead of cap=32
+//! (the next power of two), saving 7 slots × `size_of::<LazyField>()` ≈ 336 B per such
+//! task.
+
+use std::{
+    alloc::{Layout, alloc, dealloc, handle_alloc_error},
+    fmt,
+    marker::PhantomData,
+    mem::ManuallyDrop,
+    ptr::{self, NonNull},
+};
+
+/// Compact `Vec`-shaped container with a statically-bounded capacity; see module docs for
+/// rationale. `MAX` defaults to `u8::MAX = 255` (the largest value the `u8` cap field can hold).
+pub struct TinyVec<T, const MAX: u8 = { u8::MAX }> {
+    /// Heap pointer; the first `len` slots are initialized. Dangling when `cap == 0`.
+    ptr: NonNull<T>,
+    len: u8, // number of initialized elements
+    cap: u8, // number of allocated slots; growth never goes past `MAX`
+    /// Marker so we own `T` for drop-check purposes (matches `Vec<T>`'s variance/dropck).
+    _marker: PhantomData<T>,
+}
+
+// SAFETY: same reasoning as `Vec<T>` — the buffer behind `ptr` is uniquely owned by this
+// `TinyVec`, so moving it across threads needs `T: Send` and sharing `&TinyVec` needs `T: Sync`.
+unsafe impl<T: Send, const MAX: u8> Send for TinyVec<T, MAX> {}
+unsafe impl<T: Sync, const MAX: u8> Sync for TinyVec<T, MAX> {}
+
+impl<T, const MAX: u8> Default for TinyVec<T, MAX> {
+    fn default() -> Self {
+        Self::new() // empty: `len == 0`, `cap == 0`, dangling `ptr`
+    }
+}
+
+impl<T, const MAX: u8> TinyVec<T, MAX> {
+    // Compile-time assertion that `MAX > 0`. An associated const is only evaluated when it is
+    // used, so `new()` references it: any `TinyVec<T, 0>` that gets constructed then fails to
+    // compile (post-monomorphization error) instead of panicking at run time.
+    const _ASSERT_MAX_NONZERO: () = assert!(MAX > 0, "TinyVec MAX must be > 0");
+
+    const fn new() -> Self {
+        // Evaluate the `MAX > 0` assertion for this `MAX`: binding the unit-typed const is
+        // what makes the compiler check it. Clippy's `let_unit_value` lint is silenced
+        // because the otherwise pointless binding is intentional.
+        #[allow(clippy::let_unit_value)]
+        let _: () = Self::_ASSERT_MAX_NONZERO;
+        Self {
+            ptr: NonNull::dangling(), // placeholder: nothing is allocated while `cap == 0`
+            len: 0,
+            cap: 0,
+            _marker: PhantomData,
+        }
+    }
+
+    /// Retains only the elements for which the predicate returns `true`. See
+    /// [`Vec::retain_mut`] for semantics including panic safety.
+    ///
+    /// Delegates to `Vec::retain_mut`. Implementing retain_mut directly requires a
+    /// panic-safe partial-shift dance that's the trickiest unsafe code in this module; the
+    /// `Vec` version is identical in shape and is exercised by the standard library's own
+    /// tests. Round-tripping through `Vec` for this one operation is worth the soundness
+    /// improvement, especially since `retain_mut` is cold relative to `push`.
+    pub fn retain_mut(&mut self, f: impl FnMut(&mut T) -> bool) {
+        if self.len == 0 {
+            return;
+        }
+
+        // Panic safety: transfer buffer ownership to the local `Vec` *before* the closure
+        // can panic. Zeroing `cap` first means our `Drop` becomes a no-op until we restore
+        // it below — if `f` panics, `vec`'s Drop frees the buffer exactly once and our
+        // Drop (which may run during continued unwinding) does nothing.
+        let ptr = self.ptr.as_ptr();
+        let len = self.len as usize;
+        let cap = self.cap as usize;
+        self.cap = 0;
+        self.len = 0;
+
+        // SAFETY: by struct invariant, `(ptr, len, cap)` is a valid `Vec::from_raw_parts`
+        // triple.
+        let mut vec = unsafe { Vec::from_raw_parts(ptr, len, cap) };
+        vec.retain_mut(f);
+
+        // No panic. Take ownership of the (possibly element-dropped) buffer back.
+        // `retain_mut` never reallocates, so we restore the `cap` saved above. The capacity
+        // `Vec` reports must be ignored: it is `usize::MAX` for zero-sized `T`, which would
+        // truncate to 255 and could exceed `MAX`.
+        let (new_ptr, new_len, _) = vec.into_raw_parts();
+        // SAFETY: `Vec::into_raw_parts` returns a non-null pointer; same buffer as on entry.
+        self.ptr = unsafe { NonNull::new_unchecked(new_ptr) };
+        self.len = new_len as u8; // lossless: `new_len <= len <= u8::MAX`
+        self.cap = cap as u8; // lossless: `cap` was widened from a `u8` above
+    }
+
+    #[inline]
+    pub fn len(&self) -> usize {
+        usize::from(self.len) // lossless `u8` -> `usize` widening
+    }
+
+    /// Returns `true` if the container holds no elements. Pairs with [`Self::len`] to satisfy
+    /// clippy's `len_without_is_empty`; the slice `is_empty` is also reachable through `Deref`.
+    #[inline]
+    pub fn is_empty(&self) -> bool {
+        self.len == 0
+    }
+
+    // Test-only accessor for the number of allocated slots; external callers don't need it.
+    #[cfg(test)]
+    #[inline]
+    fn capacity(&self) -> usize {
+        usize::from(self.cap)
+    }
+
+    // `iter`, `iter_mut`, `last_mut`, and indexing are reachable through
+    // `Deref`/`DerefMut` to `[T]`, so they need no inherent methods.
+
+    #[inline]
+    fn as_slice(&self) -> &[T] {
+        // SAFETY: `ptr` is non-null and valid for reads of `len` initialized elements; for
+        // `len == 0` it may be dangling, which `from_raw_parts` explicitly permits.
+        unsafe { std::slice::from_raw_parts(self.ptr.as_ptr(), self.len()) }
+    }
+
+    #[inline]
+    fn as_mut_slice(&mut self) -> &mut [T] {
+        // SAFETY: same invariants as `as_slice`; `&mut self` guarantees exclusive access.
+        unsafe { std::slice::from_raw_parts_mut(self.ptr.as_ptr(), self.len()) }
+    }
+
+    /// Appends `value`. Panics if `len == MAX`.
+    pub fn push(&mut self, value: T) {
+        if self.len == self.cap {
+            // This check is load-bearing, not just a nicer message: `grow_by_one` clamps its
+            // target to MAX, so at `cap == MAX` `realloc_to` would return without growing and
+            // the write below would land one slot past the buffer.
+            assert!(
+                (self.len as usize) < MAX as usize,
+                "TinyVec capacity overflow: already at MAX = {MAX}",
+            );
+            self.grow_by_one();
+        }
+        // SAFETY: `len < cap` here (after the grow, or already); slot `len` is in bounds and
+        // uninitialized, so `ptr::write` initializes it without dropping an old value.
+        unsafe {
+            ptr::write(self.ptr.as_ptr().add(self.len()), value);
+        }
+        self.len += 1;
+    }
+
+    /// Removes and returns the element at `idx`, moving the last element into its slot. O(1).
+    /// Panics if `idx` is out of bounds (matching `Vec::swap_remove`).
+    pub fn swap_remove(&mut self, idx: usize) -> T {
+        let len = self.len();
+        assert!(idx < len, "swap_remove index out of bounds: {idx} >= {len}");
+        let base = self.ptr.as_ptr();
+        // SAFETY: `idx < len`, so slots `idx` and `len - 1` are in bounds and initialized;
+        // the copy is skipped when they coincide because the regions must not overlap.
+        unsafe {
+            let removed = ptr::read(base.add(idx));
+            if idx + 1 < len {
+                ptr::copy_nonoverlapping(base.add(len - 1), base.add(idx), 1);
+            }
+            self.len -= 1;
+            removed
+        }
+    }
+
+    /// Reserves capacity for at least `additional` more elements. No-op if the current
+    /// capacity already suffices.
+    ///
+    /// When growth is needed, the required capacity is rounded up to the next power of two
+    /// (minimum 4) and clamped to `MAX`.
+    ///
+    /// Private: used by `extend_exact` internally; no external callers.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `len + additional` exceeds `MAX`, since such a request can never be
+    /// satisfied. Failing here, before anything is written, avoids a partial extend followed
+    /// by a panic on a later `push`.
+    fn reserve(&mut self, additional: usize) {
+        // Saturate rather than wrap: a huge `additional` must trip the `MAX` check below
+        // instead of overflowing to a small number that looks satisfiable.
+        let needed = self.len().saturating_add(additional);
+        if needed <= self.cap as usize {
+            return;
+        }
+        assert!(
+            needed <= MAX as usize,
+            "TinyVec capacity overflow: requested {needed}, max {MAX}",
+        );
+        // Round up to next power of two (min 4), but never exceed MAX.
+        let target = needed.next_power_of_two().max(4).min(MAX as usize);
+        self.realloc_to(target);
+    }
+
+    /// Enlarges the buffer so that at least one more element fits. An empty buffer jumps
+    /// straight to 4 slots to amortize the first pushes; after that the capacity doubles,
+    /// clamped to `MAX`.
+    ///
+    /// When the capacity already equals `MAX`, the clamped target equals the current capacity
+    /// and `realloc_to` returns without growing, so callers must rule out saturation first.
+    #[cold]
+    #[inline(never)]
+    fn grow_by_one(&mut self) {
+        let wanted = match usize::from(self.cap) {
+            0 => 4,
+            current => current * 2,
+        };
+        self.realloc_to(wanted.min(usize::from(MAX)));
+    }
+
+    /// Replaces the buffer with a freshly allocated one of exactly `new_cap` slots, moving the
+    /// `len` live elements across and freeing the old buffer.
+    ///
+    /// Returns early when `new_cap` equals the current capacity. For zero-sized `T` nothing is
+    /// allocated; only the recorded capacity changes.
+    ///
+    /// `new_cap` is not checked against `len` here, so callers must pass a capacity that can
+    /// hold the current elements.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `new_cap` exceeds `MAX`.
+    fn realloc_to(&mut self, new_cap: usize) {
+        assert!(
+            new_cap <= usize::from(MAX),
+            "TinyVec capacity overflow: requested {new_cap}, max {MAX}",
+        );
+        let old_cap = usize::from(self.cap);
+        if new_cap == old_cap {
+            return;
+        }
+        if size_of::<T>() == 0 {
+            // Zero-sized elements occupy no memory, so only the bookkeeping moves.
+            self.cap = new_cap as u8;
+            return;
+        }
+
+        let new_layout = Layout::array::<T>(new_cap).expect("TinyVec layout overflow");
+        // SAFETY: the layout has nonzero size. `T` is not zero-sized (handled above), and the
+        // callers in this impl only reach this point with `new_cap > 0`.
+        let raw = unsafe { alloc(new_layout) }.cast::<T>();
+        let new_ptr = NonNull::new(raw).unwrap_or_else(|| handle_alloc_error(new_layout));
+
+        if old_cap > 0 {
+            // SAFETY: the old buffer holds `len` initialized values and the new buffer is a
+            // distinct, uninitialized allocation, so a non-overlapping bitwise move is valid.
+            unsafe {
+                ptr::copy_nonoverlapping(self.ptr.as_ptr(), new_ptr.as_ptr(), self.len());
+            }
+            // Still sees the old `ptr`/`cap`, which are only overwritten below.
+            self.deallocate_old();
+        }
+
+        self.ptr = new_ptr;
+        self.cap = new_cap as u8;
+    }
+
+    /// Frees the current heap buffer without running element destructors; the caller must
+    /// already have moved the elements out or dropped them. Does nothing when `cap == 0` or
+    /// when `T` is zero-sized.
+    ///
+    /// `ptr` and `cap` are left untouched, so the caller is responsible for overwriting them
+    /// afterwards.
+    ///
+    /// Marked `#[inline]` so the early return can fold into the `Drop` call site for empty
+    /// containers instead of costing a function call.
+    #[inline]
+    fn deallocate_old(&mut self) {
+        let cap = usize::from(self.cap);
+        if size_of::<T>() == 0 || cap == 0 {
+            return;
+        }
+        let old_layout =
+            Layout::array::<T>(cap).expect("TinyVec layout was valid when allocated");
+        // SAFETY: a nonzero `cap` with a sized `T` means `ptr` was returned by `alloc` for an
+        // array layout of exactly `cap` elements, which is the layout rebuilt above.
+        unsafe {
+            dealloc(self.ptr.as_ptr().cast::<u8>(), old_layout);
+        }
+    }
+
+    /// Shrinks the heap buffer to fit `len`, freeing it entirely if `len == 0`.
+    ///
+    /// Zero-sized element types never own a heap buffer, so for them only the recorded
+    /// capacity is lowered to `len`.
+    pub fn shrink_to_fit(&mut self) {
+        if self.len == self.cap {
+            // Already tight; nothing to do.
+            return;
+        }
+        if self.len == 0 {
+            // Free the buffer entirely.
+            self.deallocate_old();
+            self.ptr = NonNull::dangling();
+            self.cap = 0;
+            return;
+        }
+        if size_of::<T>() == 0 {
+            // `realloc_to` raises `cap` for ZSTs without allocating, so `0 < len < cap` is
+            // reachable here. Falling through would call `alloc` with a zero-sized layout,
+            // which is undefined behavior; adjusting the bookkeeping is all that is needed.
+            self.cap = self.len;
+            return;
+        }
+        let new_cap = self.len as usize;
+        // Allocate a smaller buffer, copy, free old.
+        let new_layout = Layout::array::<T>(new_cap).expect("TinyVec layout overflow");
+        // SAFETY: the layout has nonzero size: `new_cap > 0` (the empty case returned above)
+        // and `T` is not zero-sized (ZSTs returned above).
+        let new_ptr = unsafe { alloc(new_layout) } as *mut T;
+        let new_ptr = match NonNull::new(new_ptr) {
+            Some(p) => p,
+            None => handle_alloc_error(new_layout),
+        };
+        // SAFETY: the old buffer holds `len` initialized Ts and the new buffer is a distinct
+        // allocation with room for exactly `len` of them.
+        unsafe {
+            ptr::copy_nonoverlapping(self.ptr.as_ptr(), new_ptr.as_ptr(), self.len());
+        }
+        // Frees the old buffer using the old `ptr`/`cap`, which are replaced just below.
+        self.deallocate_old();
+        self.ptr = new_ptr;
+        self.cap = new_cap as u8;
+    }
+}
+
+// `Index<usize>` / `IndexMut<usize>` are reachable through `Deref<Target=[T]>` —
+// `[T]: Index<usize>` and autoderef makes `tv[i]` work. No need to implement them here.
+
+impl<T, const MAX: u8> std::ops::Deref for TinyVec<T, MAX> {
+    type Target = [T];
+
+    /// Views the initialized elements as a shared slice, which makes the read-only slice API
+    /// (`iter`, indexing, ...) available on the container.
+    fn deref(&self) -> &Self::Target {
+        Self::as_slice(self)
+    }
+}
+
+impl<T, const MAX: u8> std::ops::DerefMut for TinyVec<T, MAX> {
+    /// Views the initialized elements as an exclusive slice, which makes the mutating slice
+    /// API (`iter_mut`, `last_mut`, index assignment, ...) available on the container.
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        Self::as_mut_slice(self)
+    }
+}
+
+impl<T, const MAX: u8> TinyVec<T, MAX> {
+    /// Appends every item of an exact-sized iterator.
+    ///
+    /// Because the iterator reports its exact length, capacity is reserved once up front
+    /// rather than relying on the `size_hint().0` lower bound, and the items are then pushed
+    /// one by one.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the resulting length would exceed `MAX`.
+    pub fn extend_exact<I>(&mut self, iter: I)
+    where
+        I: IntoIterator<Item = T>,
+        I::IntoIter: ExactSizeIterator,
+    {
+        let items = iter.into_iter();
+        let incoming = items.len();
+        self.reserve(incoming);
+        items.for_each(|item| self.push(item));
+    }
+}
+
+impl<T, const MAX: u8> IntoIterator for TinyVec<T, MAX> {
+    type Item = T;
+    type IntoIter = std::vec::IntoIter<T>;
+
+    /// Consumes the container and yields its elements by value.
+    ///
+    /// The buffer is handed over to a `Vec`, whose `IntoIter` then takes care of yielding
+    /// the elements, dropping any that remain, and freeing the allocation.
+    fn into_iter(self) -> Self::IntoIter {
+        // Suppress our own `Drop`: ownership of the buffer moves to the `Vec` built below, and
+        // letting both run would free it twice.
+        let this = ManuallyDrop::new(self);
+        let data = this.ptr.as_ptr();
+        let len = usize::from(this.len);
+        let cap = usize::from(this.cap);
+        // SAFETY: by struct invariant, `(ptr, len, cap)` is a valid `Vec::from_raw_parts`
+        // triple.
+        let as_vec = unsafe { Vec::from_raw_parts(data, len, cap) };
+        as_vec.into_iter()
+    }
+}
+
+// `for x in &tv` and `for x in &mut tv` only compile when `&TinyVec` / `&mut TinyVec`
+// themselves implement `IntoIterator`: the `for` desugaring does not apply `Deref` coercion
+// across the reference boundary. The explicit impls are trivial and simply hand out the
+// corresponding slice iterators.
+impl<'a, T, const MAX: u8> IntoIterator for &'a TinyVec<T, MAX> {
+    type Item = &'a T;
+    type IntoIter = std::slice::Iter<'a, T>;
+
+    /// Iterates over shared references to the elements, in order.
+    fn into_iter(self) -> Self::IntoIter {
+        let elements: &'a [T] = self.as_slice();
+        elements.iter()
+    }
+}
+
+impl<'a, T, const MAX: u8> IntoIterator for &'a mut TinyVec<T, MAX> {
+    type Item = &'a mut T;
+    type IntoIter = std::slice::IterMut<'a, T>;
+
+    /// Iterates over exclusive references to the elements, in order.
+    fn into_iter(self) -> Self::IntoIter {
+        let elements: &'a mut [T] = self.as_mut_slice();
+        elements.iter_mut()
+    }
+}
+
+impl<T: fmt::Debug, const MAX: u8> fmt::Debug for TinyVec<T, MAX> {
+    /// Formats the container as a list of its elements, exactly like a slice.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        // The slice impl emits `debug_list` over the elements, so delegating keeps the output
+        // (including `{:#?}` pretty-printing) the same as a plain list of entries.
+        fmt::Debug::fmt(self.as_slice(), f)
+    }
+}
+
+impl<T, const MAX: u8> Drop for TinyVec<T, MAX> {
+    /// Drops the live elements in place, then frees the heap buffer.
+    #[inline]
+    fn drop(&mut self) {
+        // Fast path: a container with no capacity has neither elements nor a buffer, so both
+        // the element drop and the deallocation can be skipped.
+        if self.cap == 0 {
+            return;
+        }
+        if !self.is_empty() {
+            let live = std::ptr::slice_from_raw_parts_mut(self.ptr.as_ptr(), self.len());
+            // SAFETY: the first `len` slots of the buffer are initialized and owned by us, and
+            // nothing reads them again after this point.
+            unsafe {
+                ptr::drop_in_place(live);
+            }
+        }
+        // `deallocate_old` is itself a no-op for zero-sized `T`.
+        self.deallocate_old();
+    }
+}
+
+impl<T, const MAX: u8> shrink_to_fit::ShrinkToFit for TinyVec<T, MAX> {
+    /// Forwards to the inherent [`TinyVec::shrink_to_fit`].
+    fn shrink_to_fit(&mut self) {
+        // The fully qualified path resolves to the inherent method (inherent items take
+        // precedence over trait items), so this does not recurse into the trait method.
+        TinyVec::<T, MAX>::shrink_to_fit(self);
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Test helper: build a TinyVec from an exact-sized iterator. Replaces the previous use
+    /// of `Iterator::collect()` after we removed the `FromIterator` impl.
+    fn from_exact<T, I, const MAX: u8>(iter: I) -> TinyVec<T, MAX>
+    where
+        I: IntoIterator<Item = T>,
+        I::IntoIter: ExactSizeIterator,
+    {
+        let mut v = TinyVec::new();
+        v.extend_exact(iter);
+        v
+    }
+
+    /// Pins the in-memory footprint, independent of the element type.
+    ///
+    /// The expected 16 bytes only hold with 8-byte pointers, so the test is compiled for
+    /// 64-bit targets only instead of failing spuriously elsewhere.
+    #[test]
+    #[cfg(target_pointer_width = "64")]
+    fn size() {
+        // The whole point: 16 B on 64-bit, vs 24 B for Vec.
+        assert_eq!(std::mem::size_of::<TinyVec<u64>>(), 16);
+        assert_eq!(std::mem::size_of::<TinyVec<[u8; 48]>>(), 16);
+    }
+
+    /// Basic push / iterate / `swap_remove` round trip, including the reordering that
+    /// `swap_remove` causes.
+    #[test]
+    fn push_iter_swap_remove() {
+        let mut v: TinyVec<u32> = TinyVec::new();
+        assert!(v.is_empty());
+        v.push(10);
+        v.push(20);
+        v.push(30);
+        assert_eq!(v.len(), 3);
+        assert_eq!(v.iter().copied().collect::<Vec<_>>(), vec![10, 20, 30]);
+        let removed = v.swap_remove(0);
+        assert_eq!(removed, 10);
+        // After swap_remove(0), buffer is [30, 20] (last swapped into hole).
+        assert_eq!(v.iter().copied().collect::<Vec<_>>(), vec![30, 20]);
+        assert_eq!(v[0], 30);
+        assert_eq!(v[1], 20);
+    }
+
+    /// Pushing across several reallocations preserves every element and its order.
+    #[test]
+    fn growth_pattern() {
+        let mut v: TinyVec<u32> = TinyVec::new();
+        for i in 0..32u32 {
+            v.push(i);
+        }
+        assert_eq!(v.len(), 32);
+        let collected: Vec<u32> = v.iter().copied().collect();
+        assert_eq!(collected, (0..32).collect::<Vec<_>>());
+    }
+
+    /// `extend_exact` appends all items; `reserve` guarantees room for the requested extra.
+    #[test]
+    fn extend_and_reserve() {
+        let mut v: TinyVec<u32> = TinyVec::new();
+        v.extend_exact(0..10);
+        assert_eq!(v.len(), 10);
+        v.reserve(5);
+        assert!(v.capacity() >= 15);
+    }
+
+    /// Mutating slice APIs (`last_mut`, index assignment) are reachable through `DerefMut`.
+    #[test]
+    fn last_mut_and_index_mut() {
+        let mut v: TinyVec<u32> = TinyVec::new();
+        v.push(1);
+        v.push(2);
+        *v.last_mut().unwrap() = 99;
+        assert_eq!(v[1], 99);
+        v[0] = 7;
+        assert_eq!(v[0], 7);
+    }
+
+    /// Dropping the container drops each stored element exactly once.
+    #[test]
+    fn drop_runs_on_elements() {
+        use std::sync::atomic::{AtomicUsize, Ordering};
+
+        struct DropCounter<'a>(&'a AtomicUsize);
+        impl<'a> Drop for DropCounter<'a> {
+            fn drop(&mut self) {
+                self.0.fetch_add(1, Ordering::SeqCst);
+            }
+        }
+
+        let count = AtomicUsize::new(0);
+        {
+            let mut v: TinyVec<DropCounter<'_>> = TinyVec::new();
+            v.push(DropCounter(&count));
+            v.push(DropCounter(&count));
+            v.push(DropCounter(&count));
+        }
+        assert_eq!(count.load(Ordering::SeqCst), 3);
+    }
+
+    /// The by-value iterator yields elements in order and drops whatever it did not yield.
+    #[test]
+    fn into_iter_drops_and_yields() {
+        use std::sync::atomic::{AtomicUsize, Ordering};
+
+        struct DropCounter<'a>(&'a AtomicUsize, u32);
+        impl<'a> Drop for DropCounter<'a> {
+            fn drop(&mut self) {
+                self.0.fetch_add(1, Ordering::SeqCst);
+            }
+        }
+
+        let count = AtomicUsize::new(0);
+        let mut v: TinyVec<DropCounter<'_>> = TinyVec::new();
+        v.push(DropCounter(&count, 1));
+        v.push(DropCounter(&count, 2));
+        v.push(DropCounter(&count, 3));
+
+        let mut iter = v.into_iter();
+        assert_eq!(iter.next().unwrap().1, 1);
+        assert_eq!(iter.next().unwrap().1, 2);
+        // Drop iterator with one remaining element.
+        drop(iter);
+        // 3 total drops: the two yielded + the one remaining in the iter.
+        assert_eq!(count.load(Ordering::SeqCst), 3);
+    }
+
+    /// Shrinking an emptied container releases its buffer (capacity back to zero).
+    #[test]
+    fn shrink_to_fit_releases_buffer() {
+        let mut v: TinyVec<u32> = TinyVec::new();
+        v.extend_exact(0..10);
+        assert!(v.capacity() >= 10);
+        for _ in 0..10 {
+            v.swap_remove(0);
+        }
+        assert!(v.is_empty());
+        v.shrink_to_fit();
+        assert_eq!(v.capacity(), 0);
+    }
+
+    /// Pushing past the default `MAX` panics with the capacity-overflow message.
+    #[test]
+    #[should_panic(expected = "TinyVec capacity overflow")]
+    fn capacity_overflow_panics() {
+        let mut v: TinyVec<u8> = TinyVec::new();
+        for _ in 0..255u32 {
+            v.push(0);
+        }
+        // The 256th push trips the MAX check (default MAX = u8::MAX = 255).
+        v.push(0);
+    }
+
+    /// `MAX` strictly caps push count; growth stops at exactly MAX even when doubling would
+    /// overshoot.
+    #[test]
+    fn tight_max_caps_growth_exactly() {
+        let mut v: TinyVec<u32, 5> = TinyVec::new();
+        for i in 0..5 {
+            v.push(i);
+        }
+        assert_eq!(v.len(), 5);
+        // Capacity should be exactly 5, not the next-power-of-two (8).
+        assert_eq!(v.capacity(), 5);
+    }
+
+    /// Pushing past a small custom `MAX` panics with the capacity-overflow message.
+    #[test]
+    #[should_panic(expected = "TinyVec capacity overflow")]
+    fn tight_max_panics_at_limit() {
+        let mut v: TinyVec<u32, 3> = TinyVec::new();
+        v.push(0);
+        v.push(1);
+        v.push(2);
+        // The 4th push exceeds MAX=3.
+        v.push(3);
+    }
+
+    /// Confirms the growth schedule with tight MAX: doubles until it would exceed MAX, then
+    /// caps. With MAX=10 we should see 0 -> 4 -> 8 -> 10.
+    #[test]
+    fn tight_max_growth_schedule() {
+        let mut v: TinyVec<u32, 10> = TinyVec::new();
+        let mut last_cap = 0;
+        let mut cap_changes = Vec::new();
+        for i in 0..10 {
+            v.push(i);
+            if v.capacity() != last_cap {
+                cap_changes.push(v.capacity());
+                last_cap = v.capacity();
+            }
+        }
+        assert_eq!(cap_changes, vec![4, 8, 10]);
+    }
+
+    /// `retain_mut` keeps exactly the elements the predicate accepts, preserving order.
+    #[test]
+    fn retain_mut_basic() {
+        let mut v: TinyVec<u32> = from_exact(0..10);
+        v.retain_mut(|x| *x % 2 == 0);
+        assert_eq!(v.iter().copied().collect::<Vec<_>>(), vec![0, 2, 4, 6, 8]);
+        // retain_mut shouldn't change capacity.
+        assert!(v.capacity() >= 5);
+    }
+
+    /// The predicate may mutate elements; kept elements carry the mutation.
+    #[test]
+    fn retain_mut_can_mutate() {
+        let mut v: TinyVec<u32> = from_exact(0..5);
+        v.retain_mut(|x| {
+            *x *= 10;
+            *x != 30
+        });
+        assert_eq!(v.iter().copied().collect::<Vec<_>>(), vec![0, 10, 20, 40]);
+    }
+
+    /// On an empty container the predicate is never invoked.
+    #[test]
+    fn retain_mut_empty() {
+        let mut v: TinyVec<u32> = TinyVec::new();
+        v.retain_mut(|_| panic!("should not be called for empty"));
+        assert!(v.is_empty());
+    }
+
+    /// An always-true predicate leaves the contents untouched.
+    #[test]
+    fn retain_mut_keeps_all() {
+        let mut v: TinyVec<u32> = from_exact(0..5);
+        v.retain_mut(|_| true);
+        assert_eq!(v.iter().copied().collect::<Vec<_>>(), vec![0, 1, 2, 3, 4]);
+    }
+
+    /// An always-false predicate empties the container.
+    #[test]
+    fn retain_mut_removes_all() {
+        let mut v: TinyVec<u32> = from_exact(0..5);
+        v.retain_mut(|_| false);
+        assert!(v.is_empty());
+    }
+
+    /// Verifies retain_mut's panic guard: if the predicate panics, we shouldn't double-free.
+    #[test]
+    fn retain_mut_panic_safety() {
+        let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
+            let mut v: TinyVec<u32> = from_exact(0..10);
+            v.retain_mut(|x| {
+                if *x == 5 {
+                    panic!("boom");
+                }
+                true
+            });
+        }));
+        assert!(result.is_err());
+    }
+
+    /// Element Drop panic during retain_mut — `Vec::retain_mut` handles this; we should too.
+    ///
+    /// NOTE(review): the final assertion is deliberately loose (panic propagated OR at least
+    /// one drop ran), so it mainly guards against aborts and double drops under a sanitizer
+    /// or Miri rather than pinning exact drop counts.
+    #[test]
+    fn retain_mut_element_drop_panic() {
+        use std::sync::atomic::{AtomicUsize, Ordering};
+
+        struct PanicyDrop<'a>(u32, &'a AtomicUsize);
+        impl Drop for PanicyDrop<'_> {
+            fn drop(&mut self) {
+                self.1.fetch_add(1, Ordering::SeqCst);
+                // Only panic when not already unwinding, to avoid a double-panic abort.
+                if self.0 == 5 && !std::thread::panicking() {
+                    panic!("boom from drop");
+                }
+            }
+        }
+
+        let drop_count = AtomicUsize::new(0);
+        let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
+            let mut v: TinyVec<PanicyDrop<'_>> =
+                from_exact((0..10).map(|i| PanicyDrop(i, &drop_count)));
+            v.retain_mut(|x| x.0 != 5); // schedules drop of element with 0==5, which panics
+            // If we get here without panic, drop happened cleanly.
+        }));
+        // The panic should have propagated; some drops should have occurred.
+        assert!(result.is_err() || drop_count.load(Ordering::SeqCst) > 0);
+    }
+}