leehack
diff --git a/‎AGENTS.md‎
Lines changed: 9 additions & 0 deletions b/‎AGENTS.md‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎CHANGELOG.md‎
Lines changed: 23 additions & 0 deletions b/‎CHANGELOG.md‎
Lines changed: 23 additions & 0 deletions
diff --git a/‎README.md‎
Lines changed: 37 additions & 1 deletion b/‎README.md‎
Lines changed: 37 additions & 1 deletion
diff --git a/‎example/chat_app/README.md‎
Lines changed: 8 additions & 0 deletions b/‎example/chat_app/README.md‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎example/chat_app/dart_test.yaml‎
Lines changed: 5 additions & 0 deletions b/‎example/chat_app/dart_test.yaml‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎example/chat_app/integration_test/model_cache_mmproj_e2e_test.dart‎
Lines changed: 115 additions & 0 deletions b/‎example/chat_app/integration_test/model_cache_mmproj_e2e_test.dart‎
Lines changed: 115 additions & 0 deletions
@@ -91,6 +91,15 @@ pass `--mem64` and a smaller `--context-size` to keep the smoke bounded.
 - Parameter and return types documented
 - No TODO/FIXME comments in committed code
 
+### Changelog Discipline
+- Never add unreleased work to an already-published version section in
+  `CHANGELOG.md` or `website/docs/changelog/recent-releases.md`.
+- Before editing release notes, check the top of `CHANGELOG.md`. If the latest
+  section is a concrete released version (for example `## 0.6.12`), create a
+  new `## Unreleased` section above it and place new PR entries there.
+- Only move entries from `## Unreleased` into a numbered version section as part
+  of an explicit release/version-bump task.
+
 ### Error Handling
 - Use custom `LlamaException` hierarchy (defined in `lib/src/core/exceptions.dart`)
 - Subtypes: `LlamaModelException`, `LlamaContextException`, `LlamaInferenceException`, `LlamaStateException`, `LlamaUnsupportedException`
 
@@ -1,3 +1,26 @@
+## Unreleased
+
+* **Model source download/cache manager**:
+  * Added `ModelSource` for local paths, HTTP(S) URLs, and Hugging Face
+    `hf://owner/repo/path/to/model.gguf` references, including deterministic
+    cache keys and redacted metadata/log identities for signed URLs.
+  * Added `ModelLoadOptions`, `ModelCachePolicy`, resolver targets, and
+    download/cache metadata/progress value models for package-managed model
+    download and cache management.
+  * Added native/file-backed `DefaultModelDownloadManager` support for streaming
+    HTTP downloads, `.part` files, atomic promotion, persisted metadata,
+    authenticated bearer/custom headers, cancellation, retry, Range resume,
+    cache hit/refresh/cache-only/no-cache policies, SHA-256 verification,
+    cache listing, removal, clearing, and age/size pruning.
+  * Added `LlamaEngine.loadModelSource(...)` to route local sources through the
+    existing native local loader, remote sources through the native download
+    cache before local loading, and simple remote sources through URL-capable web
+    backends when available.
+  * Migrated server/testing helpers away from ad-hoc model downloads so examples
+    dogfood the package-managed cache manager.
+* **Compatibility note**: no public API breaking changes; the model source APIs
+  are additive and existing `loadModel(...)` callers are unchanged.
+
 ## 0.6.12
 
 * **Native runtime sync**:
 
@@ -26,6 +26,8 @@
   - Web: WebGPU via bridge runtime (with CPU fallback)
 - 🧭 **Embeddings API**: Generate vectors with `embed(...)` and
   `embedBatch(...)`.
+- 📦 **Structured Model Sources**: Describe local, HTTP(S), and Hugging Face
+  GGUF sources with deterministic cache identities for download/cache workflows.
 - 🖼️ **Multimodal Support**: Vision/audio model runtime support.
 - **LoRA Support**: Runtime GGUF adapter application.
 - 🔇 **Split Logging Control**: Dart logs and native logs can be configured independently.
@@ -88,7 +90,41 @@ Future<void> main() async {
 }
 ```
 
-### 5. Generate embeddings
+### 5. Download and cache a remote GGUF
+
+```dart
+import 'package:llamadart/llamadart.dart';
+
+Future<void> main() async {
+  final engine = LlamaEngine(LlamaBackend());
+  try {
+    await engine.loadModelSource(
+      ModelSource.parse('hf://owner/repo/model-Q4_K_M.gguf'),
+      options: ModelLoadOptions(
+        cachePolicy: ModelCachePolicy.preferCached,
+        cacheDirectory: '/path/to/app/model-cache',
+      ),
+      onProgress: (progress) {
+        final fraction = progress.fraction;
+        if (fraction != null) {
+          print('download ${(fraction * 100).toStringAsFixed(1)}%');
+        }
+      },
+    );
+  } finally {
+    await engine.dispose();
+  }
+}
+```
+
+Native/file-backed backends stream remote models into the package-managed cache,
+resume partial `.part` downloads when the server supports HTTP Range and the
+partial has a safe validator (ETag/Last-Modified) or caller-provided SHA-256,
+verify optional SHA-256 checksums, and redact signed URL credentials from
+metadata. When a partial file has no such validator, the download restarts from
+byte zero instead of appending to the existing data.
+
+### 6. Generate embeddings
 
 ```dart
 import 'package:llamadart/llamadart.dart';
 
@@ -38,6 +38,14 @@ flutter test
 
 Note: this is a Flutter app, so use `flutter test` (not `dart test`).
 
+Slow device E2E tests are tagged `local-only` and skipped by default. To run
+the real model/mmproj download-cache-load check manually on a selected device:
+
+```bash
+flutter test --run-skipped -t local-only \
+  integration_test/model_cache_mmproj_e2e_test.dart -d <device>
+```
+
 ### 2. Choose and Download a Model
 1. The app will open to a **Manage Models** screen.
 2. Select one of the pre-configured models (for example: FunctionGemma 270M, Qwen3.5 0.8B/2B/4B/9B, Llama 3.2 3B, Gemma 3/3n, DeepSeek R1 distills).
 
@@ -0,0 +1,5 @@
+tags:
+  local-only:
+    skip: "Runs only on local machines. Use: flutter test --run-skipped -t local-only integration_test/model_cache_mmproj_e2e_test.dart -d <device>"
+  e2e:
+    timeout: 30m
@@ -0,0 +1,115 @@
+/// Local-only chat app E2E for the model/mmproj download-cache-load path.
+///
+/// This downloads the default LFM2-VL 450M model and its mmproj, so it is
+/// intentionally skipped by default. Run it manually with:
+///
+/// ```bash
+/// cd example/chat_app
+/// flutter test --run-skipped -t local-only \
+///   integration_test/model_cache_mmproj_e2e_test.dart -d <device>
+/// ```
+@Tags(['local-only', 'e2e'])
+@Timeout(Duration(minutes: 30))
+library;
+
+import 'package:dio/dio.dart';
+import 'package:flutter/foundation.dart';
+import 'package:flutter_test/flutter_test.dart';
+import 'package:integration_test/integration_test.dart';
+import 'package:llamadart/llamadart.dart' show GpuBackend, LlamaLogLevel;
+import 'package:path/path.dart' as p;
+
+import 'package:llamadart_chat_example/models/chat_settings.dart';
+import 'package:llamadart_chat_example/models/downloadable_model.dart';
+import 'package:llamadart_chat_example/services/chat_service.dart';
+import 'package:llamadart_chat_example/services/model_service_base.dart';
+
+/// Entry point for the local-only model/mmproj download-cache-load E2E check.
+///
+/// Registers one widget test that calls `deleteModel` for the `LFM2-VL 450M`
+/// catalog entry, downloads it again through [ModelService], and then loads
+/// the model together with its multimodal projector through [ChatService].
+void main() {
+  IntegrationTestWidgetsFlutterBinding.ensureInitialized();
+
+  testWidgets(
+    'downloads, caches, and loads tiny multimodal model + mmproj',
+    (tester) async {
+      // Pick the catalog entry under test; it must declare a projector source.
+      final targetModel = DownloadableModel.defaultModels.firstWhere(
+        (entry) => entry.name == 'LFM2-VL 450M',
+      );
+      expect(targetModel.multimodalProjectorSource, isNotNull);
+
+      final modelService = ModelService();
+      final modelsDirectory = await modelService.getModelsDirectory();
+
+      // Start from a state where the model is not reported as downloaded.
+      await modelService.deleteModel(modelsDirectory, targetModel);
+      final beforeDownload = await modelService.getDownloadedModels([
+        targetModel,
+      ]);
+      expect(beforeDownload.contains(targetModel.filename), isFalse);
+
+      // Everything the download callbacks report is captured here and
+      // asserted on after `downloadModel` completes.
+      final seenStages = <ModelDownloadStage>{};
+      final progressLog = <ModelDownloadProgress>[];
+      String? reportedFilename;
+      Object? reportedError;
+
+      await modelService.downloadModel(
+        model: targetModel,
+        modelsDir: modelsDirectory,
+        cancelToken: CancelToken(),
+        onProgress: (_) {},
+        onProgressDetail: (detail) {
+          seenStages.add(detail.stage);
+          progressLog.add(detail);
+          final line =
+              'E2E download ${detail.stage.name} '
+              '${(detail.overallProgress * 100).toStringAsFixed(1)}% '
+              '${detail.stageDownloadedBytes}/${detail.stageTotalBytes ?? -1}';
+          debugPrint(line);
+        },
+        onSuccess: (filename) => reportedFilename = filename,
+        onError: (error) => reportedError = error,
+      );
+
+      expect(reportedError, isNull);
+      expect(reportedFilename, targetModel.filename);
+      expect(seenStages, contains(ModelDownloadStage.model));
+      expect(seenStages, contains(ModelDownloadStage.multimodalProjector));
+      expect(progressLog.last.overallProgress, 1.0);
+
+      final afterDownload = await modelService.getDownloadedModels([
+        targetModel,
+      ]);
+      expect(afterDownload, contains(targetModel.filename));
+
+      // On web, or for local asset sources, load through the source's own
+      // reference; otherwise load the file placed in the models directory.
+      final modelSource = targetModel.modelSource;
+      final projectorSource = targetModel.multimodalProjectorSource!;
+      final modelUsesSourceRef =
+          kIsWeb || modelSource is LocalModelAssetSource;
+      final modelLoadRef = modelUsesSourceRef
+          ? modelSource.loadReference
+          : p.join(modelsDirectory, targetModel.filename);
+      final projectorUsesSourceRef =
+          kIsWeb || projectorSource is LocalModelAssetSource;
+      final projectorLoadRef = projectorUsesSourceRef
+          ? projectorSource.loadReference
+          : p.join(modelsDirectory, projectorSource.displayName);
+
+      final chatService = ChatService();
+      try {
+        // CPU-only, small context: keeps the load step bounded on any device.
+        final settings = ChatSettings(
+          modelPath: modelLoadRef,
+          mmprojPath: projectorLoadRef,
+          preferredBackend: GpuBackend.cpu,
+          gpuLayers: 0,
+          contextSize: 512,
+          maxTokens: 32,
+          nativeLogLevel: LlamaLogLevel.warn,
+        );
+        await chatService.init(
+          settings,
+          eagerLoadMultimodalProjector: true,
+          onProgress: (progress) {
+            debugPrint('E2E load ${(progress * 100).toStringAsFixed(1)}%');
+          },
+        );
+
+        expect(chatService.engine.isReady, isTrue);
+        final visionSupported = await chatService.engine.supportsVision;
+        expect(visionSupported, isTrue);
+      } finally {
+        // Always release the engine, even when loading or an expectation fails.
+        await chatService.dispose();
+      }
+    },
+    timeout: const Timeout(Duration(minutes: 30)),
+  );
+}