[Web] Improve large tensor loading in wasm runtime

MakotoUwu · MakotoUwu · commit 42ed716d0692 · 2026-06-23T16:13:26.000+02:00
diff --git a/web/emcc/wasm_runtime.cc b/web/emcc/wasm_runtime.cc
@@ -130,32 +130,51 @@ void ArrayDecodeStorage(Tensor cpu_arr, TVMFFIByteArray* bytes, const std::strin
   const char* byte_data = bytes->data;
   const size_t byte_size = bytes->size;
   if (format == "f32-to-bf16" && dtype == "float32") {
-    const uint16_t* bf16 = reinterpret_cast<const uint16_t*>(byte_data);
-    uint32_t* data = static_cast<uint32_t*>(cpu_arr->data);
     TVM_FFI_ICHECK(cpu_arr.IsContiguous());
     size_t size = 1;
     for (int i = 0; i < cpu_arr->ndim; ++i) {
       size *= cpu_arr->shape[i];
     }
-    TVM_FFI_ICHECK_EQ(size, byte_size / 2);
-    for (size_t i = 0; i < size; ++i) {
-      data[i] = static_cast<uint32_t>(bf16[i]) << 16;
+    // The "f32-to-bf16" format encodes a float32 tensor as packed bf16 (2
+    // bytes per element). When byte_size matches that expectation, expand
+    // back to f32. Any other byte_size is not checked here: for example, a
+    // payload at the native float32 width (4 bytes per element) is treated as
+    // raw float32 and falls through to the generic byte copy. This makes the
+    // loader tolerant of weight shards produced by older / alternate pipelines
+    // that retain the "f32-to-bf16" tag without performing the bf16 truncation.
+    if (byte_size == size * sizeof(uint16_t)) {
+      const uint16_t* bf16 = reinterpret_cast<const uint16_t*>(byte_data);
+      uint32_t* data =
+          reinterpret_cast<uint32_t*>(static_cast<char*>(cpu_arr->data) + cpu_arr->byte_offset);
+      for (size_t i = 0; i < size; ++i) {
+        data[i] = static_cast<uint32_t>(bf16[i]) << 16;
+      }
+      return;
     }
-  } else {
-    cpu_arr.CopyFromBytes(byte_data, byte_size);
   }
+  cpu_arr.CopyFromBytes(byte_data, byte_size);
+}
+
+int64_t StorageSizeBytes(int64_t num_elements, const std::string& dtype) {
+  TVM_FFI_ICHECK_GE(num_elements, 0);
+  TVMFFIByteArray dtype_bytes{dtype.data(), dtype.size()};
+  DLDataType dl_dtype;
+  TVM_FFI_ICHECK_EQ(TVMFFIDataTypeFromString(&dtype_bytes, &dl_dtype), 0);
+  return static_cast<int64_t>(
+      ffi::GetDataSize(static_cast<size_t>(num_elements), dl_dtype));
 }
 
 TVM_FFI_STATIC_INIT_BLOCK() {
   namespace refl = tvm::ffi::reflection;
-  refl::GlobalDef().def_packed(
-      "tvmjs.array.decode_storage", [](ffi::PackedArgs args, ffi::Any* ret) {
+  refl::GlobalDef()
+      .def_packed("tvmjs.array.decode_storage", [](ffi::PackedArgs args, ffi::Any* ret) {
         Tensor cpu_arr = args[0].cast<Tensor>();
         TVMFFIByteArray* bytes = args[1].cast<TVMFFIByteArray*>();
         std::string format = args[2].cast<ffi::String>().operator std::string();
         std::string dtype = args[3].cast<ffi::String>().operator std::string();
         ArrayDecodeStorage(cpu_arr, bytes, format, dtype);
-      });
+      })
+      .def("tvmjs.runtime.StorageSizeBytes", StorageSizeBytes);
 }
 
 // Concatenate n TVMArrays
diff --git a/web/src/runtime.ts b/web/src/runtime.ts
@@ -172,6 +172,7 @@ class RuntimeContext implements Disposable {
   tensorCacheRemove: PackedFunc;
   tensorCacheClear: PackedFunc;
   arrayDecodeStorage: PackedFunc;
+  storageSizeBytes: PackedFunc;
   paramModuleFromCache: PackedFunc;
   paramModuleFromCacheByName: PackedFunc;
   makeShapeTuple: PackedFunc;
@@ -207,6 +208,7 @@ class RuntimeContext implements Disposable {
     this.tensorCacheUpdate = getGlobalFunc("vm.builtin.tensor_cache.update");
     this.tensorCacheClear = getGlobalFunc("vm.builtin.tensor_cache.clear");
     this.arrayDecodeStorage = getGlobalFunc("tvmjs.array.decode_storage");
+    this.storageSizeBytes = getGlobalFunc("tvmjs.runtime.StorageSizeBytes");
     this.paramModuleFromCache = getGlobalFunc("vm.builtin.param_module_from_cache");
     this.paramModuleFromCacheByName = getGlobalFunc("vm.builtin.param_module_from_cache_by_name");
     this.makeShapeTuple = getGlobalFunc("ffi.Shape");
@@ -230,6 +232,7 @@ class RuntimeContext implements Disposable {
     this.tensorCacheRemove.dispose();
     this.tensorCacheUpdate.dispose();
     this.arrayDecodeStorage.dispose();
+    this.storageSizeBytes.dispose();
     this.paramModuleFromCache.dispose();
     this.paramModuleFromCacheByName.dispose();
     this.makeShapeTuple.dispose();
@@ -1010,9 +1013,11 @@ export class Instance implements Disposable {
    */
   withNewScope<T>(action: () => T): T {
     this.beginScope();
-    const val = action();
-    this.endScope();
-    return val;
+    try {
+      return action();
+    } finally {
+      this.endScope();
+    }
   }
 
   /**
@@ -1323,6 +1328,19 @@ export class Instance implements Disposable {
     artifactCache: ArtifactCacheTemplate,
     signal?: AbortSignal,
   ) {
+    // Avoid a single JS-to-wasm byte-array call for multi-hundred-MiB
+    // tensor-cache records. The cap is a conservative per-call staging size,
+    // independent of the final tensor allocation size. Smaller records keep
+    // the existing full-record path.
+    const maxChunkBytes = 128 * 1024 * 1024;
+    const storageSizeBytes = (numElements: number, dtype: string): number | undefined => {
+      try {
+        return this.ctx.storageSizeBytes(new Scalar(numElements, "int"), dtype) as number;
+      } catch {
+        // Unknown dtypes can still use the original full-record loading path.
+        return undefined;
+      }
+    };
     const perf = compact.getPerformance();
     const tstart = perf.now();
     let totalBytes = 0;
@@ -1421,9 +1439,68 @@ export class Instance implements Disposable {
               this.empty(rec.shape, rec.dtype, this.cpu())
             )
           });
-          const recSource = buffer.slice(rec.byteOffset, rec.byteOffset + rec.nbytes);
+          const shardBytes = buffer instanceof Uint8Array ? buffer : new Uint8Array(buffer);
+          const recSource =
+            rec.byteOffset === 0 && rec.nbytes === shardBytes.byteLength
+              ? shardBytes
+              : shardBytes.subarray(rec.byteOffset, rec.byteOffset + rec.nbytes);
+          let canChunkRecord =
+            rec.nbytes > maxChunkBytes &&
+            rec.shape.length >= 1 &&
+            Number.isInteger(rec.shape[0]) &&
+            rec.shape[0] > 0 &&
+            rec.nbytes % rec.shape[0] === 0;
+          const outerDim = canChunkRecord ? rec.shape[0] : 1;
+          const sourceStrideBytes = canChunkRecord ? rec.nbytes / outerDim : rec.nbytes;
+          let targetStrideBytes = 0;
+          if (canChunkRecord) {
+            const numElements = rec.shape.reduce((acc, value) => acc * value, 1);
+            const targetBytes = storageSizeBytes(numElements, rec.dtype);
+            canChunkRecord =
+              sourceStrideBytes <= maxChunkBytes &&
+              targetBytes !== undefined &&
+              targetBytes % outerDim === 0;
+            if (canChunkRecord) {
+              targetStrideBytes = targetBytes / outerDim;
+            }
+          }
+          const copyRecordToTensor = (targetTensor: Tensor, sourceBytes: Uint8Array) => {
+            if (!canChunkRecord) {
+              this.ctx.arrayDecodeStorage(targetTensor, sourceBytes, rec.format, rec.dtype);
+              return;
+            }
+            const chunkOuterDim = Math.max(1, Math.floor(maxChunkBytes / sourceStrideBytes));
+            for (let outerOffset = 0; outerOffset < outerDim; outerOffset += chunkOuterDim) {
+              const outerCount = Math.min(chunkOuterDim, outerDim - outerOffset);
+              const sourceByteOffset = outerOffset * sourceStrideBytes;
+              const targetByteOffset = outerOffset * targetStrideBytes;
+              const chunkBytes = outerCount * sourceStrideBytes;
+              const chunkShape = rec.shape.slice();
+              chunkShape[0] = outerCount;
+              const chunkView = this.withNewScope(() => {
+                const chunkShapeTuple = this.makeShapeTuple(chunkShape);
+                return this.detachFromCurrentScope(
+                  this.ctx.tensorCreateView(
+                    targetTensor,
+                    chunkShapeTuple,
+                    rec.dtype,
+                    new Scalar(targetByteOffset, "int"),
+                  )
+                );
+              });
+              const chunkSource = sourceBytes.subarray(
+                sourceByteOffset,
+                sourceByteOffset + chunkBytes,
+              );
+              try {
+                this.ctx.arrayDecodeStorage(chunkView, chunkSource, rec.format, rec.dtype);
+              } finally {
+                chunkView.dispose();
+              }
+            }
+          };
           // first sync copy to cpu.
-          this.ctx.arrayDecodeStorage(cpu_arr, new Uint8Array(recSource), rec.format, rec.dtype);
+          copyRecordToTensor(cpu_arr, recSource);
           // then async stream into GPU if needed
           if (device.deviceType === DeviceStrToEnum.cpu) {
             this.tensorCacheUpdate(rec.name, cpu_arr, false);
@@ -1435,7 +1512,42 @@ export class Instance implements Disposable {
                 this.empty(rec.shape, rec.dtype, device)
               )
             });
-            gpu_arr.copyFrom(cpu_arr);
+            if (!canChunkRecord) {
+              gpu_arr.copyFrom(cpu_arr);
+            } else {
+              const chunkOuterDim = Math.max(1, Math.floor(maxChunkBytes / sourceStrideBytes));
+              for (let outerOffset = 0; outerOffset < outerDim; outerOffset += chunkOuterDim) {
+                const outerCount = Math.min(chunkOuterDim, outerDim - outerOffset);
+                const targetByteOffset = outerOffset * targetStrideBytes;
+                const chunkShape = rec.shape.slice();
+                chunkShape[0] = outerCount;
+                const [cpuView, gpuView] = this.withNewScope(() => {
+                  const chunkShapeTuple = this.makeShapeTuple(chunkShape);
+                  const cView = this.ctx.tensorCreateView(
+                    cpu_arr,
+                    chunkShapeTuple,
+                    rec.dtype,
+                    new Scalar(targetByteOffset, "int"),
+                  );
+                  const gView = this.ctx.tensorCreateView(
+                    gpu_arr,
+                    chunkShapeTuple,
+                    rec.dtype,
+                    new Scalar(targetByteOffset, "int"),
+                  );
+                  return [
+                    this.detachFromCurrentScope(cView),
+                    this.detachFromCurrentScope(gView),
+                  ];
+                });
+                try {
+                  gpuView.copyFrom(cpuView);
+                } finally {
+                  cpuView.dispose();
+                  gpuView.dispose();
+                }
+              }
+            }
             await device.sync();
             this.tensorCacheUpdate(rec.name, gpu_arr, false);
             cpu_arr.dispose();
@@ -2258,6 +2370,28 @@ export class Instance implements Disposable {
       case TypeIndex.kTVMFFIOpaquePtr: {
         return this.memory.loadPointer(valuePtr);
       }
+      case TypeIndex.kTVMFFIShape: {
+        const shapeObjPtr = this.memory.loadPointer(valuePtr);
+        if (shapeObjPtr === 0) {
+          return null;
+        }
+        if (callbackArg) {
+          const shapeCellPtr = shapeObjPtr + SizeOf.ObjectHeader;
+          const shapeDataPtr = this.memory.loadPointer(shapeCellPtr);
+          const shapeLen = this.memory.loadUSize(shapeCellPtr + this.memory.sizeofPtr());
+          const result = new Array<number>(shapeLen);
+          for (let i = 0; i < shapeLen; ++i) {
+            result[i] = this.memory.loadI64(shapeDataPtr + i * SizeOf.I64);
+          }
+          this.lib.checkCall(
+            (this.lib.exports.TVMFFIObjectDecRef as ctypes.FTVMFFIObjectDecRef)(shapeObjPtr)
+          );
+          return result;
+        }
+        return this.ctx.attachToCurrentScope(
+          new TVMObject(shapeObjPtr, this.lib, this.ctx)
+        );
+      }
       case TypeIndex.kTVMFFITensor: {
         return this.ctx.attachToCurrentScope(
           new Tensor(this.memory.loadPointer(valuePtr), this.lib, this.ctx, false)