@@ -1323,6 +1323,20 @@ export class Instance implements Disposable {
13231323 artifactCache : ArtifactCacheTemplate ,
13241324 signal ?: AbortSignal ,
13251325 ) {
1326+ // Avoid a single JS-to-wasm byte-array call for multi-hundred-MiB
1327+ // tensor-cache records. The cap is a conservative per-call staging size,
1328+ // independent of the final tensor allocation size. Smaller records keep
1329+ // the existing full-record path.
1330+ const maxChunkBytes = 128 * 1024 * 1024 ;
// Returns the number of bytes one element of `dtype` occupies in storage.
//
// The bit width is read from the digits that directly follow the base type
// name ("int" / "float", which also covers "uint" and "bfloat"), so names
// that carry a format suffix or prefix such as "float8_e4m3fn",
// "float8_e5m2" or "e4m3_float8" resolve to their real width instead of
// whatever digits happen to sit at the end of the string. A trailing
// "x<N>" is treated as the vector lane count.
//
// Throws an Error when no width can be determined from the name.
const storageBytes = (dtype: string): number => {
  // "bool" has no width in its name; DLPack's kDLBool is 8 bits wide,
  // i.e. one byte per element.
  if (dtype === "bool") {
    return 1;
  }
  const lanesMatch = dtype.match(/x(\d+)$/);
  let widthMatch = dtype.match(/(?:int|float)(\d+)/);
  if (widthMatch === null) {
    // Unknown base name: fall back to the trailing-digits convention.
    widthMatch = dtype.match(/(\d+)(?:x\d+)?$/);
  }
  if (widthMatch === null) {
    throw new Error("Cannot determine storage width of dtype " + dtype);
  }
  const bits = Number(widthMatch[1]);
  const lanes = lanesMatch === null ? 1 : Number(lanesMatch[1]);
  // Round the total bit count up to whole bytes.
  return (bits * lanes + 7) >> 3;
};
13261340 const perf = compact . getPerformance ( ) ;
13271341 const tstart = perf . now ( ) ;
13281342 let totalBytes = 0 ;
@@ -1421,9 +1435,59 @@ export class Instance implements Disposable {
14211435 this . empty ( rec . shape , rec . dtype , this . cpu ( ) )
14221436 )
14231437 } ) ;
1424- const recSource = buffer . slice ( rec . byteOffset , rec . byteOffset + rec . nbytes ) ;
1438+ const shardBytes = buffer instanceof Uint8Array ? buffer : new Uint8Array ( buffer ) ;
1439+ const recSource =
1440+ rec . byteOffset === 0 && rec . nbytes === shardBytes . byteLength
1441+ ? shardBytes
1442+ : shardBytes . subarray ( rec . byteOffset , rec . byteOffset + rec . nbytes ) ;
1443+ const canChunkRecord =
1444+ rec . nbytes > maxChunkBytes &&
1445+ rec . shape . length >= 1 &&
1446+ Number . isInteger ( rec . shape [ 0 ] ) &&
1447+ rec . shape [ 0 ] > 0 &&
1448+ rec . nbytes % rec . shape [ 0 ] === 0 ;
1449+ const outerDim = canChunkRecord ? rec . shape [ 0 ] : 1 ;
1450+ const sourceStrideBytes = canChunkRecord ? rec . nbytes / outerDim : rec . nbytes ;
1451+ const targetBytes = rec . shape . reduce ( ( acc , value ) => acc * value , 1 ) *
1452+ storageBytes ( rec . dtype ) ;
1453+ const targetStrideBytes = canChunkRecord ? targetBytes / outerDim : targetBytes ;
// Decodes one tensor-cache record from `sourceBytes` into `targetTensor`.
//
// Records that do not qualify for chunking (`canChunkRecord` is false) are
// decoded with a single arrayDecodeStorage call. Otherwise the record is
// split along its outermost dimension into groups of whole rows of at most
// `maxChunkBytes` source bytes each (always at least one row, even when a
// single row exceeds the cap), and every group is decoded into a view of
// `targetTensor` created at the matching byte offset.
const copyRecordToTensor = (targetTensor: Tensor, sourceBytes: Uint8Array): void => {
  if (!canChunkRecord) {
    this.ctx.arrayDecodeStorage(targetTensor, sourceBytes, rec.format, rec.dtype);
    return;
  }
  // Number of outer-dimension rows handled per decode call.
  const chunkOuterDim = Math.max(1, Math.floor(maxChunkBytes / sourceStrideBytes));
  for (let outerOffset = 0; outerOffset < outerDim; outerOffset += chunkOuterDim) {
    const outerCount = Math.min(chunkOuterDim, outerDim - outerOffset);
    // Source and target strides are tracked separately because the stored
    // record size (rec.nbytes) and the tensor size (targetBytes) are
    // computed independently and need not be equal.
    const sourceByteOffset = outerOffset * sourceStrideBytes;
    const targetByteOffset = outerOffset * targetStrideBytes;
    const chunkShape = rec.shape.slice();
    chunkShape[0] = outerCount;
    const chunkView = this.withNewScope(() => {
      // The shape tuple is created inside the scope so that its handle is
      // owned by the same scope that produces the view; only the view is
      // detached and outlives the callback.
      // NOTE(review): presumably withNewScope releases objects still
      // attached when the callback returns — confirm against the scope
      // implementation.
      const chunkShapeTuple = this.makeShapeTuple(chunkShape);
      return this.detachFromCurrentScope(
        this.ctx.tensorCreateView(
          targetTensor,
          chunkShapeTuple,
          rec.dtype,
          new Scalar(targetByteOffset, "int"),
        )
      );
    });
    try {
      // subarray creates a view over the same buffer; no bytes are copied
      // on the JS side here.
      const chunkSource = sourceBytes.subarray(
        sourceByteOffset,
        sourceByteOffset + outerCount * sourceStrideBytes,
      );
      this.ctx.arrayDecodeStorage(chunkView, chunkSource, rec.format, rec.dtype);
    } finally {
      // The view was detached from its scope, so it must be released by hand
      // even when decoding throws.
      chunkView.dispose();
    }
  }
};
14251489 // first sync copy to cpu.
1426- this . ctx . arrayDecodeStorage ( cpu_arr , new Uint8Array ( recSource ) , rec . format , rec . dtype ) ;
1490+ copyRecordToTensor ( cpu_arr , recSource ) ;
14271491 // then async stream into GPU if needed
14281492 if ( device . deviceType === DeviceStrToEnum . cpu ) {
14291493 this . tensorCacheUpdate ( rec . name , cpu_arr , false ) ;
@@ -1435,7 +1499,44 @@ export class Instance implements Disposable {
14351499 this . empty ( rec . shape , rec . dtype , device )
14361500 )
14371501 } ) ;
1438- gpu_arr . copyFrom ( cpu_arr ) ;
1502+ if ( ! canChunkRecord ) {
1503+ gpu_arr . copyFrom ( cpu_arr ) ;
1504+ } else {
1505+ const chunkOuterDim = Math . max ( 1 , Math . floor ( maxChunkBytes / sourceStrideBytes ) ) ;
1506+ for ( let outerOffset = 0 ; outerOffset < outerDim ; outerOffset += chunkOuterDim ) {
1507+ const outerCount = Math . min ( chunkOuterDim , outerDim - outerOffset ) ;
1508+ const targetByteOffset = outerOffset * targetStrideBytes ;
1509+ const chunkShape = rec . shape . slice ( ) ;
1510+ chunkShape [ 0 ] = outerCount ;
1511+ const chunkShapeTuple = this . makeShapeTuple ( chunkShape ) ;
1512+ const [ cpuView , gpuView ] = this . withNewScope ( ( ) => {
1513+ return [
1514+ this . detachFromCurrentScope (
1515+ this . ctx . tensorCreateView (
1516+ cpu_arr ,
1517+ chunkShapeTuple ,
1518+ rec . dtype ,
1519+ new Scalar ( targetByteOffset , "int" ) ,
1520+ )
1521+ ) ,
1522+ this . detachFromCurrentScope (
1523+ this . ctx . tensorCreateView (
1524+ gpu_arr ,
1525+ chunkShapeTuple ,
1526+ rec . dtype ,
1527+ new Scalar ( targetByteOffset , "int" ) ,
1528+ )
1529+ ) ,
1530+ ] ;
1531+ } ) ;
1532+ try {
1533+ gpuView . copyFrom ( cpuView ) ;
1534+ } finally {
1535+ cpuView . dispose ( ) ;
1536+ gpuView . dispose ( ) ;
1537+ }
1538+ }
1539+ }
14391540 await device . sync ( ) ;
14401541 this . tensorCacheUpdate ( rec . name , gpu_arr , false ) ;
14411542 cpu_arr . dispose ( ) ;
@@ -2258,6 +2359,25 @@ export class Instance implements Disposable {
22582359 case TypeIndex . kTVMFFIOpaquePtr : {
22592360 return this . memory . loadPointer ( valuePtr ) ;
22602361 }
// Converts a value whose type index is kTVMFFIShape.
// The slot at valuePtr holds a pointer to the shape object.
2362+ case TypeIndex . kTVMFFIShape : {
2363+ const shapeObjPtr = this . memory . loadPointer ( valuePtr ) ;
// When callbackArg is set, the shape is copied out into a plain JS
// number array instead of being wrapped in an object handle.
2364+ if ( callbackArg ) {
// The shape cell is read right after the object header: first a pointer
// to the dimension data, then (one pointer size later) the number of
// dimensions.
2365+ const shapeCellPtr = shapeObjPtr + SizeOf . ObjectHeader ;
2366+ const shapeDataPtr = this . memory . loadPointer ( shapeCellPtr ) ;
2367+ const shapeLen = this . memory . loadUSize ( shapeCellPtr + this . memory . sizeofPtr ( ) ) ;
2368+ const result = new Array < number > ( shapeLen ) ;
// Each dimension is loaded as an I64, SizeOf.I64 bytes apart.
2369+ for ( let i = 0 ; i < shapeLen ; ++ i ) {
2370+ result [ i ] = this . memory . loadI64 ( shapeDataPtr + i * SizeOf . I64 ) ;
2371+ }
// The values have been copied, so the reference on the shape object is
// dropped here.
// NOTE(review): presumably this balances a reference taken by the caller
// before converting callback arguments — confirm against the caller.
2372+ this . lib . checkCall (
2373+ ( this . lib . exports . TVMFFIObjectDecRef as ctypes . FTVMFFIObjectDecRef ) ( shapeObjPtr )
2374+ ) ;
2375+ return result ;
2376+ }
// Otherwise the pointer is wrapped in a TVMObject that is attached to the
// current scope.
2377+ return this . ctx . attachToCurrentScope (
2378+ new TVMObject ( shapeObjPtr , this . lib , this . ctx )
2379+ ) ;
2380+ }
22612381 case TypeIndex . kTVMFFITensor : {
22622382 return this . ctx . attachToCurrentScope (
22632383 new Tensor ( this . memory . loadPointer ( valuePtr ) , this . lib , this . ctx , false )
0 commit comments