@@ -1010,9 +1010,11 @@ export class Instance implements Disposable {
10101010 */
10111011 withNewScope < T > ( action : ( ) => T ) : T {
10121012 this . beginScope ( ) ;
// Revision note: the removed lines called endScope() only after action()
// returned normally, so a throwing action skipped endScope(). The added
// lines move endScope() into a finally clause, so it runs on both the
// normal-return and the throwing path; action()'s result or exception is
// still what the caller observes.
1013- const val = action ( ) ;
1014- this . endScope ( ) ;
1015- return val ;
1013+ try {
1014+ return action ( ) ;
1015+ } finally {
1016+ this . endScope ( ) ;
1017+ }
10161018 }
10171019
10181020 /**
@@ -1323,6 +1325,23 @@ export class Instance implements Disposable {
13231325 artifactCache : ArtifactCacheTemplate ,
13241326 signal ?: AbortSignal ,
13251327 ) {
1328+ // Avoid a single JS-to-wasm byte-array call for multi-hundred-MiB
1329+ // tensor-cache records. The cap is a conservative per-call staging size,
1330+ // independent of the final tensor allocation size. Smaller records keep
1331+ // the existing full-record path.
1332+ const maxChunkBytes = 128 * 1024 * 1024 ;
/**
 * Number of bytes one element of `dtype` occupies, rounded up to whole bytes.
 *
 * Accepts dtype strings of the form `<name><bits>[_<variant>][x<lanes>]`
 * (for example "float32", "int8x4", "float8_e4m3fn", "float8_e5m2x4").
 * The bit width is the first number after the type name, so digits inside a
 * variant suffix such as "_e5m2" are not mistaken for the width. Strings
 * that do not fit this form fall back to reading the trailing
 * `<bits>[x<lanes>]` digits.
 *
 * @param dtype - dtype string; "bool" is special-cased to one byte.
 * @returns `ceil(bits * lanes / 8)`.
 * @throws Error when no bit width can be found in `dtype`.
 */
const storageBytes = (dtype: string): number => {
  if (dtype === "bool") {
    return 1;
  }
  // Structured form first; the variant group is lazy so that a trailing
  // "x<lanes>" is captured as lanes rather than swallowed by the variant.
  const structured = /^[A-Za-z]+(\d+)(?:_[A-Za-z0-9]+?)?(?:x(\d+))?$/.exec(dtype);
  // Fallback keeps every string that the trailing-digits rule accepted.
  const match = structured !== null ? structured : /(\d+)(?:x(\d+))?$/.exec(dtype);
  if (match === null) {
    throw new Error("Cannot determine storage width of dtype " + dtype);
  }
  const bits = Number(match[1]);
  const lanes = match[2] === undefined ? 1 : Number(match[2]);
  // Round the total bit count up to whole bytes.
  return (bits * lanes + 7) >> 3;
};
13261345 const perf = compact . getPerformance ( ) ;
13271346 const tstart = perf . now ( ) ;
13281347 let totalBytes = 0 ;
@@ -1421,9 +1440,59 @@ export class Instance implements Disposable {
14211440 this . empty ( rec . shape , rec . dtype , this . cpu ( ) )
14221441 )
14231442 } ) ;
1424- const recSource = buffer . slice ( rec . byteOffset , rec . byteOffset + rec . nbytes ) ;
1443+ const shardBytes = buffer instanceof Uint8Array ? buffer : new Uint8Array ( buffer ) ;
1444+ const recSource =
1445+ rec . byteOffset === 0 && rec . nbytes === shardBytes . byteLength
1446+ ? shardBytes
1447+ : shardBytes . subarray ( rec . byteOffset , rec . byteOffset + rec . nbytes ) ;
1448+ const canChunkRecord =
1449+ rec . nbytes > maxChunkBytes &&
1450+ rec . shape . length >= 1 &&
1451+ Number . isInteger ( rec . shape [ 0 ] ) &&
1452+ rec . shape [ 0 ] > 0 &&
1453+ rec . nbytes % rec . shape [ 0 ] === 0 ;
1454+ const outerDim = canChunkRecord ? rec . shape [ 0 ] : 1 ;
1455+ const sourceStrideBytes = canChunkRecord ? rec . nbytes / outerDim : rec . nbytes ;
1456+ const targetBytes = rec . shape . reduce ( ( acc , value ) => acc * value , 1 ) *
1457+ storageBytes ( rec . dtype ) ;
1458+ const targetStrideBytes = canChunkRecord ? targetBytes / outerDim : targetBytes ;
// Decode the bytes of the current record (`sourceBytes`) into `targetTensor`.
// When canChunkRecord is false the whole record is decoded with a single
// arrayDecodeStorage call. Otherwise the record is decoded in slices along
// its first dimension, each slice written through a temporary view of
// `targetTensor`.
1459+ const copyRecordToTensor = ( targetTensor : Tensor , sourceBytes : Uint8Array ) => {
1460+ if ( ! canChunkRecord ) {
1461+ this . ctx . arrayDecodeStorage ( targetTensor , sourceBytes , rec . format , rec . dtype ) ;
1462+ return ;
1463+ }
// Rows (entries of the first dimension) per call: as many as fit within
// maxChunkBytes of source data, but never fewer than one, so a single row
// larger than the cap is still decoded in one call.
1464+ const chunkOuterDim = Math . max ( 1 , Math . floor ( maxChunkBytes / sourceStrideBytes ) ) ;
1465+ for ( let outerOffset = 0 ; outerOffset < outerDim ; outerOffset += chunkOuterDim ) {
// The last slice may hold fewer rows than chunkOuterDim.
1466+ const outerCount = Math . min ( chunkOuterDim , outerDim - outerOffset ) ;
// Source and target offsets use separate per-row strides.
// NOTE(review): presumably because the stored bytes per row (rec.nbytes based)
// can differ from the decoded bytes per row for some rec.format values - confirm.
1467+ const sourceByteOffset = outerOffset * sourceStrideBytes ;
1468+ const targetByteOffset = outerOffset * targetStrideBytes ;
1469+ const chunkBytes = outerCount * sourceStrideBytes ;
// Same shape as the record, except the first dimension covers only this slice.
1470+ const chunkShape = rec . shape . slice ( ) ;
1471+ chunkShape [ 0 ] = outerCount ;
// The shape tuple is created inside a new scope; the view itself is detached
// from that scope before being returned and is disposed explicitly below.
1472+ const chunkView = this . withNewScope ( ( ) => {
1473+ const chunkShapeTuple = this . makeShapeTuple ( chunkShape ) ;
1474+ return this . detachFromCurrentScope (
1475+ this . ctx . tensorCreateView (
1476+ targetTensor ,
1477+ chunkShapeTuple ,
1478+ rec . dtype ,
1479+ new Scalar ( targetByteOffset , "int" ) ,
1480+ )
1481+ ) ;
1482+ } ) ;
// subarray() yields a view over sourceBytes for this slice, not a copy.
1483+ const chunkSource = sourceBytes . subarray (
1484+ sourceByteOffset ,
1485+ sourceByteOffset + chunkBytes ,
1486+ ) ;
// Dispose the view whether or not decoding throws.
1487+ try {
1488+ this . ctx . arrayDecodeStorage ( chunkView , chunkSource , rec . format , rec . dtype ) ;
1489+ } finally {
1490+ chunkView . dispose ( ) ;
1491+ }
1492+ }
1493+ } ;
14251494 // first sync copy to cpu.
1426- this . ctx . arrayDecodeStorage ( cpu_arr , new Uint8Array ( recSource ) , rec . format , rec . dtype ) ;
1495+ copyRecordToTensor ( cpu_arr , recSource ) ;
14271496 // then async stream into GPU if needed
14281497 if ( device . deviceType === DeviceStrToEnum . cpu ) {
14291498 this . tensorCacheUpdate ( rec . name , cpu_arr , false ) ;
@@ -1435,7 +1504,42 @@ export class Instance implements Disposable {
14351504 this . empty ( rec . shape , rec . dtype , device )
14361505 )
14371506 } ) ;
1438- gpu_arr . copyFrom ( cpu_arr ) ;
1507+ if ( ! canChunkRecord ) {
1508+ gpu_arr . copyFrom ( cpu_arr ) ;
1509+ } else {
1510+ const chunkOuterDim = Math . max ( 1 , Math . floor ( maxChunkBytes / sourceStrideBytes ) ) ;
1511+ for ( let outerOffset = 0 ; outerOffset < outerDim ; outerOffset += chunkOuterDim ) {
1512+ const outerCount = Math . min ( chunkOuterDim , outerDim - outerOffset ) ;
1513+ const targetByteOffset = outerOffset * targetStrideBytes ;
1514+ const chunkShape = rec . shape . slice ( ) ;
1515+ chunkShape [ 0 ] = outerCount ;
1516+ const [ cpuView , gpuView ] = this . withNewScope ( ( ) => {
1517+ const chunkShapeTuple = this . makeShapeTuple ( chunkShape ) ;
1518+ const cView = this . ctx . tensorCreateView (
1519+ cpu_arr ,
1520+ chunkShapeTuple ,
1521+ rec . dtype ,
1522+ new Scalar ( targetByteOffset , "int" ) ,
1523+ ) ;
1524+ const gView = this . ctx . tensorCreateView (
1525+ gpu_arr ,
1526+ chunkShapeTuple ,
1527+ rec . dtype ,
1528+ new Scalar ( targetByteOffset , "int" ) ,
1529+ ) ;
1530+ return [
1531+ this . detachFromCurrentScope ( cView ) ,
1532+ this . detachFromCurrentScope ( gView ) ,
1533+ ] ;
1534+ } ) ;
1535+ try {
1536+ gpuView . copyFrom ( cpuView ) ;
1537+ } finally {
1538+ cpuView . dispose ( ) ;
1539+ gpuView . dispose ( ) ;
1540+ }
1541+ }
1542+ }
14391543 await device . sync ( ) ;
14401544 this . tensorCacheUpdate ( rec . name , gpu_arr , false ) ;
14411545 cpu_arr . dispose ( ) ;
@@ -2258,6 +2362,28 @@ export class Instance implements Disposable {
22582362 case TypeIndex . kTVMFFIOpaquePtr : {
22592363 return this . memory . loadPointer ( valuePtr ) ;
22602364 }
2365+ case TypeIndex . kTVMFFIShape : {
2366+ const shapeObjPtr = this . memory . loadPointer ( valuePtr ) ;
2367+ if ( shapeObjPtr === 0 ) {
2368+ return null ;
2369+ }
2370+ if ( callbackArg ) {
2371+ const shapeCellPtr = shapeObjPtr + SizeOf . ObjectHeader ;
2372+ const shapeDataPtr = this . memory . loadPointer ( shapeCellPtr ) ;
2373+ const shapeLen = this . memory . loadUSize ( shapeCellPtr + this . memory . sizeofPtr ( ) ) ;
2374+ const result = new Array < number > ( shapeLen ) ;
2375+ for ( let i = 0 ; i < shapeLen ; ++ i ) {
2376+ result [ i ] = this . memory . loadI64 ( shapeDataPtr + i * SizeOf . I64 ) ;
2377+ }
2378+ this . lib . checkCall (
2379+ ( this . lib . exports . TVMFFIObjectDecRef as ctypes . FTVMFFIObjectDecRef ) ( shapeObjPtr )
2380+ ) ;
2381+ return result ;
2382+ }
2383+ return this . ctx . attachToCurrentScope (
2384+ new TVMObject ( shapeObjPtr , this . lib , this . ctx )
2385+ ) ;
2386+ }
22612387 case TypeIndex . kTVMFFITensor : {
22622388 return this . ctx . attachToCurrentScope (
22632389 new Tensor ( this . memory . loadPointer ( valuePtr ) , this . lib , this . ctx , false )
0 commit comments