@@ -1010,9 +1010,11 @@ export class Instance implements Disposable {
10101010 */
withNewScope<T>(action: () => T): T {
  this.beginScope();
  try {
    return action();
  } finally {
    // Runs on normal return and on throw alike, so a failing action cannot
    // leave the scope opened above unclosed.
    this.endScope();
  }
}
10171019
10181020 /**
@@ -1323,6 +1325,38 @@ export class Instance implements Disposable {
13231325 artifactCache : ArtifactCacheTemplate ,
13241326 signal ?: AbortSignal ,
13251327 ) {
1328+ // Avoid a single JS-to-wasm byte-array call for multi-hundred-MiB
1329+ // tensor-cache records. The cap is a conservative per-call staging size,
1330+ // independent of the final tensor allocation size. Smaller records keep
1331+ // the existing full-record path.
1332+ const maxChunkBytes = 128 * 1024 * 1024 ;
/**
 * Number of bytes occupied by one element of `dtype`, counting all lanes and
 * rounding the bit count up to whole bytes.
 *
 * Accepted spellings are `bool`, or one of `bfloat` / `float` / `uint` / `int`
 * followed by a decimal bit width and an optional `_suffix` (for example
 * `float8_e4m3fn`), each optionally followed by `x<lanes>`.
 *
 * @param dtype The dtype string to size.
 * @returns Bytes per element; `bool` counts one byte per lane.
 * @throws Error when the width or the lane count cannot be determined.
 */
const storageBytes = (dtype: string): number => {
  const unsupported = () =>
    new Error("Cannot determine storage width of dtype " + dtype);

  const vectorLaneSeparator = dtype.indexOf("x");
  let scalarType = dtype;
  let lanes = 1;
  if (vectorLaneSeparator !== -1) {
    scalarType = dtype.slice(0, vectorLaneSeparator);
    const laneText = dtype.slice(vectorLaneSeparator + 1);
    // Plain decimal digits only: Number() alone would also accept hex,
    // exponent and whitespace-padded forms that are not lane counts.
    if (!/^[0-9]+$/.test(laneText)) {
      throw unsupported();
    }
    lanes = Number(laneText);
  }
  if (!Number.isInteger(lanes) || lanes <= 0) {
    throw unsupported();
  }

  if (scalarType === "bool") {
    return lanes;
  }

  // The bit width is the digit run right after the type prefix; a trailing
  // `_suffix` names the encoding and does not change the storage width.
  const widthMatch =
    /^(?:bfloat|float|uint|int)([0-9]+)(?:_[a-z0-9]+)?$/.exec(scalarType);
  if (widthMatch !== null) {
    const bits = Number(widthMatch[1]);
    if (Number.isInteger(bits) && bits > 0) {
      return (bits * lanes + 7) >> 3;
    }
  }

  throw unsupported();
};
13261360 const perf = compact . getPerformance ( ) ;
13271361 const tstart = perf . now ( ) ;
13281362 let totalBytes = 0 ;
@@ -1421,9 +1455,59 @@ export class Instance implements Disposable {
14211455 this . empty ( rec . shape , rec . dtype , this . cpu ( ) )
14221456 )
14231457 } ) ;
1424- const recSource = buffer . slice ( rec . byteOffset , rec . byteOffset + rec . nbytes ) ;
// View the shard as bytes without copying: reuse `buffer` when it already is
// a Uint8Array, otherwise wrap it in one.
const shardBytes = buffer instanceof Uint8Array ? buffer : new Uint8Array(buffer);
// `subarray` shares the shard's memory; a record spanning the whole shard
// needs no extra view at all.
const recSource =
  rec.byteOffset === 0 && rec.nbytes === shardBytes.byteLength
    ? shardBytes
    : shardBytes.subarray(rec.byteOffset, rec.byteOffset + rec.nbytes);
// Chunk only oversized records whose encoded bytes divide evenly across the
// leading dimension, so every chunk covers whole leading-dimension slices.
const canChunkRecord =
  rec.nbytes > maxChunkBytes &&
  rec.shape.length >= 1 &&
  Number.isInteger(rec.shape[0]) &&
  rec.shape[0] > 0 &&
  rec.nbytes % rec.shape[0] === 0;
const outerDim = canChunkRecord ? rec.shape[0] : 1;
const sourceStrideBytes = canChunkRecord ? rec.nbytes / outerDim : rec.nbytes;
// Size the decoded tensor only when chunking. storageBytes throws on dtype
// names it cannot parse, and the single-call path never reads the target
// stride, so records that are not chunked must not depend on it.
const targetBytes = canChunkRecord
  ? rec.shape.reduce((acc, value) => acc * value, 1) * storageBytes(rec.dtype)
  : 0;
const targetStrideBytes = canChunkRecord ? targetBytes / outerDim : 0;
/**
 * Decode `sourceBytes` into `targetTensor`.
 *
 * Records that are not chunked are decoded with one arrayDecodeStorage call.
 * Chunked records are decoded in runs of leading-dimension slices, each run
 * written through a temporary view positioned at the matching byte offset of
 * `targetTensor`.
 */
const copyRecordToTensor = (targetTensor: Tensor, sourceBytes: Uint8Array) => {
  if (canChunkRecord) {
    // At least one slice per call, even when a single slice exceeds the cap.
    const slicesPerCall = Math.max(1, Math.floor(maxChunkBytes / sourceStrideBytes));
    let firstSlice = 0;
    while (firstSlice < outerDim) {
      const sliceCount = Math.min(slicesPerCall, outerDim - firstSlice);
      const srcBegin = firstSlice * sourceStrideBytes;
      const srcEnd = srcBegin + sliceCount * sourceStrideBytes;
      const dstBegin = firstSlice * targetStrideBytes;
      const viewShape = [sliceCount, ...rec.shape.slice(1)];
      // Build the view inside a scope so the temporary shape tuple is released
      // with it; the view itself is detached and disposed explicitly below.
      const view = this.withNewScope(() => {
        const shapeTuple = this.makeShapeTuple(viewShape);
        const created = this.ctx.tensorCreateView(
          targetTensor,
          shapeTuple,
          rec.dtype,
          new Scalar(dstBegin, "int"),
        );
        return this.detachFromCurrentScope(created);
      });
      const encoded = sourceBytes.subarray(srcBegin, srcEnd);
      try {
        this.ctx.arrayDecodeStorage(view, encoded, rec.format, rec.dtype);
      } finally {
        view.dispose();
      }
      firstSlice += slicesPerCall;
    }
    return;
  }
  this.ctx.arrayDecodeStorage(targetTensor, sourceBytes, rec.format, rec.dtype);
};
14251509 // first sync copy to cpu.
1426- this . ctx . arrayDecodeStorage ( cpu_arr , new Uint8Array ( recSource ) , rec . format , rec . dtype ) ;
1510+ copyRecordToTensor ( cpu_arr , recSource ) ;
14271511 // then async stream into GPU if needed
14281512 if ( device . deviceType === DeviceStrToEnum . cpu ) {
14291513 this . tensorCacheUpdate ( rec . name , cpu_arr , false ) ;
@@ -1435,7 +1519,42 @@ export class Instance implements Disposable {
14351519 this . empty ( rec . shape , rec . dtype , device )
14361520 )
14371521 } ) ;
1438- gpu_arr . copyFrom ( cpu_arr ) ;
1522+ if ( ! canChunkRecord ) {
1523+ gpu_arr . copyFrom ( cpu_arr ) ;
1524+ } else {
1525+ const chunkOuterDim = Math . max ( 1 , Math . floor ( maxChunkBytes / sourceStrideBytes ) ) ;
1526+ for ( let outerOffset = 0 ; outerOffset < outerDim ; outerOffset += chunkOuterDim ) {
1527+ const outerCount = Math . min ( chunkOuterDim , outerDim - outerOffset ) ;
1528+ const targetByteOffset = outerOffset * targetStrideBytes ;
1529+ const chunkShape = rec . shape . slice ( ) ;
1530+ chunkShape [ 0 ] = outerCount ;
1531+ const [ cpuView , gpuView ] = this . withNewScope ( ( ) => {
1532+ const chunkShapeTuple = this . makeShapeTuple ( chunkShape ) ;
1533+ const cView = this . ctx . tensorCreateView (
1534+ cpu_arr ,
1535+ chunkShapeTuple ,
1536+ rec . dtype ,
1537+ new Scalar ( targetByteOffset , "int" ) ,
1538+ ) ;
1539+ const gView = this . ctx . tensorCreateView (
1540+ gpu_arr ,
1541+ chunkShapeTuple ,
1542+ rec . dtype ,
1543+ new Scalar ( targetByteOffset , "int" ) ,
1544+ ) ;
1545+ return [
1546+ this . detachFromCurrentScope ( cView ) ,
1547+ this . detachFromCurrentScope ( gView ) ,
1548+ ] ;
1549+ } ) ;
1550+ try {
1551+ gpuView . copyFrom ( cpuView ) ;
1552+ } finally {
1553+ cpuView . dispose ( ) ;
1554+ gpuView . dispose ( ) ;
1555+ }
1556+ }
1557+ }
14391558 await device . sync ( ) ;
14401559 this . tensorCacheUpdate ( rec . name , gpu_arr , false ) ;
14411560 cpu_arr . dispose ( ) ;
@@ -2258,6 +2377,28 @@ export class Instance implements Disposable {
22582377 case TypeIndex . kTVMFFIOpaquePtr : {
22592378 return this . memory . loadPointer ( valuePtr ) ;
22602379 }
2380+ case TypeIndex . kTVMFFIShape : {
2381+ const shapeObjPtr = this . memory . loadPointer ( valuePtr ) ;
2382+ if ( shapeObjPtr === 0 ) {
2383+ return null ;
2384+ }
2385+ if ( callbackArg ) {
2386+ const shapeCellPtr = shapeObjPtr + SizeOf . ObjectHeader ;
2387+ const shapeDataPtr = this . memory . loadPointer ( shapeCellPtr ) ;
2388+ const shapeLen = this . memory . loadUSize ( shapeCellPtr + this . memory . sizeofPtr ( ) ) ;
2389+ const result = new Array < number > ( shapeLen ) ;
2390+ for ( let i = 0 ; i < shapeLen ; ++ i ) {
2391+ result [ i ] = this . memory . loadI64 ( shapeDataPtr + i * SizeOf . I64 ) ;
2392+ }
2393+ this . lib . checkCall (
2394+ ( this . lib . exports . TVMFFIObjectDecRef as ctypes . FTVMFFIObjectDecRef ) ( shapeObjPtr )
2395+ ) ;
2396+ return result ;
2397+ }
2398+ return this . ctx . attachToCurrentScope (
2399+ new TVMObject ( shapeObjPtr , this . lib , this . ctx )
2400+ ) ;
2401+ }
22612402 case TypeIndex . kTVMFFITensor : {
22622403 return this . ctx . attachToCurrentScope (
22632404 new Tensor ( this . memory . loadPointer ( valuePtr ) , this . lib , this . ctx , false )
0 commit comments