Skip to content

Commit 2f2ef58

Browse files
committed
feat(LlamaModel): useDirectIo
1 parent ae1501d commit 2f2ef58

8 files changed

Lines changed: 110 additions & 19 deletions

File tree

llama/addon/AddonModel.cpp

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -252,6 +252,10 @@ AddonModel::AddonModel(const Napi::CallbackInfo& info) : Napi::ObjectWrap<AddonM
252252
model_params.use_mmap = options.Get("useMmap").As<Napi::Boolean>().Value();
253253
}
254254

255+
if (options.Has("useDirectIo")) {
256+
model_params.use_direct_io = options.Get("useDirectIo").As<Napi::Boolean>().Value();
257+
}
258+
255259
if (options.Has("useMlock")) {
256260
model_params.use_mlock = options.Get("useMlock").As<Napi::Boolean>().Value();
257261
}

src/bindings/AddonTypes.ts

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,7 @@ export type BindingModule = {
99
gpuLayers?: number,
1010
vocabOnly?: boolean,
1111
useMmap?: boolean,
12+
useDirectIo?: boolean,
1213
useMlock?: boolean,
1314
checkTensors?: boolean,
1415
onLoadProgress?(loadPercentage: number): void,

src/cli/commands/ChatCommand.ts

Lines changed: 14 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -72,6 +72,7 @@ type ChatCommand = {
7272
meter: boolean,
7373
timing: boolean,
7474
noMmap: boolean,
75+
noDirectIo: boolean,
7576
printTimings: boolean
7677
};
7778

@@ -329,6 +330,11 @@ export const ChatCommand: CommandModule<object, ChatCommand> = {
329330
default: false,
330331
description: "Disable mmap (memory-mapped file) usage"
331332
})
333+
.option("noDirectIo", {
334+
type: "boolean",
335+
default: false,
336+
description: "Disable Direct I/O usage when available"
337+
})
332338
.option("printTimings", {
333339
alias: "pt",
334340
type: "boolean",
@@ -342,7 +348,8 @@ export const ChatCommand: CommandModule<object, ChatCommand> = {
342348
noTrimWhitespace, grammar, jsonSchemaGrammarFile, threads, temperature, minP, topK,
343349
topP, seed, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine,
344350
repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, reasoningBudget, noHistory,
345-
environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, numa, meter, timing, noMmap, printTimings
351+
environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, numa, meter, timing, noMmap, noDirectIo,
352+
printTimings
346353
}) {
347354
try {
348355
await RunChat({
@@ -351,7 +358,7 @@ export const ChatCommand: CommandModule<object, ChatCommand> = {
351358
temperature, minP, topK, topP, seed,
352359
gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty,
353360
maxTokens, reasoningBudget, noHistory, environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize,
354-
debug, numa, meter, timing, noMmap, printTimings
361+
debug, numa, meter, timing, noMmap, noDirectIo, printTimings
355362
});
356363
} catch (err) {
357364
await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing
@@ -368,7 +375,7 @@ async function RunChat({
368375
jsonSchemaGrammarFile: jsonSchemaGrammarFilePath,
369376
threads, temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine,
370377
repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, reasoningBudget, noHistory, environmentFunctions, tokenPredictionDraftModel,
371-
tokenPredictionModelContextSize, debug, numa, meter, timing, noMmap, printTimings
378+
tokenPredictionModelContextSize, debug, numa, meter, timing, noMmap, noDirectIo, printTimings
372379
}: ChatCommand) {
373380
if (contextSize === -1) contextSize = undefined;
374381
if (gpuLayers === -1) gpuLayers = undefined;
@@ -395,6 +402,7 @@ async function RunChat({
395402
});
396403
const logBatchSize = batchSize != null;
397404
const useMmap = !noMmap && llama.supportsMmap;
405+
const useDirectIo = !noDirectIo;
398406

399407
const resolvedModelPath = await resolveCommandGgufPath(modelArg, llama, headers, {
400408
flashAttention,
@@ -452,6 +460,7 @@ async function RunChat({
452460
defaultContextFlashAttention: flashAttention,
453461
defaultContextSwaFullCache: swaFullCache,
454462
useMmap,
463+
useDirectIo,
455464
ignoreMemorySafetyChecks: gpuLayers != null,
456465
onLoadProgress(loadProgress: number) {
457466
progressUpdater.setProgress(loadProgress);
@@ -486,6 +495,7 @@ async function RunChat({
486495
defaultContextFlashAttention: flashAttention,
487496
defaultContextSwaFullCache: swaFullCache,
488497
useMmap,
498+
useDirectIo,
489499
onLoadProgress(loadProgress: number) {
490500
progressUpdater.setProgress(loadProgress);
491501
},
@@ -591,6 +601,7 @@ async function RunChat({
591601
context,
592602
draftContext,
593603
useMmap,
604+
useDirectIo,
594605
printBos: true,
595606
printEos: true,
596607
logBatchSize,

src/cli/commands/CompleteCommand.ts

Lines changed: 13 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -54,6 +54,7 @@ type CompleteCommand = {
5454
meter: boolean,
5555
timing: boolean,
5656
noMmap: boolean,
57+
noDirectIo: boolean,
5758
printTimings: boolean
5859
};
5960

@@ -249,6 +250,11 @@ export const CompleteCommand: CommandModule<object, CompleteCommand> = {
249250
default: false,
250251
description: "Disable mmap (memory-mapped file) usage"
251252
})
253+
.option("noDirectIo", {
254+
type: "boolean",
255+
default: false,
256+
description: "Disable Direct I/O usage when available"
257+
})
252258
.option("printTimings", {
253259
alias: "pt",
254260
type: "boolean",
@@ -261,14 +267,14 @@ export const CompleteCommand: CommandModule<object, CompleteCommand> = {
261267
flashAttention, swaFullCache, threads, temperature, minP, topK,
262268
topP, seed, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine,
263269
repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, tokenPredictionDraftModel, tokenPredictionModelContextSize,
264-
debug, numa, meter, timing, noMmap, printTimings
270+
debug, numa, meter, timing, noMmap, noDirectIo, printTimings
265271
}) {
266272
try {
267273
await RunCompletion({
268274
modelPath, header, gpu, systemInfo, text, textFile, contextSize, batchSize, flashAttention, swaFullCache,
269275
threads, temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty,
270276
repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens,
271-
tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, numa, meter, timing, noMmap, printTimings
277+
tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, numa, meter, timing, noMmap, noDirectIo, printTimings
272278
});
273279
} catch (err) {
274280
await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing
@@ -283,7 +289,7 @@ async function RunCompletion({
283289
modelPath: modelArg, header: headerArg, gpu, systemInfo, text, textFile, contextSize, batchSize, flashAttention, swaFullCache,
284290
threads, temperature, minP, topK, topP, seed, gpuLayers,
285291
lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty,
286-
tokenPredictionDraftModel, tokenPredictionModelContextSize, maxTokens, debug, numa, meter, timing, noMmap, printTimings
292+
tokenPredictionDraftModel, tokenPredictionModelContextSize, maxTokens, debug, numa, meter, timing, noMmap, noDirectIo, printTimings
287293
}: CompleteCommand) {
288294
if (contextSize === -1) contextSize = undefined;
289295
if (gpuLayers === -1) gpuLayers = undefined;
@@ -308,6 +314,7 @@ async function RunCompletion({
308314
});
309315
const logBatchSize = batchSize != null;
310316
const useMmap = !noMmap && llama.supportsMmap;
317+
const useDirectIo = !noDirectIo;
311318

312319
const resolvedModelPath = await resolveCommandGgufPath(modelArg, llama, headers, {
313320
flashAttention,
@@ -358,6 +365,7 @@ async function RunCompletion({
358365
defaultContextFlashAttention: flashAttention,
359366
defaultContextSwaFullCache: swaFullCache,
360367
useMmap,
368+
useDirectIo,
361369
ignoreMemorySafetyChecks: gpuLayers != null,
362370
onLoadProgress(loadProgress: number) {
363371
progressUpdater.setProgress(loadProgress);
@@ -392,6 +400,7 @@ async function RunCompletion({
392400
defaultContextFlashAttention: flashAttention,
393401
defaultContextSwaFullCache: swaFullCache,
394402
useMmap,
403+
useDirectIo,
395404
onLoadProgress(loadProgress: number) {
396405
progressUpdater.setProgress(loadProgress);
397406
},
@@ -470,6 +479,7 @@ async function RunCompletion({
470479
context,
471480
draftContext,
472481
useMmap,
482+
useDirectIo,
473483
minTitleLength: "Complete".length + 1,
474484
logBatchSize,
475485
tokenMeterEnabled: meter

src/cli/commands/InfillCommand.ts

Lines changed: 13 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -56,6 +56,7 @@ type InfillCommand = {
5656
meter: boolean,
5757
timing: boolean,
5858
noMmap: boolean,
59+
noDirectIo: boolean,
5960
printTimings: boolean
6061
};
6162

@@ -259,6 +260,11 @@ export const InfillCommand: CommandModule<object, InfillCommand> = {
259260
default: false,
260261
description: "Disable mmap (memory-mapped file) usage"
261262
})
263+
.option("noDirectIo", {
264+
type: "boolean",
265+
default: false,
266+
description: "Disable Direct I/O usage when available"
267+
})
262268
.option("printTimings", {
263269
alias: "pt",
264270
type: "boolean",
@@ -271,14 +277,14 @@ export const InfillCommand: CommandModule<object, InfillCommand> = {
271277
flashAttention, swaFullCache, threads, temperature, minP, topK,
272278
topP, seed, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine,
273279
repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, tokenPredictionDraftModel, tokenPredictionModelContextSize,
274-
debug, numa, meter, timing, noMmap, printTimings
280+
debug, numa, meter, timing, noMmap, noDirectIo, printTimings
275281
}) {
276282
try {
277283
await RunInfill({
278284
modelPath, header, gpu, systemInfo, prefix, prefixFile, suffix, suffixFile, contextSize, batchSize, flashAttention,
279285
swaFullCache, threads, temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty,
280286
repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens,
281-
tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, numa, meter, timing, noMmap, printTimings
287+
tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, numa, meter, timing, noMmap, noDirectIo, printTimings
282288
});
283289
} catch (err) {
284290
await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing
@@ -293,7 +299,7 @@ async function RunInfill({
293299
modelPath: modelArg, header: headerArg, gpu, systemInfo, prefix, prefixFile, suffix, suffixFile, contextSize, batchSize, flashAttention,
294300
swaFullCache, threads, temperature, minP, topK, topP, seed, gpuLayers,
295301
lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty,
296-
tokenPredictionDraftModel, tokenPredictionModelContextSize, maxTokens, debug, numa, meter, timing, noMmap, printTimings
302+
tokenPredictionDraftModel, tokenPredictionModelContextSize, maxTokens, debug, numa, meter, timing, noMmap, noDirectIo, printTimings
297303
}: InfillCommand) {
298304
if (contextSize === -1) contextSize = undefined;
299305
if (gpuLayers === -1) gpuLayers = undefined;
@@ -318,6 +324,7 @@ async function RunInfill({
318324
});
319325
const logBatchSize = batchSize != null;
320326
const useMmap = !noMmap && llama.supportsMmap;
327+
const useDirectIo = !noDirectIo;
321328

322329
const resolvedModelPath = await resolveCommandGgufPath(modelArg, llama, headers, {
323330
flashAttention,
@@ -382,6 +389,7 @@ async function RunInfill({
382389
defaultContextFlashAttention: flashAttention,
383390
defaultContextSwaFullCache: swaFullCache,
384391
useMmap,
392+
useDirectIo,
385393
ignoreMemorySafetyChecks: gpuLayers != null,
386394
onLoadProgress(loadProgress: number) {
387395
progressUpdater.setProgress(loadProgress);
@@ -416,6 +424,7 @@ async function RunInfill({
416424
defaultContextFlashAttention: flashAttention,
417425
defaultContextSwaFullCache: swaFullCache,
418426
useMmap,
427+
useDirectIo,
419428
onLoadProgress(loadProgress: number) {
420429
progressUpdater.setProgress(loadProgress);
421430
},
@@ -494,6 +503,7 @@ async function RunInfill({
494503
context,
495504
draftContext,
496505
useMmap,
506+
useDirectIo,
497507
logBatchSize,
498508
tokenMeterEnabled: meter
499509
});

src/cli/commands/inspect/commands/InspectMeasureCommand.ts

Lines changed: 32 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -22,6 +22,7 @@ import {documentationPageUrls} from "../../../../config.js";
2222
import {Llama} from "../../../../bindings/Llama.js";
2323
import {toBytes} from "../../../utils/toBytes.js";
2424
import {padSafeContextSize} from "../../../../evaluator/LlamaContext/utils/padSafeContextSize.js";
25+
import {getPlatform} from "../../../../bindings/utils/getPlatform.js";
2526

2627
type InspectMeasureCommand = {
2728
modelPath?: string,
@@ -37,6 +38,7 @@ type InspectMeasureCommand = {
3738
measures: number,
3839
memory: "vram" | "ram" | "all",
3940
noMmap: boolean,
41+
noDirectIo: boolean,
4042
printHeaderBeforeEachLayer?: boolean,
4143
evaluateText?: string,
4244
repeatEvaluateText?: number
@@ -135,6 +137,11 @@ export const InspectMeasureCommand: CommandModule<object, InspectMeasureCommand>
135137
default: false,
136138
description: "Disable mmap (memory-mapped file) usage"
137139
})
140+
.option("noDirectIo", {
141+
type: "boolean",
142+
default: false,
143+
description: "Disable Direct I/O usage when available"
144+
})
138145
.option("printHeaderBeforeEachLayer", {
139146
alias: "ph",
140147
type: "boolean",
@@ -155,7 +162,8 @@ export const InspectMeasureCommand: CommandModule<object, InspectMeasureCommand>
155162
},
156163
async handler({
157164
modelPath: ggufPath, header: headerArg, gpu, minLayers, maxLayers, minContextSize, maxContextSize, flashAttention, swaFullCache,
158-
batchSize, measures = 10, memory: measureMemoryType, noMmap, printHeaderBeforeEachLayer = true, evaluateText, repeatEvaluateText
165+
batchSize, measures = 10, memory: measureMemoryType, noMmap, noDirectIo, printHeaderBeforeEachLayer = true, evaluateText,
166+
repeatEvaluateText
159167
}: InspectMeasureCommand) {
160168
if (maxLayers === -1) maxLayers = undefined;
161169
if (maxContextSize === -1) maxContextSize = undefined;
@@ -174,7 +182,9 @@ export const InspectMeasureCommand: CommandModule<object, InspectMeasureCommand>
174182
logLevel: LlamaLogLevel.error
175183
});
176184

185+
const platform = getPlatform();
177186
const useMmap = !noMmap && llama.supportsMmap;
187+
const useDirectIo = !noDirectIo;
178188
const resolvedGgufPath = await resolveCommandGgufPath(ggufPath, llama, headers, {
179189
flashAttention, swaFullCache, useMmap
180190
});
@@ -188,6 +198,14 @@ export const InspectMeasureCommand: CommandModule<object, InspectMeasureCommand>
188198
? "enabled"
189199
: "disabled"
190200
));
201+
202+
if (platform !== "mac") // Direct I/O is not supported on macOS
203+
console.info(chalk.yellow("Direct I/O:") + " " + (
204+
useDirectIo
205+
? "enabled"
206+
: "disabled"
207+
));
208+
191209
if (measureMemoryType === "ram" || measureMemoryType === "all")
192210
console.warn(chalk.yellow("RAM measurements are greatly inaccurate due to OS optimizations that prevent released memory from being immediately available"));
193211

@@ -221,6 +239,7 @@ export const InspectMeasureCommand: CommandModule<object, InspectMeasureCommand>
221239
const done = await measureModel({
222240
modelPath: resolvedGgufPath,
223241
useMmap,
242+
useDirectIo,
224243
gpu: gpu == null
225244
? undefined
226245
: llama.gpu,
@@ -513,11 +532,12 @@ const detectedFileName = path.basename(__filename);
513532
const expectedFileName = "InspectMeasureCommand";
514533

515534
async function measureModel({
516-
modelPath, useMmap, gpu, tests, initialMaxContextSize, maxContextSize, minContextSize, maxGpuLayers, minGpuLayers, flashAttention,
517-
swaFullCache, batchSize, evaluateText, exitAfterMeasurement = false, onInfo
535+
modelPath, useMmap, useDirectIo, gpu, tests, initialMaxContextSize, maxContextSize, minContextSize, maxGpuLayers, minGpuLayers,
536+
flashAttention, swaFullCache, batchSize, evaluateText, exitAfterMeasurement = false, onInfo
518537
}: {
519538
modelPath: string,
520539
useMmap?: boolean,
540+
useDirectIo?: boolean,
521541
gpu?: BuildGpu | "auto",
522542
tests: number,
523543
initialMaxContextSize?: number,
@@ -628,6 +648,7 @@ async function measureModel({
628648
type: "start",
629649
modelPath,
630650
useMmap,
651+
useDirectIo,
631652
tests,
632653
initialMaxContextSize,
633654
maxContextSize,
@@ -828,19 +849,20 @@ async function runTestWorkerLogic() {
828849
}
829850

830851
async function testWithGpuLayers({
831-
modelPath, useMmap, gpuLayers, tests, startContextSize, maxContextSize, minContextSize, flashAttention, swaFullCache, batchSize,
832-
evaluateText, exitAfterMeasurement = false
852+
modelPath, useMmap, useDirectIo, gpuLayers, tests, startContextSize, maxContextSize, minContextSize, flashAttention, swaFullCache,
853+
batchSize, evaluateText, exitAfterMeasurement = false
833854
}: {
834-
modelPath: string, useMmap?: boolean, gpuLayers: number, tests: number, startContextSize?: number, maxContextSize?: number,
835-
minContextSize?: number, flashAttention?: boolean, swaFullCache?: boolean, batchSize?: number, evaluateText?: string,
836-
exitAfterMeasurement?: boolean
855+
modelPath: string, useMmap?: boolean, useDirectIo?: boolean, gpuLayers: number, tests: number, startContextSize?: number,
856+
maxContextSize?: number, minContextSize?: number, flashAttention?: boolean, swaFullCache?: boolean, batchSize?: number,
857+
evaluateText?: string, exitAfterMeasurement?: boolean
837858
}) {
838859
try {
839860
const preModelVramUsage = (await llama.getVramState()).used;
840861
const preModelRamUsage = getMemoryUsage(llama);
841862
const model = await llama.loadModel({
842863
modelPath,
843864
useMmap,
865+
useDirectIo,
844866
gpuLayers,
845867
defaultContextFlashAttention: flashAttention,
846868
defaultContextSwaFullCache: swaFullCache,
@@ -908,6 +930,7 @@ async function runTestWorkerLogic() {
908930
const measurementsDone = await testWithGpuLayers({
909931
modelPath: message.modelPath,
910932
useMmap: message.useMmap,
933+
useDirectIo: message.useDirectIo,
911934
gpuLayers,
912935
tests: message.tests,
913936
startContextSize: gpuLayers == message.maxGpuLayers
@@ -1005,6 +1028,7 @@ type ParentToChildMessage = {
10051028
type: "start",
10061029
modelPath: string,
10071030
useMmap?: boolean,
1031+
useDirectIo?: boolean,
10081032
tests: number,
10091033
maxGpuLayers: number,
10101034
minGpuLayers?: number,

0 commit comments

Comments
 (0)