Skip to content

Commit 2f2ef58

Browse files
committed
feat(LlamaModel): useDirectIo
1 parent ae1501d commit 2f2ef58

8 files changed

Lines changed: 110 additions & 19 deletions

File tree

llama/addon/AddonModel.cpp

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -252,6 +252,10 @@ AddonModel::AddonModel(const Napi::CallbackInfo& info) : Napi::ObjectWrap<AddonM
252252
model_params.use_mmap = options.Get("useMmap").As<Napi::Boolean>().Value();
253253
}
254254

255+
if (options.Has("useDirectIo")) {
256+
model_params.use_direct_io = options.Get("useDirectIo").As<Napi::Boolean>().Value();
257+
}
258+
255259
if (options.Has("useMlock")) {
256260
model_params.use_mlock = options.Get("useMlock").As<Napi::Boolean>().Value();
257261
}

src/bindings/AddonTypes.ts

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,7 @@ export type BindingModule = {
99
gpuLayers?: number,
1010
vocabOnly?: boolean,
1111
useMmap?: boolean,
12+
useDirectIo?: boolean,
1213
useMlock?: boolean,
1314
checkTensors?: boolean,
1415
onLoadProgress?(loadPercentage: number): void,

src/cli/commands/ChatCommand.ts

Lines changed: 14 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -72,6 +72,7 @@ type ChatCommand = {
7272
meter: boolean,
7373
timing: boolean,
7474
noMmap: boolean,
75+
noDirectIo: boolean,
7576
printTimings: boolean
7677
};
7778

@@ -329,6 +330,11 @@ export const ChatCommand: CommandModule<object, ChatCommand> = {
329330
default: false,
330331
description: "Disable mmap (memory-mapped file) usage"
331332
})
333+
.option("noDirectIo", {
334+
type: "boolean",
335+
default: false,
336+
description: "Disable Direct I/O usage when available"
337+
})
332338
.option("printTimings", {
333339
alias: "pt",
334340
type: "boolean",
@@ -342,7 +348,8 @@ export const ChatCommand: CommandModule<object, ChatCommand> = {
342348
noTrimWhitespace, grammar, jsonSchemaGrammarFile, threads, temperature, minP, topK,
343349
topP, seed, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine,
344350
repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, reasoningBudget, noHistory,
345-
environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, numa, meter, timing, noMmap, printTimings
351+
environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, numa, meter, timing, noMmap, noDirectIo,
352+
printTimings
346353
}) {
347354
try {
348355
await RunChat({
@@ -351,7 +358,7 @@ export const ChatCommand: CommandModule<object, ChatCommand> = {
351358
temperature, minP, topK, topP, seed,
352359
gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty,
353360
maxTokens, reasoningBudget, noHistory, environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize,
354-
debug, numa, meter, timing, noMmap, printTimings
361+
debug, numa, meter, timing, noMmap, noDirectIo, printTimings
355362
});
356363
} catch (err) {
357364
await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing
@@ -368,7 +375,7 @@ async function RunChat({
368375
jsonSchemaGrammarFile: jsonSchemaGrammarFilePath,
369376
threads, temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine,
370377
repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, reasoningBudget, noHistory, environmentFunctions, tokenPredictionDraftModel,
371-
tokenPredictionModelContextSize, debug, numa, meter, timing, noMmap, printTimings
378+
tokenPredictionModelContextSize, debug, numa, meter, timing, noMmap, noDirectIo, printTimings
372379
}: ChatCommand) {
373380
if (contextSize === -1) contextSize = undefined;
374381
if (gpuLayers === -1) gpuLayers = undefined;
@@ -395,6 +402,7 @@ async function RunChat({
395402
});
396403
const logBatchSize = batchSize != null;
397404
const useMmap = !noMmap && llama.supportsMmap;
405+
const useDirectIo = !noDirectIo;
398406

399407
const resolvedModelPath = await resolveCommandGgufPath(modelArg, llama, headers, {
400408
flashAttention,
@@ -452,6 +460,7 @@ async function RunChat({
452460
defaultContextFlashAttention: flashAttention,
453461
defaultContextSwaFullCache: swaFullCache,
454462
useMmap,
463+
useDirectIo,
455464
ignoreMemorySafetyChecks: gpuLayers != null,
456465
onLoadProgress(loadProgress: number) {
457466
progressUpdater.setProgress(loadProgress);
@@ -486,6 +495,7 @@ async function RunChat({
486495
defaultContextFlashAttention: flashAttention,
487496
defaultContextSwaFullCache: swaFullCache,
488497
useMmap,
498+
useDirectIo,
489499
onLoadProgress(loadProgress: number) {
490500
progressUpdater.setProgress(loadProgress);
491501
},
@@ -591,6 +601,7 @@ async function RunChat({
591601
context,
592602
draftContext,
593603
useMmap,
604+
useDirectIo,
594605
printBos: true,
595606
printEos: true,
596607
logBatchSize,

src/cli/commands/CompleteCommand.ts

Lines changed: 13 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -54,6 +54,7 @@ type CompleteCommand = {
5454
meter: boolean,
5555
timing: boolean,
5656
noMmap: boolean,
57+
noDirectIo: boolean,
5758
printTimings: boolean
5859
};
5960

@@ -249,6 +250,11 @@ export const CompleteCommand: CommandModule<object, CompleteCommand> = {
249250
default: false,
250251
description: "Disable mmap (memory-mapped file) usage"
251252
})
253+
.option("noDirectIo", {
254+
type: "boolean",
255+
default: false,
256+
description: "Disable Direct I/O usage when available"
257+
})
252258
.option("printTimings", {
253259
alias: "pt",
254260
type: "boolean",
@@ -261,14 +267,14 @@ export const CompleteCommand: CommandModule<object, CompleteCommand> = {
261267
flashAttention, swaFullCache, threads, temperature, minP, topK,
262268
topP, seed, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine,
263269
repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, tokenPredictionDraftModel, tokenPredictionModelContextSize,
264-
debug, numa, meter, timing, noMmap, printTimings
270+
debug, numa, meter, timing, noMmap, noDirectIo, printTimings
265271
}) {
266272
try {
267273
await RunCompletion({
268274
modelPath, header, gpu, systemInfo, text, textFile, contextSize, batchSize, flashAttention, swaFullCache,
269275
threads, temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty,
270276
repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens,
271-
tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, numa, meter, timing, noMmap, printTimings
277+
tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, numa, meter, timing, noMmap, noDirectIo, printTimings
272278
});
273279
} catch (err) {
274280
await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing
@@ -283,7 +289,7 @@ async function RunCompletion({
283289
modelPath: modelArg, header: headerArg, gpu, systemInfo, text, textFile, contextSize, batchSize, flashAttention, swaFullCache,
284290
threads, temperature, minP, topK, topP, seed, gpuLayers,
285291
lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty,
286-
tokenPredictionDraftModel, tokenPredictionModelContextSize, maxTokens, debug, numa, meter, timing, noMmap, printTimings
292+
tokenPredictionDraftModel, tokenPredictionModelContextSize, maxTokens, debug, numa, meter, timing, noMmap, noDirectIo, printTimings
287293
}: CompleteCommand) {
288294
if (contextSize === -1) contextSize = undefined;
289295
if (gpuLayers === -1) gpuLayers = undefined;
@@ -308,6 +314,7 @@ async function RunCompletion({
308314
});
309315
const logBatchSize = batchSize != null;
310316
const useMmap = !noMmap && llama.supportsMmap;
317+
const useDirectIo = !noDirectIo;
311318

312319
const resolvedModelPath = await resolveCommandGgufPath(modelArg, llama, headers, {
313320
flashAttention,
@@ -358,6 +365,7 @@ async function RunCompletion({
358365
defaultContextFlashAttention: flashAttention,
359366
defaultContextSwaFullCache: swaFullCache,
360367
useMmap,
368+
useDirectIo,
361369
ignoreMemorySafetyChecks: gpuLayers != null,
362370
onLoadProgress(loadProgress: number) {
363371
progressUpdater.setProgress(loadProgress);
@@ -392,6 +400,7 @@ async function RunCompletion({
392400
defaultContextFlashAttention: flashAttention,
393401
defaultContextSwaFullCache: swaFullCache,
394402
useMmap,
403+
useDirectIo,
395404
onLoadProgress(loadProgress: number) {
396405
progressUpdater.setProgress(loadProgress);
397406
},
@@ -470,6 +479,7 @@ async function RunCompletion({
470479
context,
471480
draftContext,
472481
useMmap,
482+
useDirectIo,
473483
minTitleLength: "Complete".length + 1,
474484
logBatchSize,
475485
tokenMeterEnabled: meter

src/cli/commands/InfillCommand.ts

Lines changed: 13 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -56,6 +56,7 @@ type InfillCommand = {
5656
meter: boolean,
5757
timing: boolean,
5858
noMmap: boolean,
59+
noDirectIo: boolean,
5960
printTimings: boolean
6061
};
6162

@@ -259,6 +260,11 @@ export const InfillCommand: CommandModule<object, InfillCommand> = {
259260
default: false,
260261
description: "Disable mmap (memory-mapped file) usage"
261262
})
263+
.option("noDirectIo", {
264+
type: "boolean",
265+
default: false,
266+
description: "Disable Direct I/O usage when available"
267+
})
262268
.option("printTimings", {
263269
alias: "pt",
264270
type: "boolean",
@@ -271,14 +277,14 @@ export const InfillCommand: CommandModule<object, InfillCommand> = {
271277
flashAttention, swaFullCache, threads, temperature, minP, topK,
272278
topP, seed, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine,
273279
repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, tokenPredictionDraftModel, tokenPredictionModelContextSize,
274-
debug, numa, meter, timing, noMmap, printTimings
280+
debug, numa, meter, timing, noMmap, noDirectIo, printTimings
275281
}) {
276282
try {
277283
await RunInfill({
278284
modelPath, header, gpu, systemInfo, prefix, prefixFile, suffix, suffixFile, contextSize, batchSize, flashAttention,
279285
swaFullCache, threads, temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty,
280286
repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens,
281-
tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, numa, meter, timing, noMmap, printTimings
287+
tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, numa, meter, timing, noMmap, noDirectIo, printTimings
282288
});
283289
} catch (err) {
284290
await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing
@@ -293,7 +299,7 @@ async function RunInfill({
293299
modelPath: modelArg, header: headerArg, gpu, systemInfo, prefix, prefixFile, suffix, suffixFile, contextSize, batchSize, flashAttention,
294300
swaFullCache, threads, temperature, minP, topK, topP, seed, gpuLayers,
295301
lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty,
296-
tokenPredictionDraftModel, tokenPredictionModelContextSize, maxTokens, debug, numa, meter, timing, noMmap, printTimings
302+
tokenPredictionDraftModel, tokenPredictionModelContextSize, maxTokens, debug, numa, meter, timing, noMmap, noDirectIo, printTimings
297303
}: InfillCommand) {
298304
if (contextSize === -1) contextSize = undefined;
299305
if (gpuLayers === -1) gpuLayers = undefined;
@@ -318,6 +324,7 @@ async function RunInfill({
318324
});
319325
const logBatchSize = batchSize != null;
320326
const useMmap = !noMmap && llama.supportsMmap;
327+
const useDirectIo = !noDirectIo;
321328

322329
const resolvedModelPath = await resolveCommandGgufPath(modelArg, llama, headers, {
323330
flashAttention,
@@ -382,6 +389,7 @@ async function RunInfill({
382389
defaultContextFlashAttention: flashAttention,
383390
defaultContextSwaFullCache: swaFullCache,
384391
useMmap,
392+
useDirectIo,
385393
ignoreMemorySafetyChecks: gpuLayers != null,
386394
onLoadProgress(loadProgress: number) {
387395
progressUpdater.setProgress(loadProgress);
@@ -416,6 +424,7 @@ async function RunInfill({
416424
defaultContextFlashAttention: flashAttention,
417425
defaultContextSwaFullCache: swaFullCache,
418426
useMmap,
427+
useDirectIo,
419428
onLoadProgress(loadProgress: number) {
420429
progressUpdater.setProgress(loadProgress);
421430
},
@@ -494,6 +503,7 @@ async function RunInfill({
494503
context,
495504
draftContext,
496505
useMmap,
506+
useDirectIo,
497507
logBatchSize,
498508
tokenMeterEnabled: meter
499509
});

src/cli/commands/inspect/commands/InspectMeasureCommand.ts

Lines changed: 32 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -22,6 +22,7 @@ import {documentationPageUrls} from "../../../../config.js";
2222
import {Llama} from "../../../../bindings/Llama.js";
2323
import {toBytes} from "../../../utils/toBytes.js";
2424
import {padSafeContextSize} from "../../../../evaluator/LlamaContext/utils/padSafeContextSize.js";
25+
import {getPlatform} from "../../../../bindings/utils/getPlatform.js";
2526

2627
type InspectMeasureCommand = {
2728
modelPath?: string,
@@ -37,6 +38,7 @@ type InspectMeasureCommand = {
3738
measures: number,
3839
memory: "vram" | "ram" | "all",
3940
noMmap: boolean,
41+
noDirectIo: boolean,
4042
printHeaderBeforeEachLayer?: boolean,
4143
evaluateText?: string,
4244
repeatEvaluateText?: number
@@ -135,6 +137,11 @@ export const InspectMeasureCommand: CommandModule<object, InspectMeasureCommand>
135137
default: false,
136138
description: "Disable mmap (memory-mapped file) usage"
137139
})
140+
.option("noDirectIo", {
141+
type: "boolean",
142+
default: false,
143+
description: "Disable Direct I/O usage when available"
144+
})
138145
.option("printHeaderBeforeEachLayer", {
139146
alias: "ph",
140147
type: "boolean",
@@ -155,7 +162,8 @@ export const InspectMeasureCommand: CommandModule<object, InspectMeasureCommand>
155162
},
156163
async handler({
157164
modelPath: ggufPath, header: headerArg, gpu, minLayers, maxLayers, minContextSize, maxContextSize, flashAttention, swaFullCache,
158-
batchSize, measures = 10, memory: measureMemoryType, noMmap, printHeaderBeforeEachLayer = true, evaluateText, repeatEvaluateText
165+
batchSize, measures = 10, memory: measureMemoryType, noMmap, noDirectIo, printHeaderBeforeEachLayer = true, evaluateText,
166+
repeatEvaluateText
159167
}: InspectMeasureCommand) {
160168
if (maxLayers === -1) maxLayers = undefined;
161169
if (maxContextSize === -1) maxContextSize = undefined;
@@ -174,7 +182,9 @@ export const InspectMeasureCommand: CommandModule<object, InspectMeasureCommand>
174182
logLevel: LlamaLogLevel.error
175183
});
176184

185+
const platform = getPlatform();
177186
const useMmap = !noMmap && llama.supportsMmap;
187+
const useDirectIo = !noDirectIo;
178188
const resolvedGgufPath = await resolveCommandGgufPath(ggufPath, llama, headers, {
179189
flashAttention, swaFullCache, useMmap
180190
});
@@ -188,6 +198,14 @@ export const InspectMeasureCommand: CommandModule<object, InspectMeasureCommand>
188198
? "enabled"
189199
: "disabled"
190200
));
201+
202+
if (platform !== "mac") // Direct I/O is not supported on macOS
203+
console.info(chalk.yellow("Direct I/O:") + " " + (
204+
useDirectIo
205+
? "enabled"
206+
: "disabled"
207+
));
208+
191209
if (measureMemoryType === "ram" || measureMemoryType === "all")
192210
console.warn(chalk.yellow("RAM measurements are greatly inaccurate due to OS optimizations that prevent released memory from being immediately available"));
193211

@@ -221,6 +239,7 @@ export const InspectMeasureCommand: CommandModule<object, InspectMeasureCommand>
221239
const done = await measureModel({
222240
modelPath: resolvedGgufPath,
223241
useMmap,
242+
useDirectIo,
224243
gpu: gpu == null
225244
? undefined
226245
: llama.gpu,
@@ -513,11 +532,12 @@ const detectedFileName = path.basename(__filename);
513532
const expectedFileName = "InspectMeasureCommand";
514533

515534
async function measureModel({
516-
modelPath, useMmap, gpu, tests, initialMaxContextSize, maxContextSize, minContextSize, maxGpuLayers, minGpuLayers, flashAttention,
517-
swaFullCache, batchSize, evaluateText, exitAfterMeasurement = false, onInfo
535+
modelPath, useMmap, useDirectIo, gpu, tests, initialMaxContextSize, maxContextSize, minContextSize, maxGpuLayers, minGpuLayers,
536+
flashAttention, swaFullCache, batchSize, evaluateText, exitAfterMeasurement = false, onInfo
518537
}: {
519538
modelPath: string,
520539
useMmap?: boolean,
540+
useDirectIo?: boolean,
521541
gpu?: BuildGpu | "auto",
522542
tests: number,
523543
initialMaxContextSize?: number,
@@ -628,6 +648,7 @@ async function measureModel({
628648
type: "start",
629649
modelPath,
630650
useMmap,
651+
useDirectIo,
631652
tests,
632653
initialMaxContextSize,
633654
maxContextSize,
@@ -828,19 +849,20 @@ async function runTestWorkerLogic() {
828849
}
829850

830851
async function testWithGpuLayers({
831-
modelPath, useMmap, gpuLayers, tests, startContextSize, maxContextSize, minContextSize, flashAttention, swaFullCache, batchSize,
832-
evaluateText, exitAfterMeasurement = false
852+
modelPath, useMmap, useDirectIo, gpuLayers, tests, startContextSize, maxContextSize, minContextSize, flashAttention, swaFullCache,
853+
batchSize, evaluateText, exitAfterMeasurement = false
833854
}: {
834-
modelPath: string, useMmap?: boolean, gpuLayers: number, tests: number, startContextSize?: number, maxContextSize?: number,
835-
minContextSize?: number, flashAttention?: boolean, swaFullCache?: boolean, batchSize?: number, evaluateText?: string,
836-
exitAfterMeasurement?: boolean
855+
modelPath: string, useMmap?: boolean, useDirectIo?: boolean, gpuLayers: number, tests: number, startContextSize?: number,
856+
maxContextSize?: number, minContextSize?: number, flashAttention?: boolean, swaFullCache?: boolean, batchSize?: number,
857+
evaluateText?: string, exitAfterMeasurement?: boolean
837858
}) {
838859
try {
839860
const preModelVramUsage = (await llama.getVramState()).used;
840861
const preModelRamUsage = getMemoryUsage(llama);
841862
const model = await llama.loadModel({
842863
modelPath,
843864
useMmap,
865+
useDirectIo,
844866
gpuLayers,
845867
defaultContextFlashAttention: flashAttention,
846868
defaultContextSwaFullCache: swaFullCache,
@@ -908,6 +930,7 @@ async function runTestWorkerLogic() {
908930
const measurementsDone = await testWithGpuLayers({
909931
modelPath: message.modelPath,
910932
useMmap: message.useMmap,
933+
useDirectIo: message.useDirectIo,
911934
gpuLayers,
912935
tests: message.tests,
913936
startContextSize: gpuLayers == message.maxGpuLayers
@@ -1005,6 +1028,7 @@ type ParentToChildMessage = {
10051028
type: "start",
10061029
modelPath: string,
10071030
useMmap?: boolean,
1031+
useDirectIo?: boolean,
10081032
tests: number,
10091033
maxGpuLayers: number,
10101034
minGpuLayers?: number,

0 commit comments

Comments
 (0)