diff --git a/docs/docs/02-hooks/01-natural-language-processing/useLLM.md b/docs/docs/02-hooks/01-natural-language-processing/useLLM.md index 37ce45b557..f88979cc4f 100644 --- a/docs/docs/02-hooks/01-natural-language-processing/useLLM.md +++ b/docs/docs/02-hooks/01-natural-language-processing/useLLM.md @@ -109,6 +109,7 @@ interface LLMType { toolsConfig?: ToolsConfig; generationConfig?: GenerationConfig; }) => void; + getGeneratedTokenCount: () => number; generate: (messages: Message[], tools?: LLMTool[]) => Promise; sendMessage: (message: string) => Promise; deleteMessage: (index: number) => void; @@ -129,6 +130,11 @@ interface ChatConfig { systemPrompt: string; } +interface GenerationConfig { + outputTokenBatchSize: number; + batchTimeInterval: number; +} + // tool calling interface ToolsConfig { tools: LLMTool[]; @@ -141,11 +147,6 @@ interface ToolCall { arguments: Object; } -interface GenerationConfig { - outputTokenBatchSize: number; - batchTimeInterval: number; -} - type LLMTool = Object; ``` diff --git a/docs/docs/03-typescript-api/01-natural-language-processing/LLMModule.md b/docs/docs/03-typescript-api/01-natural-language-processing/LLMModule.md index 00af4d561d..a76b028fb6 100644 --- a/docs/docs/03-typescript-api/01-natural-language-processing/LLMModule.md +++ b/docs/docs/03-typescript-api/01-natural-language-processing/LLMModule.md @@ -62,6 +62,11 @@ interface ChatConfig { systemPrompt: string; } +interface GenerationConfig { + outputTokenBatchSize: number; + batchTimeInterval: number; +} + // tool calling interface ToolsConfig { tools: LLMTool[]; @@ -69,11 +74,6 @@ interface ToolsConfig { displayToolCalls?: boolean; } -interface GenerationConfig { - outputTokenBatchSize: number; - batchTimeInterval: number; -} - interface ToolCall { toolName: string; arguments: Object; diff --git a/docs/versioned_docs/version-0.5.x/02-hooks/01-natural-language-processing/useLLM.md b/docs/versioned_docs/version-0.5.x/02-hooks/01-natural-language-processing/useLLM.md index 736b5fd114..f639a6cf62 100644 --- a/docs/versioned_docs/version-0.5.x/02-hooks/01-natural-language-processing/useLLM.md +++ b/docs/versioned_docs/version-0.5.x/02-hooks/01-natural-language-processing/useLLM.md @@ -64,20 +64,21 @@ For more information on loading resources, take a look at [loading models](../.. ### Returns -| Field | Type | Description | -| ------------------ | --------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------- | -| `generate()` | `(messages: Message[], tools?: LLMTool[]) => Promise` | Runs model to complete chat passed in `messages` argument. It doesn't manage conversation context. | -| `interrupt()` | `() => void` | Function to interrupt the current inference. | -| `response` | `string` | State of the generated response. This field is updated with each token generated by the model. | -| `token` | `string` | The most recently generated token. | -| `isReady` | `boolean` | Indicates whether the model is ready. | -| `isGenerating` | `boolean` | Indicates whether the model is currently generating a response. | -| `downloadProgress` | `number` | Represents the download progress as a value between 0 and 1, indicating the extent of the model file retrieval. | -| `error` | string | null | Contains the error message if the model failed to load. 
| -| `configure` | `({ chatConfig?: Partial, toolsConfig?: ToolsConfig }) => void` | Configures chat and tool calling. See more details in [configuring the model](#configuring-the-model). | -| `sendMessage` | `(message: string) => Promise` | Function to add user message to conversation. After model responds, `messageHistory` will be updated with both user message and model response. | -| `deleteMessage` | `(index: number) => void` | Deletes all messages starting with message on `index` position. After deletion `messageHistory` will be updated. | -| `messageHistory` | `Message[]` | History containing all messages in conversation. This field is updated after model responds to `sendMessage`. | +| Field | Type | Description | +| ------------------------ | -------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------- | +| `generate()` | `(messages: Message[], tools?: LLMTool[]) => Promise` | Runs model to complete chat passed in `messages` argument. It doesn't manage conversation context. | +| `interrupt()` | `() => void` | Function to interrupt the current inference. | +| `response` | `string` | State of the generated response. This field is updated with each token generated by the model. | +| `token` | `string` | The most recently generated token. | +| `isReady` | `boolean` | Indicates whether the model is ready. | +| `isGenerating` | `boolean` | Indicates whether the model is currently generating a response. | +| `downloadProgress` | `number` | Represents the download progress as a value between 0 and 1, indicating the extent of the model file retrieval. | +| `error` | string | null | Contains the error message if the model failed to load. | +| `configure` | `({chatConfig?: Partial, toolsConfig?: ToolsConfig, generationConfig?: GenerationConfig}) => void` | Configures chat and tool calling. See more details in [configuring the model](#configuring-the-model). | +| `sendMessage` | `(message: string) => Promise` | Function to add user message to conversation. After model responds, `messageHistory` will be updated with both user message and model response. | +| `deleteMessage` | `(index: number) => void` | Deletes all messages starting with message on `index` position. After deletion `messageHistory` will be updated. | +| `messageHistory` | `Message[]` | History containing all messages in conversation. This field is updated after model responds to `sendMessage`. | +| `getGeneratedTokenCount` | `() => number` | Returns the number of tokens generated in the last response. |
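+
+As a rough illustration (assuming `llm` is the object returned by `useLLM` and `messages` is a `Message[]` you maintain yourself), `getGeneratedTokenCount` can be combined with a timer to estimate generation speed:
+
+```typescript
+const start = performance.now();
+await llm.generate(messages);
+
+// Number of tokens the model produced for the last response.
+const tokenCount = llm.getGeneratedTokenCount();
+const seconds = (performance.now() - start) / 1000;
+console.log(`${tokenCount} tokens in ${seconds.toFixed(2)} s (~${(tokenCount / seconds).toFixed(1)} tok/s)`);
+```
+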
 Type definitions
@@ -106,10 +107,13 @@ interface LLMType {
   configure: ({
     chatConfig,
     toolsConfig,
+    generationConfig,
   }: {
     chatConfig?: Partial<ChatConfig>;
     toolsConfig?: ToolsConfig;
+    generationConfig?: GenerationConfig;
   }) => void;
+  getGeneratedTokenCount: () => number;
   generate: (messages: Message[], tools?: LLMTool[]) => Promise;
   sendMessage: (message: string) => Promise;
   deleteMessage: (index: number) => void;
@@ -130,6 +134,11 @@ interface ChatConfig {
   systemPrompt: string;
 }
 
+interface GenerationConfig {
+  outputTokenBatchSize: number;
+  batchTimeInterval: number;
+}
+
 // tool calling
 interface ToolsConfig {
   tools: LLMTool[];
@@ -151,7 +160,7 @@ type LLMTool = Object;
 
 You can use functions returned from this hooks in two manners:
 
-1. Functional/pure - we will not keep any state for you. You'll need to keep conversation history and handle function calling yourself. Use `generate` (and rarely `forward`) and `response`. Note that you don't need to run `configure` to use those. Furthermore, it will not have any effect on those functions.
+1. Functional/pure - we will not keep any state for you. You'll need to keep conversation history and handle function calling yourself. Use `generate` (and rarely `forward`) and `response`. Note that you don't need to run `configure` to use those. Furthermore, `chatConfig` and `toolsConfig` will not have any effect on those functions.
 
 2. Managed/stateful - we will manage conversation state. Tool calls will be parsed and called automatically after passing appropriate callbacks. See more at [managed LLM chat](#managed-llm-chat).
 
@@ -271,6 +280,12 @@ To configure model (i.e. change system prompt, load initial conversation history
 
 - **`displayToolCalls`** - If set to true, JSON tool calls will be displayed in chat. If false, only answers will be displayed.
 
+**`generationConfig`** - Object configuring generation settings; currently this covers only output token batching.
+
+- **`outputTokenBatchSize`** - Soft upper limit on the number of tokens in each emitted batch (in some cases a batch can contain more tokens, e.g. when it would otherwise end with a special emoji join character).
+
+- **`batchTimeInterval`** - Upper limit on the time interval between consecutive token batches, in milliseconds.
+
 ### Sending a message
 
 In order to send a message to the model, one can use the following code:
@@ -463,6 +478,10 @@ The response should include JSON:
 }
 ```
 
+## Token Batching
+
+Depending on the selected model and the user's device, generation speed can exceed 60 tokens per second. If the `tokenCallback` triggers rerenders and is invoked on every single token, it can significantly degrade the app's performance. To alleviate this, we've implemented token batching. To configure it, call the `configure` method and pass a `generationConfig` with two parameters: `outputTokenBatchSize` and `batchTimeInterval`. They set the number of tokens collected before a batch is emitted and the maximum time interval between consecutive batches, respectively. A batch is emitted when either `batchTimeInterval` elapses since the last batch or `outputTokenBatchSize` tokens have been generated, whichever comes first. This keeps updates smooth even if the model briefly stalls during generation. The defaults are 10 tokens and 80 ms (roughly 12 batches per second).
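+
+As a quick sketch (the values below are arbitrary examples, not recommendations), batching is tuned by passing `generationConfig` to `configure`:
+
+```typescript
+// `llm` is the object returned by useLLM. The defaults are 10 tokens / 80 ms.
+llm.configure({
+  generationConfig: {
+    outputTokenBatchSize: 20, // emit after ~20 tokens...
+    batchTimeInterval: 150, // ...or after 150 ms since the last batch, whichever comes first
+  },
+});
+```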
+ ## Available models | Model Family | Sizes | Quantized | diff --git a/docs/versioned_docs/version-0.5.x/03-typescript-api/01-natural-language-processing/LLMModule.md b/docs/versioned_docs/version-0.5.x/03-typescript-api/01-natural-language-processing/LLMModule.md index 14d3b4d748..0a21978498 100644 --- a/docs/versioned_docs/version-0.5.x/03-typescript-api/01-natural-language-processing/LLMModule.md +++ b/docs/versioned_docs/version-0.5.x/03-typescript-api/01-natural-language-processing/LLMModule.md @@ -30,18 +30,19 @@ llm.delete(); ### Methods -| Method | Type | Description | -| ------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `constructor` | `({tokenCallback?: (token: string) => void, responseCallback?: (response: string) => void, messageHistoryCallback?: (messageHistory: Message[]) => void})` | Creates a new instance of LLMModule with optional callbacks. | -| `load` | `(model: { modelSource: ResourceSource; tokenizerSource: ResourceSource; tokenizerConfigSource: ResourceSource }, onDownloadProgressCallback?: (progress: number) => void): Promise` | Loads the model. | -| `setTokenCallback` | `{tokenCallback: (token: string) => void}) => void` | Sets new token callback. | -| `generate` | `(messages: Message[], tools?: LLMTool[]) => Promise` | Runs model to complete chat passed in `messages` argument. It doesn't manage conversation context. | -| `forward` | `(input: string) => Promise` | Runs model inference with raw input string. You need to provide entire conversation and prompt (in correct format and with special tokens!) in input string to this method. It doesn't manage conversation context. It is intended for users that need access to the model itself without any wrapper. If you want a simple chat with model the consider using`sendMessage` | -| `configure` | `({chatConfig?: Partial, toolsConfig?: ToolsConfig}) => void` | Configures chat and tool calling. See more details in [configuring the model](#configuring-the-model). | -| `sendMessage` | `(message: string) => Promise` | Method to add user message to conversation. After model responds it will call `messageHistoryCallback()`containing both user message and model response. It also returns them. | -| `deleteMessage` | `(index: number) => void` | Deletes all messages starting with message on `index` position. After deletion it will call `messageHistoryCallback()` containing new history. It also returns it. | -| `delete` | `() => void` | Method to delete the model from memory. Note you cannot delete model while it's generating. You need to interrupt it first and make sure model stopped generation. | -| `interrupt` | `() => void` | Interrupts model generation. It may return one more token after interrupt. 
 |
+| Method | Type | Description |
+| ------ | ---- | ----------- |
+| `constructor` | `({tokenCallback?: (token: string) => void, responseCallback?: (response: string) => void, messageHistoryCallback?: (messageHistory: Message[]) => void})` | Creates a new instance of LLMModule with optional callbacks. |
+| `load` | `(model: { modelSource: ResourceSource; tokenizerSource: ResourceSource; tokenizerConfigSource: ResourceSource }, onDownloadProgressCallback?: (progress: number) => void): Promise` | Loads the model. |
+| `setTokenCallback` | `({tokenCallback: (token: string) => void}) => void` | Sets a new token callback. |
+| `generate` | `(messages: Message[], tools?: LLMTool[]) => Promise` | Runs the model to complete the chat passed in the `messages` argument. It doesn't manage conversation context. |
+| `forward` | `(input: string) => Promise` | Runs model inference with a raw input string. You need to provide the entire conversation and prompt (in the correct format and with special tokens!) in the input string to this method. It doesn't manage conversation context. It is intended for users that need access to the model itself without any wrapper. If you want a simple chat with the model, consider using `sendMessage`. |
+| `configure` | `({chatConfig?: Partial, toolsConfig?: ToolsConfig, generationConfig?: GenerationConfig}) => void` | Configures chat, tool calling and generation settings. See more details in [configuring the model](#configuring-the-model). |
+| `sendMessage` | `(message: string) => Promise` | Method to add a user message to the conversation. After the model responds, it will call `messageHistoryCallback()` containing both the user message and the model response. It also returns them. |
+| `deleteMessage` | `(index: number) => void` | Deletes all messages starting from the message at position `index`. After deletion it will call `messageHistoryCallback()` with the new history. It also returns it. |
+| `delete` | `() => void` | Method to delete the model from memory. Note that you cannot delete the model while it's generating. You need to interrupt it first and make sure the model has stopped generating. |
+| `interrupt` | `() => void` | Interrupts model generation. It may return one more token after the interrupt. |
+| `getGeneratedTokenCount` | `() => number` | Returns the number of tokens generated in the last response. |
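+
+For illustration, a rough end-to-end sketch that extends the usage snippet above with the new options (`modelSource`, `tokenizerSource` and `tokenizerConfigSource` are placeholders for your `ResourceSource` values; the batching numbers are arbitrary examples):
+
+```typescript
+const llm = new LLMModule({
+  responseCallback: (response) => console.log(response),
+});
+
+await llm.load({ modelSource, tokenizerSource, tokenizerConfigSource });
+
+// Optional: emit tokens in batches (the defaults are 10 tokens / 80 ms).
+llm.configure({
+  generationConfig: { outputTokenBatchSize: 20, batchTimeInterval: 150 },
+});
+
+await llm.sendMessage('Hello!');
+console.log(`Generated ${llm.getGeneratedTokenCount()} tokens in the last response.`);
+```
+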
 Type definitions
@@ -61,6 +62,11 @@ interface ChatConfig {
   systemPrompt: string;
 }
 
+interface GenerationConfig {
+  outputTokenBatchSize: number;
+  batchTimeInterval: number;
+}
+
 // tool calling
 interface ToolsConfig {
   tools: LLMTool[];
@@ -124,6 +130,10 @@ To subscribe to the token generation event, you can pass `tokenCallback` or `mes
 
 In order to interrupt the model, you can use the `interrupt` method.
 
+## Token Batching
+
+Depending on the selected model and the user's device, generation speed can exceed 60 tokens per second. If the `tokenCallback` triggers rerenders and is invoked on every single token, it can significantly degrade the app's performance. To alleviate this, we've implemented token batching. To configure it, call the `configure` method and pass a `generationConfig` with two parameters: `outputTokenBatchSize` and `batchTimeInterval`. They set the number of tokens collected before a batch is emitted and the maximum time interval between consecutive batches, respectively. A batch is emitted when either `batchTimeInterval` elapses since the last batch or `outputTokenBatchSize` tokens have been generated, whichever comes first. This keeps updates smooth even if the model briefly stalls during generation. The defaults are 10 tokens and 80 ms (roughly 12 batches per second).
+
 ## Configuring the model
 
 To configure model (i.e. change system prompt, load initial conversation history or manage tool calling) you can use
@@ -145,6 +155,12 @@ To configure model (i.e. change system prompt, load initial conversation history
 
 - **`displayToolCalls`** - If set to true, JSON tool calls will be displayed in chat. If false, only answers will be displayed.
 
+**`generationConfig`** - Object configuring generation settings; currently this covers only output token batching.
+
+- **`outputTokenBatchSize`** - Soft upper limit on the number of tokens in each emitted batch (in some cases a batch can contain more tokens, e.g. when it would otherwise end with a special emoji join character).
+
+- **`batchTimeInterval`** - Upper limit on the time interval between consecutive token batches, in milliseconds.
+
 ## Deleting the model from memory
 
 To delete the model from memory, you can use the `delete` method.
diff --git a/packages/react-native-executorch/src/modules/natural_language_processing/LLMModule.ts b/packages/react-native-executorch/src/modules/natural_language_processing/LLMModule.ts
index 778e4bf524..d4b46f4cd6 100644
--- a/packages/react-native-executorch/src/modules/natural_language_processing/LLMModule.ts
+++ b/packages/react-native-executorch/src/modules/natural_language_processing/LLMModule.ts
@@ -85,6 +85,10 @@ export class LLMModule {
     this.controller.interrupt();
   }
 
+  getGeneratedTokenCount() {
+    return this.controller.getGeneratedTokenCount();
+  }
+
   delete() {
     this.controller.delete();
   }
diff --git a/packages/react-native-executorch/src/types/llm.ts b/packages/react-native-executorch/src/types/llm.ts
index 2d0a60c981..043fe966bf 100644
--- a/packages/react-native-executorch/src/types/llm.ts
+++ b/packages/react-native-executorch/src/types/llm.ts
@@ -6,7 +6,6 @@ export interface LLMType {
   isGenerating: boolean;
   downloadProgress: number;
   error: string | null;
-  getGeneratedTokenCount: () => number;
   configure: ({
     chatConfig,
     toolsConfig,
@@ -16,6 +15,7 @@
     toolsConfig?: ToolsConfig;
     generationConfig?: GenerationConfig;
   }) => void;
+  getGeneratedTokenCount: () => number;
   generate: (messages: Message[], tools?: LLMTool[]) => Promise;
   sendMessage: (message: string) => Promise;
   deleteMessage: (index: number) => void;