|
| 1 | +import * as FileSystem from 'expo-file-system'; |
| 2 | +import { ResourceSource } from '../types/common'; |
| 3 | +import { fetchResource } from '../utils/fetchResource'; |
1 | 4 | import { unicodeToBytes } from '../utils/tokenizerUtils'; |
2 | 5 |
|
/**
 * TokenDecoder class responsible for decoding token IDs into tokens and converting tokens into text.
 */
export class TokenDecoder {
  // Token-id -> token-string mapping; assumed to be a plain JSON object (see constructor FIXME).
  private vocab: any;
  // Char -> byte lookup table produced by unicodeToBytes(); built once in the constructor.
  // NOTE(review): declared without a type annotation — implicit `any` under strict mode; consider annotating.
  private unicodeToBytes;
  // UTF-8 TextDecoder constructed with { fatal: false }, so malformed byte sequences do not throw.
  // NOTE(review): also implicitly `any`-typed — consider `private textDecoder: TextDecoder;`.
  private textDecoder;
|
8 | | - constructor(vocab: any) { |
9 | | - this.vocab = vocab; |
10 | | - this.charDecoder = unicodeToBytes(); |
| 14 | + /** |
| 15 | + * Creates an instance of TokenDecoder. |
| 16 | + * @param {any} vocab - A mapping of token IDs to their corresponding string tokens. |
| 17 | + */ |
| 18 | + constructor(vocabSource: ResourceSource) { |
| 19 | + this.vocab = vocabSource; // FIXME: for now I assume that it's just a JSON from require() |
| 20 | + this.unicodeToBytes = unicodeToBytes(); |
11 | 21 | this.textDecoder = new TextDecoder('utf-8', { fatal: false }); |
12 | 22 | } |
13 | 23 |
|
14 | | - public tokenIdsToTokens(tokenIds: number[]) { |
| 24 | + /** |
| 25 | + * Fetches the vocabulary of the tokenizer which can later be used for mapping tokenIds to tokens. |
| 26 | + * @param {ResourceSource} source - URL to the tokenizer vocab to fetch |
| 27 | + * @returns {Promise<{ [key: number]: string }>} - A mapping of with tokenId as key and token as value. |
| 28 | + */ |
| 29 | + public async fetchVocab( |
| 30 | + source: ResourceSource |
| 31 | + ): Promise<{ [key: number]: string }> { |
| 32 | + let tokenzerUri = await fetchResource(source); |
| 33 | + return JSON.parse(await FileSystem.readAsStringAsync(tokenzerUri)); |
| 34 | + } |
| 35 | + |
| 36 | + /** |
| 37 | + * Converts an array of token IDs into their corresponding token strings. |
| 38 | + * @param {number[]} tokenIds - An array of token IDs. |
| 39 | + * @returns {string[]} An array of token strings. |
| 40 | + */ |
| 41 | + public tokenIdsToTokens(tokenIds: number[]): string[] { |
15 | 42 | return tokenIds.map((token) => this.vocab[token]); |
16 | 43 | } |
17 | 44 |
|
18 | | - public tokenIdtoToken(tokenId: number) { |
| 45 | + /** |
| 46 | + * Converts a single token ID into its corresponding token string. |
| 47 | + * @param {number} tokenId - A single token ID. |
| 48 | + * @returns {string} The corresponding token string. |
| 49 | + */ |
| 50 | + public tokenIdtoToken(tokenId: number): string { |
19 | 51 | return this.vocab[tokenId]; |
20 | 52 | } |
21 | 53 |
|
22 | | - public tokensToDecodedText(tokens: string[]) { |
| 54 | + /** |
| 55 | + * Decodes an array of tokens into a readable text string. |
| 56 | + * @param {string[]} tokens - An array of token strings. |
| 57 | + * @returns {string} The decoded text. |
| 58 | + */ |
| 59 | + public tokensToDecodedText(tokens: string[]): string { |
23 | 60 | const stringifiedTokens = tokens.join(''); |
24 | 61 | const byteArray = Array.from(stringifiedTokens).map( |
25 | | - (char) => this.charDecoder[char] |
| 62 | + (char) => this.unicodeToBytes[char] |
26 | 63 | ); |
27 | 64 | const text = this.textDecoder.decode( |
28 | 65 | new Uint8Array(byteArray as number[]), |
|
0 commit comments