Skip to content

Commit 3311de8

Browse files
committed
wip: tokenDecoder
1 parent 823d553 commit 3311de8

1 file changed

Lines changed: 45 additions & 8 deletions

File tree

src/tokenizers/tokenDecoder.ts

Lines changed: 45 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,28 +1,65 @@
1+
import * as FileSystem from 'expo-file-system';
2+
import { ResourceSource } from '../types/common';
3+
import { fetchResource } from '../utils/fetchResource';
14
import { unicodeToBytes } from '../utils/tokenizerUtils';
25

6+
/**
7+
* TokenDecoder class responsible for decoding token IDs into tokens and converting tokens into text.
8+
*/
39
export class TokenDecoder {
410
private vocab: any;
5-
private charDecoder;
11+
private unicodeToBytes;
612
private textDecoder;
713

8-
constructor(vocab: any) {
9-
this.vocab = vocab;
10-
this.charDecoder = unicodeToBytes();
14+
/**
15+
* Creates an instance of TokenDecoder.
16+
* @param {ResourceSource} vocabSource - Source of the vocabulary; for now assumed to be an already-loaded JSON mapping of token IDs to their corresponding string tokens.
17+
*/
18+
constructor(vocabSource: ResourceSource) {
19+
this.vocab = vocabSource; // FIXME: for now I assume that it's just a JSON from require()
20+
this.unicodeToBytes = unicodeToBytes();
1121
this.textDecoder = new TextDecoder('utf-8', { fatal: false });
1222
}
1323

14-
public tokenIdsToTokens(tokenIds: number[]) {
24+
/**
25+
* Fetches the vocabulary of the tokenizer which can later be used for mapping tokenIds to tokens.
26+
* @param {ResourceSource} source - URL to the tokenizer vocab to fetch
27+
* @returns {Promise<{ [key: number]: string }>} - A mapping with tokenId as key and token as value.
28+
*/
29+
public async fetchVocab(
30+
source: ResourceSource
31+
): Promise<{ [key: number]: string }> {
32+
let tokenzerUri = await fetchResource(source);
33+
return JSON.parse(await FileSystem.readAsStringAsync(tokenzerUri));
34+
}
35+
36+
/**
37+
* Converts an array of token IDs into their corresponding token strings.
38+
* @param {number[]} tokenIds - An array of token IDs.
39+
* @returns {string[]} An array of token strings.
40+
*/
41+
public tokenIdsToTokens(tokenIds: number[]): string[] {
1542
return tokenIds.map((token) => this.vocab[token]);
1643
}
1744

18-
public tokenIdtoToken(tokenId: number) {
45+
/**
46+
* Converts a single token ID into its corresponding token string.
47+
* @param {number} tokenId - A single token ID.
48+
* @returns {string} The corresponding token string.
49+
*/
50+
public tokenIdtoToken(tokenId: number): string {
1951
return this.vocab[tokenId];
2052
}
2153

22-
public tokensToDecodedText(tokens: string[]) {
54+
/**
55+
* Decodes an array of tokens into a readable text string.
56+
* @param {string[]} tokens - An array of token strings.
57+
* @returns {string} The decoded text.
58+
*/
59+
public tokensToDecodedText(tokens: string[]): string {
2360
const stringifiedTokens = tokens.join('');
2461
const byteArray = Array.from(stringifiedTokens).map(
25-
(char) => this.charDecoder[char]
62+
(char) => this.unicodeToBytes[char]
2663
);
2764
const text = this.textDecoder.decode(
2865
new Uint8Array(byteArray as number[]),

0 commit comments

Comments
 (0)