Skip to content

Commit b8f9f16

Browse files
chore: developing the package
1 parent c19eb62 commit b8f9f16

6 files changed

Lines changed: 124 additions & 6 deletions

File tree

embedder/index.js

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
import { mkdir, readFile, writeFile } from 'node:fs/promises'
2+
import { dirname } from 'node:path'
3+
4+
// Template tag applied to the generated-module template further down.
// String.raw returns the template text with substitutions filled in and
// backslash sequences left unprocessed.
const js = String.raw
5+
6+
// Input files read by this script (relative paths).
const modelPath = './models/quantized_models/eng/model.int4.onnx'
7+
const modelDataPath = './models/quantized_models/eng/model.int4.onnx.data'
8+
const tokenizerModelPath = './models/quantized_models/eng/tokenizer.model'
9+
10+
// Destination of the generated TypeScript module.
const outPath = './src/Model/class.ts'
11+
12+
/**
 * Renders a byte sequence as JavaScript source text for a `Uint8Array`
 * literal, e.g. the bytes 1, 2, 255 become `new Uint8Array([1,2,255])`.
 *
 * NOTE(review): the result is a single string of roughly 2-4 characters per
 * input byte; very large inputs may hit the engine's maximum string length —
 * confirm against the real model sizes.
 *
 * @param {Iterable<number> | ArrayLike<number>} bytes Byte values to embed.
 * @returns {string} Source expression that recreates the bytes when evaluated.
 */
function toUint8ArraySource(bytes) {
  // A Uint8Array (Node's Buffer is a subclass) can be joined directly, which
  // avoids building an intermediate plain array with one element per byte.
  const list = bytes instanceof Uint8Array ? bytes.join(',') : Array.from(bytes).join(',')
  return `new Uint8Array([${list}])`
}
15+
16+
// Read the three input files concurrently. No encoding is passed to readFile,
// so each result is the raw file content rather than decoded text.
// Top-level await: this script has to run as an ES module.
const [model, modelData, tokenizerModel] = await Promise.all([
17+
readFile(modelPath),
18+
readFile(modelDataPath),
19+
readFile(tokenizerModelPath),
20+
])
21+
22+
// Source text of the generated TypeScript module that is written to outPath.
// The three file payloads are inlined as `new Uint8Array([...])` expressions
// via toUint8ArraySource, and trimStart() removes the leading whitespace that
// follows the opening backtick.
// NOTE(review): the generated createTokenProcessor passes a Uint8Array
// expression to tokenProcessor.load(), while the ambient declaration for
// '@agnai/sentencepiece-js' in src/.types/index.d.ts types it as
// load(url: string) — confirm which one matches the library.
const ts = js`
23+
import * as ort from 'onnxruntime-web'
24+
import { SentencePieceProcessor } from '@agnai/sentencepiece-js'
25+
26+
export async function createInferenceSession(): Promise<ort.InferenceSession> {
27+
return ort.InferenceSession.create(${toUint8ArraySource(model)}, {
28+
externalData: [
29+
{
30+
path: 'model.int4.onnx.data',
31+
data: ${toUint8ArraySource(modelData)},
32+
},
33+
],
34+
})
35+
}
36+
37+
export async function createTokenProcessor():Promise<SentencePieceProcessor> {
38+
const tokenProcessor = new SentencePieceProcessor()
39+
await tokenProcessor.load(${toUint8ArraySource(tokenizerModel)})
40+
return tokenProcessor
41+
}
42+
`.trimStart()
43+
44+
// Create the output directory (and any missing parents) before writing the
// generated module text as UTF-8.
await mkdir(dirname(outPath), { recursive: true })
45+
await writeFile(outPath, ts, 'utf8')

src/.types/index.d.ts

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
/** Ambient type declarations for the '@agnai/sentencepiece-js' package. */
declare module '@agnai/sentencepiece-js' {
2+
/**
 * Status object returned by the binding's `Load` and `LoadVocabulary`.
 * Only `delete()` is declared — presumably releases the underlying native
 * object; confirm against the library.
 */
export interface SentencePieceStatus {
3+
delete(): void
4+
}
5+
6+
/** Indexable container type used for the binding's encode results and decode input. */
export interface SentencePieceVector<T> {
7+
size(): number
8+
get(index: number): T
9+
delete(): void
10+
}
11+
12+
/** Handle obtained from `SentencePieceStringView.getView()`; the binding methods take this type for text arguments. */
export interface SentencePieceStringViewHandle {
13+
delete(): void
14+
}
15+
16+
/** Wrapper created from a string via `SentencePieceModule.StringView`. */
export interface SentencePieceStringView {
17+
getView(): SentencePieceStringViewHandle
18+
delete(): void
19+
}
20+
21+
/** Low-level processor interface; text arguments are passed as string-view handles. */
export interface SentencePieceProcessorBinding {
22+
Load(model: SentencePieceStringViewHandle): SentencePieceStatus
23+
EncodeAsIds(text: SentencePieceStringViewHandle): SentencePieceVector<number>
24+
EncodeAsPieces(text: SentencePieceStringViewHandle): SentencePieceVector<string>
25+
DecodeIds(ids: SentencePieceVector<number>): string
26+
LoadVocabulary(vocab: SentencePieceStringViewHandle, threshold: number): SentencePieceStatus
27+
}
28+
29+
/** Shape of the module object stored on `SentencePieceProcessor.sentencepiece`. */
export interface SentencePieceModule {
30+
FS: {
31+
writeFile(path: string, data: string | Uint8Array | ArrayBufferLike): void
32+
}
33+
StringView: new (value: string) => SentencePieceStringView
34+
SentencePieceProcessor: new () => SentencePieceProcessorBinding
35+
vecFromJSArray(values: readonly number[]): SentencePieceVector<number>
36+
}
37+
38+
/**
 * High-level processor class exported by the package.
 * `sentencepiece` and `processor` are optional — presumably unset until
 * `load()` has completed; confirm against the library.
 */
export class SentencePieceProcessor {
39+
sentencepiece?: SentencePieceModule
40+
processor?: SentencePieceProcessorBinding
41+
42+
constructor()
43+
/** Loads a model; the argument is declared as a string and the call is asynchronous. */
load(url: string): Promise<void>
44+
encodeIds(text: string): number[]
45+
encodePieces(text: string): string[]
46+
decodeIds(ids: readonly number[]): string
47+
/** Declared as synchronous (returns `void`), unlike `load`. */
loadVocabulary(url: string): void
48+
}
49+
50+
export function cleanText(text: string): string
51+
52+
/** Type of the default export: an object with the same two members as the named exports. */
const defaultExport: {
53+
SentencePieceProcessor: typeof SentencePieceProcessor
54+
cleanText: typeof cleanText
55+
}
56+
57+
export default defaultExport
58+
}

src/.types/index.ts

Whitespace-only changes.

src/Model/class.ts

Lines changed: 19 additions & 0 deletions
Large diffs are not rendered by default.

src/index.ts

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1 @@
1-
import { createWorker } from 'tesseract.js'
2-
3-
const ocrWorker = await createWorker(['fin', 'swe', 'eng'])
4-
5-
ocrWorker.readText()
1+
export { createInferenceSession, createTokenProcessor } from './Model/class.js'

tsconfig.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,5 +12,5 @@
1212
"skipLibCheck": true,
1313
"lib": ["es2024", "dom", "dom.iterable", "webworker"]
1414
},
15-
"include": ["src", "node_modules/scribe.js-ocr/js/global.d.ts"]
15+
"include": ["src", "src/.types/index.d.ts"]
1616
}

0 commit comments

Comments
 (0)