Skip to content

Commit 346f74d

Browse files
feat: add support for hugging face tokenizers, add executorch and tokenizers as submodules (#144)
## Description This PR introduces support for Hugging Face tokenizers by using a C++ bridge that connects the native code with the Hugging Face tokenizers library. It also adds git submodules that point to our executorch and tokenizers-cpp repositories, which are hosted in the Software Mansion Labs organization. ### Type of change - [ ] Bug fix (non-breaking change which fixes an issue) - [x] New feature (non-breaking change which adds functionality) - [ ] Breaking change (fix or feature that would cause existing functionality to not work as expected) - [ ] Documentation update (improves or adds clarity to existing documentation) ### Tested on - [x] iOS - [x] Android ### Testing instructions <!-- Provide step-by-step instructions on how to test your changes. Include setup details if necessary. --> ### Screenshots <!-- Add screenshots here, if applicable --> ### Related issues <!-- Link related issues here using #issue-number --> ### Checklist - [ ] I have performed a self-review of my code - [ ] I have commented my code, particularly in hard-to-understand areas - [ ] I have updated the documentation accordingly - [ ] My changes generate no new warnings ### Additional notes <!-- Include any additional information, assumptions, or context that reviewers might need to understand this PR. -->
1 parent 7ec9f07 commit 346f74d

File tree

23 files changed

+1705
-52
lines changed

23 files changed

+1705
-52
lines changed

.gitmodules

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
[submodule "executorch"]
2+
path = third-party/executorch
3+
url = https://github.com/software-mansion-labs/executorch
4+
[submodule "tokenizers-cpp"]
5+
path = third-party/tokenizers-cpp
6+
url = https://github.com/software-mansion-labs/tokenizers-cpp

android/build.gradle

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@ dependencies {
103103
implementation "com.facebook.react:react-android:+"
104104
implementation 'org.opencv:opencv:4.10.0'
105105
implementation "org.jetbrains.kotlin:kotlin-stdlib:$kotlin_version"
106-
implementation 'com.github.software-mansion:react-native-executorch:main-SNAPSHOT'
106+
implementation(files("libs/executorch.aar"))
107107
implementation 'org.opencv:opencv:4.10.0'
108108
implementation("com.squareup.okhttp3:okhttp:4.9.2")
109109
}

android/libs/executorch.aar

58 MB
Binary file not shown.

android/src/main/java/com/swmansion/rnexecutorch/RnExecutorchPackage.kt

Lines changed: 21 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,8 @@ class RnExecutorchPackage : TurboReactPackage() {
3232
VerticalOCR(reactContext)
3333
} else if (name == ImageSegmentation.NAME) {
3434
ImageSegmentation(reactContext)
35+
} else if (name == Tokenizer.NAME) {
36+
Tokenizer(reactContext)
3537
} else {
3638
null
3739
}
@@ -118,12 +120,25 @@ class RnExecutorchPackage : TurboReactPackage() {
118120
true,
119121
)
120122

121-
moduleInfos[ImageSegmentation.NAME] = ReactModuleInfo(
122-
ImageSegmentation.NAME, ImageSegmentation.NAME, false, // canOverrideExistingModule
123-
false, // needsEagerInit
124-
false, // isCxxModule
125-
true
126-
)
123+
moduleInfos[ImageSegmentation.NAME] =
124+
ReactModuleInfo(
125+
ImageSegmentation.NAME,
126+
ImageSegmentation.NAME,
127+
false, // canOverrideExistingModule
128+
false, // needsEagerInit
129+
false, // isCxxModule
130+
true,
131+
)
132+
133+
moduleInfos[Tokenizer.NAME] =
134+
ReactModuleInfo(
135+
Tokenizer.NAME,
136+
Tokenizer.NAME,
137+
false, // canOverrideExistingModule
138+
false, // needsEagerInit
139+
false, // isCxxModule
140+
true,
141+
)
127142
moduleInfos
128143
}
129144
}
Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
package com.swmansion.rnexecutorch
2+
3+
import com.facebook.react.bridge.Promise
4+
import com.facebook.react.bridge.ReactApplicationContext
5+
import com.facebook.react.bridge.ReadableArray
6+
import com.swmansion.rnexecutorch.utils.ArrayUtils.Companion.createIntArray
7+
import com.swmansion.rnexecutorch.utils.ArrayUtils.Companion.createReadableArrayFromIntArray
8+
import com.swmansion.rnexecutorch.utils.ETError
9+
import org.pytorch.executorch.HuggingFaceTokenizer
10+
import java.net.URL
11+
12+
/**
 * React Native module that exposes a [HuggingFaceTokenizer] to JavaScript.
 *
 * Every method reports its outcome through the supplied [Promise]. On failure the promise is
 * rejected with the exception text as the first argument and the string form of an [ETError]
 * value as the second.
 *
 * NOTE(review): React Native's `Promise.reject(String, String)` treats the first argument as the
 * error code and the second as the message, so the two look swapped here. The order is kept
 * as-is because existing JavaScript callers may depend on it — confirm before changing.
 */
class Tokenizer(
    reactContext: ReactApplicationContext,
) : NativeTokenizerSpec(reactContext) {
    // Assigned by load(). Reading it earlier throws, and settle() turns that into a rejection.
    private lateinit var tokenizer: HuggingFaceTokenizer

    /**
     * Creates the tokenizer from [tokenizerSource] and resolves [promise] with `0`.
     *
     * [tokenizerSource] is parsed as a [URL] and only its path component is handed to
     * [HuggingFaceTokenizer]. Any exception rejects with [ETError.InvalidModelSource].
     */
    override fun load(
        tokenizerSource: String,
        promise: Promise,
    ) {
        settle(promise, ETError.InvalidModelSource.toString()) {
            tokenizer = HuggingFaceTokenizer(URL(tokenizerSource).path)
            0
        }
    }

    /** Resolves [promise] with the result of decoding the integer array [input]. */
    override fun decode(
        input: ReadableArray,
        promise: Promise,
    ) {
        settle(promise, ETError.UndefinedError.toString()) {
            tokenizer.decode(createIntArray(input))
        }
    }

    /** Resolves [promise] with the result of encoding [input], converted to a readable array. */
    override fun encode(
        input: String,
        promise: Promise,
    ) {
        settle(promise, ETError.UndefinedError.toString()) {
            createReadableArrayFromIntArray(tokenizer.encode(input))
        }
    }

    /** Resolves [promise] with the tokenizer's `vocabSize`. */
    override fun getVocabSize(promise: Promise) {
        settle(promise, ETError.UndefinedError.toString()) {
            tokenizer.vocabSize
        }
    }

    /**
     * Resolves [promise] with the tokenizer's `idToToken` result for [id].
     *
     * [id] arrives as a [Double] and is converted with [Double.toInt], which drops any
     * fractional part.
     */
    override fun idToToken(
        id: Double,
        promise: Promise,
    ) {
        settle(promise, ETError.UndefinedError.toString()) {
            tokenizer.idToToken(id.toInt())
        }
    }

    /** Resolves [promise] with the tokenizer's `tokenToId` result for [token]. */
    override fun tokenToId(
        token: String,
        promise: Promise,
    ) {
        settle(promise, ETError.UndefinedError.toString()) {
            tokenizer.tokenToId(token)
        }
    }

    /** Returns the name under which this module is registered. */
    override fun getName(): String = NAME

    /**
     * Runs [action] and resolves [promise] with its result; on any [Exception] rejects instead.
     *
     * `Throwable.message` is nullable, so the exception's `toString()` is used as a fallback.
     * This guarantees the promise is always settled instead of a `NullPointerException`
     * escaping from the catch block.
     */
    private inline fun settle(
        promise: Promise,
        errorCode: String,
        action: () -> Any?,
    ) {
        try {
            promise.resolve(action())
        } catch (e: Exception) {
            promise.reject(e.message ?: e.toString(), errorCode)
        }
    }

    companion object {
        const val NAME = "Tokenizer"
    }
}
Binary file not shown.
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
#import <Foundation/Foundation.h>
2+
3+
// Public Objective-C interface of HuggingFaceTokenizer, an NSObject subclass.
// NOTE(review): only the declarations are visible here; the per-method notes below are
// inferred from the selector names and types and should be confirmed against the implementation.
@interface HuggingFaceTokenizer : NSObject
4+
5+
// Initializer taking a tokenizer path string.
- (instancetype)initWithTokenizerPath:(NSString *)tokenizerPath;
6+
// Takes a string and returns an array of NSNumber values (presumably token ids — confirm).
- (NSArray<NSNumber *> *)encode:(NSString *)text;
7+
// Takes an array of NSNumber values and returns a string (presumably the decoded text — confirm).
- (NSString *)decode:(NSArray<NSNumber *> *)tokenIds;
8+
// Returns an unsigned integer (presumably the vocabulary size — confirm).
- (NSUInteger)getVocabSize;
9+
// Takes an integer token id and returns a string.
- (NSString *)idToToken:(NSInteger)tokenId;
10+
// Takes a token string and returns an integer.
- (NSInteger)tokenToId:(NSString *)token;
11+
12+
@end
Binary file not shown.

ios/ExecutorchLib.xcframework/ios-arm64-simulator/ExecutorchLib.framework/_CodeSignature/CodeResources

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,15 +6,19 @@
66
<dict>
77
<key>Headers/ETModel.h</key>
88
<data>
9-
CFAz750OjepOG7MVBPABGfKHNeI=
9+
uR77dUqxDWLwIE0h9dvXHSjMFWo=
10+
</data>
11+
<key>Headers/HuggingFaceTokenizer.h</key>
12+
<data>
13+
cVZsliuTmV2umgK601d6PamTRSw=
1014
</data>
1115
<key>Headers/LLaMARunner.h</key>
1216
<data>
13-
SU8Fo2gR+gVVl9IplHgBJBRh1gQ=
17+
7OcOQPzHkK7faqsbtXBztEr9VO0=
1418
</data>
1519
<key>Info.plist</key>
1620
<data>
17-
l3rE2nBARVh++WIyHCfeHXD6Ewo=
21+
wBJJ2TZXoQXEBBiUjBovczXIsbU=
1822
</data>
1923
</dict>
2024
<key>files2</key>
@@ -23,14 +27,21 @@
2327
<dict>
2428
<key>hash2</key>
2529
<data>
26-
UXFd6a5OARqV4JnB+Jm4uqmt15aUmnXSOLPQKZTWZCc=
30+
+Ty+KzH7+xNA5B9kfmD44hgTJKIZuk2qN1kauF6thOw=
31+
</data>
32+
</dict>
33+
<key>Headers/HuggingFaceTokenizer.h</key>
34+
<dict>
35+
<key>hash2</key>
36+
<data>
37+
0ETM5qw12+W8ULx2zP2UkVomFrBRnwAr8I7po2ACk/k=
2738
</data>
2839
</dict>
2940
<key>Headers/LLaMARunner.h</key>
3041
<dict>
3142
<key>hash2</key>
3243
<data>
33-
or8gFkCO2QVkQgaeFAaqs/WqGjv8kABL8Rzcdcuexw0=
44+
AZECdFqfkT4YWu8Nqga5ALaJUme8IOAnhOaAUM+iOvU=
3445
</data>
3546
</dict>
3647
</dict>
Binary file not shown.

0 commit comments

Comments
 (0)