Skip to content

Commit 6a10b1d

Browse files
Merge pull request #784 from SKaiNET-developers/feature/data-source-huggingface
Feature/data source huggingface
2 parents 83b06d0 + c6c9be3 commit 6a10b1d

28 files changed

Lines changed: 1766 additions & 262 deletions

File tree

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,7 @@ Runnable examples:
223223
### Data and I/O
224224

225225
- Built-in loaders: MNIST, Fashion-MNIST, CIFAR-10
226+
- URI-backed data sources: `file://`, `https://`, `hf+https://`, and `hf://...`
226227
- Formats: GGUF, ONNX, SafeTensors, JSON, Image (JPEG, PNG)
227228
- Type-safe transform DSL: resize, crop, normalize, toTensor
228229

build.gradle.kts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,7 @@ dependencies {
151151

152152
// skainet-data
153153
dokka(project(":skainet-data:skainet-data-api"))
154+
dokka(project(":skainet-data:skainet-data-source"))
154155
dokka(project(":skainet-data:skainet-data-transform"))
155156
dokka(project(":skainet-data:skainet-data-simple"))
156157
dokka(project(":skainet-data:skainet-data-media"))
@@ -178,4 +179,4 @@ tasks.register<Copy>("bundleDokkaIntoSite") {
178179
dependsOn("dokkaGenerate")
179180
from(layout.buildDirectory.dir("dokka/html"))
180181
into(layout.projectDirectory.dir("docs/build/site/api"))
181-
}
182+
}

docs/modules/ROOT/nav.adoc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
* Tutorials
66
** xref:tutorials/kotlin-getting-started.adoc[Kotlin getting started]
77
** xref:tutorials/java-getting-started.adoc[Java getting started]
8+
** xref:tutorials/data-sources-getting-started.adoc[Data sources and Hugging Face]
89
** xref:tutorials/image-data-getting-started.adoc[Image and data API]
910
** xref:tutorials/hlo-getting-started.adoc[StableHLO getting started]
1011
** xref:tutorials/minerva-getting-started.adoc[Minerva getting started]
Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,153 @@
1+
== Data sources and Hugging Face
2+
3+
SKaiNET separates artifact resolution from dataset parsing and preprocessing.
4+
Use `skainet-data-source` when a dataset, tokenizer, model sidecar, or fixture
5+
can live either on disk or behind a remote URI.
6+
7+
[cols="1,3",options="header"]
8+
|===
9+
| URI form | Meaning
10+
| `file:///path/to/file`
11+
| Read a local file.
12+
13+
| `https://host/path/file`
14+
| Download and cache a generic remote artifact.
15+
16+
| `hf+https://huggingface.co/org/repo/resolve/main/file`
17+
| Treat a Hugging Face resolve URL as a Hugging Face artifact.
18+
19+
| `hf://org/repo@main/path/file`
20+
| Expand to a Hugging Face model repository resolve URL.
21+
22+
| `hf://datasets/org/repo@main/path/file`
23+
| Expand to a Hugging Face dataset repository resolve URL.
24+
|===
25+
26+
=== Add the modules
27+
28+
For JVM consumers, add the source module alongside the data loaders you use:
29+
30+
[source,kotlin]
31+
----
32+
dependencies {
33+
implementation(platform("sk.ainet:skainet-bom:0.32.4"))
34+
35+
implementation("sk.ainet.core:skainet-data-source-jvm")
36+
implementation("sk.ainet.core:skainet-data-simple-jvm")
37+
}
38+
----
39+
40+
=== Resolve one artifact
41+
42+
`JvmDataSourceResolver` materializes remote artifacts into a cache and returns
43+
a `DataSourceArtifact` that opens a `kotlinx.io.Source`. Public Hugging Face
44+
files do not need credentials. For private files, pass an explicit
45+
`DataSourceAuthToken` on the request or resolver. Existing `Authorization`
46+
headers still take precedence. On JVM, the resolver can also read `HF_TOKEN` /
47+
`HUGGING_FACE_HUB_TOKEN` from the environment as an opt-in convenience fallback.
48+
49+
[source,kotlin]
50+
----
51+
import sk.ainet.data.source.DataSourceAuthToken
52+
import sk.ainet.data.source.DataSourceRequest
53+
import sk.ainet.data.source.JvmDataSourceResolver
54+
55+
val resolver = JvmDataSourceResolver(
56+
huggingFaceToken = DataSourceAuthToken.from("hf_...")
57+
)
58+
val artifact = resolver.resolve(
59+
DataSourceRequest(
60+
uri = "hf+https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct/resolve/main/tokenizer.json"
61+
)
62+
)
63+
64+
println(artifact.filename)
65+
println(artifact.localPath)
66+
67+
val source = artifact.openSource()
68+
try {
69+
// Pass the source to a parser/loader for model-sized artifacts.
70+
} finally {
71+
source.close()
72+
}
73+
74+
// Convenience for small sidecars and tests.
75+
val bytes = artifact.readBytes()
76+
----
77+
78+
For per-request credentials, pass the token directly on `DataSourceRequest`.
79+
This is useful when one resolver works with more than one private repository:
80+
81+
[source,kotlin]
82+
----
83+
val privateArtifact = resolver.resolve(
84+
DataSourceRequest(
85+
uri = "hf://datasets/your-org/private-dataset@main/data/train.bin",
86+
huggingFaceToken = DataSourceAuthToken.from("hf_...")
87+
)
88+
)
89+
----
90+
91+
To opt into the JVM environment-variable fallback (`HF_TOKEN` / `HUGGING_FACE_HUB_TOKEN`):
92+
93+
[source,kotlin]
94+
----
95+
val resolver = JvmDataSourceResolver(
96+
useEnvironmentHuggingFaceToken = true
97+
)
98+
----
99+
100+
=== Use sources with built-in loaders
101+
102+
MNIST and Fashion-MNIST expose per-file URI overrides. CIFAR-10 exposes an
103+
archive URI override. Defaults still point to the historical public dataset
104+
locations, so existing code keeps working.
105+
106+
[source,kotlin]
107+
----
108+
import sk.ainet.data.mnist.MNIST
109+
import sk.ainet.data.mnist.MNISTLoaderConfig
110+
111+
val token = "hf_..."
112+
val train = MNIST.loadTrain(
113+
MNISTLoaderConfig(
114+
trainImagesUri = "file:///datasets/mnist/train-images-idx3-ubyte",
115+
trainLabelsUri = "hf+https://huggingface.co/your-org/mnist-idx/resolve/main/train-labels-idx1-ubyte.gz",
116+
huggingFaceTokenProvider = { token }
117+
)
118+
)
119+
120+
val batches = train.batchIterator<sk.ainet.lang.types.Int8, Byte>(batchSize = 64)
121+
----
122+
123+
=== Cache behavior
124+
125+
Use `CachePolicy.Use` for normal operation, `Refresh` to re-download,
126+
`Offline` to require a cached copy, and `Bypass` to avoid writing the cache.
127+
Built-in JVM loaders map `useCache = true` to `Use` and `useCache = false`
128+
to `Refresh`.
129+
130+
[source,kotlin]
131+
----
132+
import sk.ainet.data.source.CachePolicy
133+
import sk.ainet.data.source.DataSourceRequest
134+
135+
val refreshed = resolver.resolve(
136+
DataSourceRequest(
137+
uri = "hf://datasets/your-org/your-dataset@main/data/train-00000.parquet",
138+
cachePolicy = CachePolicy.Refresh
139+
)
140+
)
141+
----
142+
143+
=== Keep preprocessing separate
144+
145+
After bytes are parsed into a dataset, continue using the existing transform
146+
DSL for image/tensor preprocessing:
147+
148+
[source,kotlin]
149+
----
150+
import sk.ainet.data.transform.mnistPreprocessing
151+
152+
val preprocessing = mnistPreprocessing(ctx)
153+
----

settings.gradle.kts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ include("skainet-backends:benchmarks:jvm-cpu-publish")
4848

4949
// ====== DATA
5050
include("skainet-data:skainet-data-api")
51+
include("skainet-data:skainet-data-source")
5152
include("skainet-data:skainet-data-transform")
5253
include("skainet-data:skainet-data-simple")
5354
include("skainet-data:skainet-data-media")

skainet-data/skainet-data-simple/build.gradle.kts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ kotlin {
6262
}
6363

6464
jvmMain.dependencies {
65+
implementation(project(":skainet-data:skainet-data-source"))
6566
implementation(libs.ktor.client.cio)
6667
implementation(libs.ktor.client.plugins)
6768
implementation(libs.ktor.client.logging)

skainet-data/skainet-data-simple/src/commonMain/kotlin/sk/ainet/data/cifar10/CIFAR10Data.kt

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import sk.ainet.context.DefaultDataExecutionContext
66
import sk.ainet.context.ExecutionContext
77
import sk.ainet.data.DataBatch
88
import sk.ainet.data.Dataset
9+
import sk.ainet.data.common.DatasetHuggingFaceTokenProvider
910
import sk.ainet.lang.tensor.Shape
1011
import sk.ainet.lang.tensor.Tensor
1112
import sk.ainet.lang.types.DType
@@ -144,7 +145,10 @@ public data class CIFAR10Dataset(
144145
*/
145146
public data class CIFAR10LoaderConfig(
146147
val cacheDir: String = "cifar10-data",
147-
val useCache: Boolean = true
148+
val useCache: Boolean = true,
149+
val archiveUri: String = CIFAR10Constants.DOWNLOAD_URL,
150+
val huggingFaceTokenProvider: DatasetHuggingFaceTokenProvider? = null,
151+
val useEnvironmentHuggingFaceToken: Boolean = false
148152
)
149153

150154
/**
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
package sk.ainet.data.common
2+
3+
/**
4+
* Supplies a Hugging Face token for built-in dataset loaders when their source
5+
* URIs point at private Hugging Face artifacts.
*
* Declared as a `fun interface`, so it can be implemented with a lambda.
6+
*/
7+
public fun interface DatasetHuggingFaceTokenProvider {
8+
/**
* Returns the Hugging Face token as a string, or `null`.
*
* NOTE(review): `null` presumably means "no token available"; confirm
* against the loaders that consume this provider.
*/
public fun token(): String?
9+
}

skainet-data/skainet-data-simple/src/commonMain/kotlin/sk/ainet/data/fashionmnist/FashionMNISTData.kt

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import sk.ainet.context.DefaultDataExecutionContext
66
import sk.ainet.context.ExecutionContext
77
import sk.ainet.data.DataBatch
88
import sk.ainet.data.Dataset
9+
import sk.ainet.data.common.DatasetHuggingFaceTokenProvider
910
import sk.ainet.lang.tensor.Shape
1011
import sk.ainet.lang.tensor.Tensor
1112
import sk.ainet.lang.types.DType
@@ -146,7 +147,13 @@ public data class FashionMNISTDataset(
146147
*/
147148
public data class FashionMNISTLoaderConfig(
148149
val cacheDir: String = "fashion-mnist-data",
149-
val useCache: Boolean = true
150+
val useCache: Boolean = true,
151+
val trainImagesUri: String = FashionMNISTConstants.TRAIN_IMAGES_URL,
152+
val trainLabelsUri: String = FashionMNISTConstants.TRAIN_LABELS_URL,
153+
val testImagesUri: String = FashionMNISTConstants.TEST_IMAGES_URL,
154+
val testLabelsUri: String = FashionMNISTConstants.TEST_LABELS_URL,
155+
val huggingFaceTokenProvider: DatasetHuggingFaceTokenProvider? = null,
156+
val useEnvironmentHuggingFaceToken: Boolean = false
150157
)
151158

152159
/**

skainet-data/skainet-data-simple/src/commonMain/kotlin/sk/ainet/data/fashionmnist/FashionMNISTLoaderCommon.kt

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,11 @@ public abstract class FashionMNISTLoaderCommon(public val config: FashionMNISTLo
1616
*/
1717
override suspend fun loadTrainingData(): FashionMNISTDataset {
1818
val imagesBytes = downloadAndCacheFile(
19-
FashionMNISTConstants.TRAIN_IMAGES_URL,
19+
config.trainImagesUri,
2020
FashionMNISTConstants.TRAIN_IMAGES_FILENAME
2121
)
2222
val labelsBytes = downloadAndCacheFile(
23-
FashionMNISTConstants.TRAIN_LABELS_URL,
23+
config.trainLabelsUri,
2424
FashionMNISTConstants.TRAIN_LABELS_FILENAME
2525
)
2626

@@ -34,11 +34,11 @@ public abstract class FashionMNISTLoaderCommon(public val config: FashionMNISTLo
3434
*/
3535
override suspend fun loadTestData(): FashionMNISTDataset {
3636
val imagesBytes = downloadAndCacheFile(
37-
FashionMNISTConstants.TEST_IMAGES_URL,
37+
config.testImagesUri,
3838
FashionMNISTConstants.TEST_IMAGES_FILENAME
3939
)
4040
val labelsBytes = downloadAndCacheFile(
41-
FashionMNISTConstants.TEST_LABELS_URL,
41+
config.testLabelsUri,
4242
FashionMNISTConstants.TEST_LABELS_FILENAME
4343
)
4444

0 commit comments

Comments
 (0)