Skip to content

Commit 9342fc5

Browse files
authored
Merge pull request #52 from Paulanerus/dev
Dev
2 parents 8f70b7f + 06c919c commit 9342fc5

17 files changed

Lines changed: 629 additions & 63 deletions

File tree

api/src/main/kotlin/dev/paulee/api/data/IDataService.kt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,10 @@ interface IDataService : Closeable {
2828

2929
/**
 * Downloads the given embedding [model], using [path] as the download location.
 *
 * @param model the embedding model to download.
 * @param path file-system location handed to the download routine.
 * @param onProgress callback receiving an integer progress value.
 *   NOTE(review): the unit of the progress value (e.g. percent) is not visible here — confirm.
 */
suspend fun downloadModel(model: Embedding.Model, path: Path, onProgress: (progress: Int) -> Unit)
3030

31+
/**
 * Imports a data pool from the archive located at [path].
 *
 * @param path location of the pool archive to import.
 * @return `true` if the pool was imported, `false` otherwise.
 */
suspend fun importPool(path: Path): Boolean
32+
33+
/**
 * Exports the data pool described by [dataInfo] to [path].
 *
 * @param dataInfo description of the pool to export.
 * @param path export target; the `DataServiceImpl` implementation treats it as a directory and
 *   writes `<pool name>.zip` into it.
 * @return `true` if the export succeeded, `false` otherwise.
 */
suspend fun exportPool(dataInfo: DataInfo, path: Path): Boolean
34+
3135
fun getPage(
3236
query: String,
3337
isSemantic: Boolean,

api/src/main/kotlin/dev/paulee/api/internal/Embedding.kt

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,9 @@ object Embedding {
1515
"Google DeepMind",
1616
"300M",
1717
"https://huggingface.co/google/embeddinggemma-300m",
18-
ModelData()
18+
ModelData(
19+
modelData = "onnx/model.onnx_data"
20+
)
1921
),
2022
AncientGreekBert(
2123
"onnx-community/Ancient-Greek-BERT-ONNX",
@@ -25,7 +27,16 @@ object Embedding {
2527
"https://huggingface.co/pranaydeeps/Ancient-Greek-BERT",
2628
ModelData(
2729
maxLength = 512,
28-
modelData = ""
30+
)
31+
),
32+
GreekTransfer(
33+
"onnx-community/stsb-xlm-r-greek-transfer-ONNX",
34+
"model_management.greek_transfer.desc",
35+
"lighteternal",
36+
"270M",
37+
"https://huggingface.co/lighteternal/stsb-xlm-r-greek-transfer",
38+
ModelData(
39+
maxLength = 400,
2940
)
3041
)
3142
}
@@ -34,7 +45,7 @@ object Embedding {
3445
val dimension: Int = 768,
3546
val maxLength: Int = 2048,
3647
val model: String = "onnx/model.onnx",
37-
val modelData: String = "onnx/model.onnx_data",
48+
val modelData: String? = null,
3849
val tokenizer: String = "tokenizer.json",
3950
val tokenizerConfig: String = "tokenizer_config.json",
4051
)

core/src/main/kotlin/dev/paulee/core/data/DataServiceImpl.kt

Lines changed: 222 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -237,6 +237,46 @@ object DataServiceImpl : IDataService {
237237
val dataInfo = FileService.fromJson(runCatching { jsonFile.readText() }.getOrDefault(""))
238238
?: return@forEachDirectoryEntry
239239

240+
val dataPath = child.resolve("data")
241+
242+
if (dataPath.notExists()) {
243+
logger.warn("Data pool '${child.name}' has no data directory. Migrating files...")
244+
245+
runCatching { dataPath.createDirectories() }
246+
.onFailure {
247+
logger.error("Failed to create data directory for ${child.name}.", it)
248+
return@forEachDirectoryEntry
249+
}
250+
251+
var failed = false
252+
dataInfo.sources.forEach { source ->
253+
val name = source.name
254+
255+
val sourcePath = FileService.dataDir.resolve("$name.csv")
256+
257+
if (sourcePath.notExists()) {
258+
logger.warn("Data pool '${child.name}' has no source file '$name'. Skipping.")
259+
failed = true
260+
return@forEach
261+
}
262+
263+
val targetPath = dataPath.resolve("$name.csv")
264+
265+
runCatching { sourcePath.moveTo(targetPath, overwrite = true) }
266+
.onFailure {
267+
logger.error("Failed to migrate file '$name'", it)
268+
failed = true
269+
}
270+
}
271+
272+
if (failed) {
273+
logger.error("Data pool '${child.name}' has failed to migrate files. Skipping.")
274+
return@forEachDirectoryEntry
275+
}
276+
277+
logger.info("Migrated data pool '${child.name}' files.")
278+
}
279+
240280
val infoName = dataInfo.name
241281

242282
if (dataInfo.storageType == StorageType.SQLITE) {
@@ -255,10 +295,7 @@ object DataServiceImpl : IDataService {
255295

256296
logger.info("Loaded $infoName data pool.")
257297
} else {
258-
logger.info("Deleting invalid or empty data pool directory '$infoName'.")
259-
260-
runCatching { child.deleteRecursively() }
261-
.onFailure { e -> logger.error("Failed to delete directory ${child.fileName}.", e) }
298+
logger.warn("Data pool '$infoName' has invalid storage provider.")
262299
}
263300
}
264301

@@ -319,6 +356,187 @@ object DataServiceImpl : IDataService {
319356
/**
 * Downloads [model] by handing the request, unchanged, to [EmbeddingProvider.downloadModel].
 *
 * @param model the embedding model to download.
 * @param path location passed through to the provider.
 * @param onProgress progress callback passed through to the provider.
 */
override suspend fun downloadModel(model: Embedding.Model, path: Path, onProgress: (progress: Int) -> Unit) {
    EmbeddingProvider.downloadModel(model, path, onProgress)
}
321358

359+
/**
 * Imports a data pool from a zip archive and registers it in [dataPools].
 *
 * The archive is unpacked into a scratch directory below [FileService.tempDir]. It must contain an
 * `info.json` file, an `index` directory that passes [Indexer.checkIndex], and a `data` directory.
 * Only after all of these checks succeed is the pool directory created below [FileService.dataDir]
 * and the three entries moved into it.
 *
 * Failure handling:
 * - The scratch directory is always removed before the function returns, on success and on failure.
 * - If anything fails after the pool directory was created, that directory is removed again, so a
 *   failed import never blocks a later retry with "Pool directory already exists."
 * - Clean-up is best effort: a failing delete is logged and does not change the return value.
 *
 * @param path the zip archive to import; the extension is matched case-insensitively.
 * @return `true` when the pool was imported and registered, `false` otherwise.
 */
@OptIn(ExperimentalPathApi::class)
override suspend fun importPool(path: Path): Boolean {
    if (!path.extension.equals("zip", ignoreCase = true)) {
        logger.warn("File is not a zip file.")
        return false
    }

    logger.info("Importing pool from file '$path'.")

    val pathOut = FileService.tempDir.resolve(path.nameWithoutExtension)

    // Best-effort recursive delete: clean-up problems are logged instead of masking the import result.
    fun deleteQuietly(target: Path) {
        runCatching { if (target.exists()) target.deleteRecursively() }
            .onFailure { logger.error("Failed to delete '$target'.", it) }
    }

    try {
        val resultUnzip = withContext(Dispatchers.IO) { ZipService.unzip(path, pathOut) }

        if (!resultUnzip) {
            logger.warn("Failed to unzip pool.")
            return false
        }

        val infoFile = pathOut.resolve("info.json")

        // isRegularFile() is already false for a missing file, so one check covers both cases.
        if (!infoFile.isRegularFile()) {
            logger.warn("Pool info file does not exist.")
            return false
        }

        val infoText = runCatching { infoFile.readText() }
            .getOrElse {
                logger.error("Failed to read pool info file.", it)
                return false
            }

        val dataInfo = dataInfoFromString(infoText)

        if (dataInfo == null) {
            logger.warn("Pool info file is invalid.")
            return false
        }

        val destinationDir = FileService.dataDir.resolve(dataInfo.name)

        // The pool name comes from the archive, so make sure it names a direct child of the data
        // directory and cannot escape it (e.g. "../x", an absolute path or an empty name).
        val dataRoot = FileService.dataDir.toAbsolutePath().normalize()

        if (destinationDir.toAbsolutePath().normalize().parent != dataRoot) {
            logger.warn("Pool name '${dataInfo.name}' is not a valid directory name.")
            return false
        }

        if (destinationDir.exists()) {
            logger.warn("Pool directory already exists.")
            return false
        }

        val indexPath = pathOut.resolve("index")

        if (!indexPath.isDirectory()) {
            logger.warn("Pool index directory does not exist.")
            return false
        }

        val (resultChecker, report) = Indexer.checkIndex(indexPath)

        if (!resultChecker) {
            logger.error("Pool index is invalid: $report")
            return false
        }

        logger.info("Pool index is valid.")

        val storageDir = pathOut.resolve("data")

        if (!storageDir.isDirectory()) {
            logger.warn("Pool storage directory does not exist.")
            return false
        }

        // Everything in the archive has been validated; only now touch the data directory.
        var imported = false

        try {
            runCatching { destinationDir.createDirectories() }
                .onFailure {
                    logger.error("Failed to create pool directory.", it)
                    return false
                }

            runCatching { storageDir.moveTo(destinationDir.resolve("data")) }
                .onFailure {
                    logger.error("Failed to move pool data directory.", it)
                    return false
                }

            runCatching { infoFile.moveTo(destinationDir.resolve("info.json")) }
                .onFailure {
                    logger.error("Failed to move pool info file.", it)
                    return false
                }

            runCatching { indexPath.moveTo(destinationDir.resolve("index")) }
                .onFailure {
                    logger.error("Failed to move pool index directory.", it)
                    return false
                }

            logger.info("Initializing storage provider for pool '${dataInfo.name}'...")

            val storageProvider = StorageProvider.of(dataInfo.storageType)

            val status = storageProvider.init(dataInfo, destinationDir)

            if (status != ProviderStatus.Success) {
                logger.warn("Failed to initialize storage provider for pool '${dataInfo.name}'.")
                return false
            }

            logger.info("Initializing pool...")

            dataPools[dataInfo.name] =
                runCatching { DataPool(Indexer(destinationDir.resolve("index"), dataInfo), dataInfo, storageProvider) }
                    .getOrElse {
                        logger.error("Failed to initialize pool.", it)
                        return false
                    }

            logger.info("Successfully imported pool '${dataInfo.name}'.")

            imported = true

            return true
        } finally {
            // Roll back a half-installed pool so the same archive can be imported again later.
            if (!imported) deleteQuietly(destinationDir)
        }
    } finally {
        // The scratch directory is never needed after this call, whatever the outcome.
        deleteQuietly(pathOut)
    }
}
502+
503+
/**
 * Packs the pool described by [dataInfo] into `<pool name>.zip` inside [path].
 *
 * The archive is built from the pool's `index` directory, `data` directory and `info.json` file,
 * all taken from the pool directory below [FileService.dataDir]. Files named `write.lock` are
 * excluded.
 *
 * @param dataInfo description of the pool to export; its name selects the pool directory.
 * @param path directory path the archive name is resolved against.
 * @return the result reported by [ZipService.zip], or `false` when the pool directory is missing.
 */
@OptIn(ExperimentalPathApi::class)
override suspend fun exportPool(dataInfo: DataInfo, path: Path): Boolean {
    val poolName = dataInfo.name

    logger.info("Exporting pool '$poolName' to '$path'.")

    val poolDir = FileService.dataDir.resolve(poolName)

    if (poolDir.notExists()) {
        logger.warn("Pool path does not exist.")
        return false
    }

    logger.info("Packaging pool files for '$poolName'.")

    val archive = path.resolve("$poolName.zip")

    // Entries that make up a pool, in the order they are handed to the zip service.
    val entries = listOf("index", "data", "info.json")
        .map { poolDir.resolve(it) }
        .toSet()

    val exported = withContext(Dispatchers.IO) {
        ZipService.zip(entries, archive, filesToExclude = setOf("write.lock"))
    }

    when {
        exported -> logger.info("Exported pool '$poolName' to '$archive'.")
        else -> logger.error("Failed to export pool.")
    }

    return exported
}
539+
322540
override fun getPage(query: String, isSemantic: Boolean, order: QueryOrder?, pageCount: Int): PageResult {
323541

324542
if (this.currentPool == null || this.currentField == null) return Pair(emptyList(), emptyMap())

core/src/main/kotlin/dev/paulee/core/data/FileService.kt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,8 @@ internal object FileService {
8686

8787
/** Path of the `models` directory; every access goes through `ensureDir("models")`. */
val modelsDir: Path get() = ensureDir("models")
8888

89+
/** Path of the `temp` directory; every access goes through `ensureDir("temp")`. */
val tempDir: Path get() = ensureDir("temp")
90+
8991
private val logger = getLogger(FileService::class.java)
9092

9193
private val mapper = jacksonObjectMapper().apply { enable(SerializationFeature.INDENT_OUTPUT) }

0 commit comments

Comments
 (0)