Skip to content

Commit 3ef5831

Browse files
authored
Merge pull request #48 from Paulanerus/dev
Dev
2 parents 8ae1ee7 + 1ffb851 commit 3ef5831

28 files changed

Lines changed: 548 additions & 401 deletions

File tree

api/src/main/kotlin/dev/paulee/api/data/Data.kt

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -128,4 +128,4 @@ data class Source(
128128
val preFilter: PreFilter? = null,
129129
)
130130

131-
data class DataInfo(val name: String, val sources: List<Source>, val storageType: StorageType = StorageType.SQLITE)
131+
data class DataInfo(val name: String, val sources: List<Source>, val storageType: StorageType = StorageType.Default)

api/src/main/kotlin/dev/paulee/api/data/provider/StorageProvider.kt

Lines changed: 10 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -3,24 +3,26 @@ package dev.paulee.api.data.provider
33
import dev.paulee.api.data.DataInfo
44
import java.io.Closeable
55
import java.nio.file.Path
6+
import java.util.LinkedHashMap
67

78
typealias QueryOrder = Pair<String, Boolean>
89

910
enum class StorageType {
11+
Default,
12+
13+
@Deprecated("SQLITE support has been removed. Kept only to detect legacy data pools; use StorageType.Default.")
1014
SQLITE,
1115
}
1216

1317
enum class ProviderStatus {
14-
SUCCESS,
15-
FAILED,
16-
EXISTS,
18+
Success,
19+
Failed,
20+
Exists,
1721
}
1822

1923
interface IStorageProvider : Closeable {
2024

21-
fun init(dataInfo: DataInfo, path: Path, lock: Boolean = false): ProviderStatus
22-
23-
fun insert(name: String, entries: List<Map<String, String>>)
25+
fun init(dataInfo: DataInfo, path: Path): ProviderStatus
2426

2527
fun get(
2628
name: String,
@@ -40,4 +42,6 @@ interface IStorageProvider : Closeable {
4042
): Long
4143

4244
fun suggestions(name: String, field: String, value: String, amount: Int): List<String>
45+
46+
fun streamData(name: String): Sequence<LinkedHashMap<String, String>>
4347
}

api/src/main/kotlin/dev/paulee/api/internal/Embedding.kt

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -11,16 +11,16 @@ object Embedding {
1111
) {
1212
EmbeddingGemma(
1313
"onnx-community/embeddinggemma-300m-ONNX",
14-
"A lightweight open embedding model from Google, built on Gemma 3 and trained on 100+ spoken languages.",
14+
"model_management.embedding_gemma.desc",
1515
"Google DeepMind",
1616
"300M",
1717
"https://huggingface.co/google/embeddinggemma-300m",
1818
ModelData()
1919
),
2020
AncientGreekBert(
2121
"onnx-community/Ancient-Greek-BERT-ONNX",
22-
"A BERT model specialized for Greek and Ancient Greek texts.",
23-
"Pranaydeep Singh, Gorik Rutten and Els Lefever",
22+
"model_management.ancient_greek_bert.desc",
23+
"Pranaydeep Singh, Gorik Rutten, Els Lefever",
2424
"110M",
2525
"https://huggingface.co/pranaydeeps/Ancient-Greek-BERT",
2626
ModelData(

build.gradle.kts

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -64,7 +64,7 @@ compose.desktop {
6464
}
6565

6666
macOS {
67-
// iconFile.set(project.file("ui/src/main/resources/icon.icns"))
67+
iconFile.set(project.file("ui/src/main/resources/icon.icns"))
6868

6969
appCategory = "public.app-category.utilities"
7070
}

core/build.gradle.kts

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -18,7 +18,7 @@ dependencies {
1818

1919
implementation(kotlin("reflect"))
2020

21-
implementation("org.xerial:sqlite-jdbc:${rootProject.extra["sqlite-jdbc.version"]}")
21+
implementation("org.duckdb:duckdb_jdbc:${rootProject.extra["duckdb.version"]}")
2222

2323
implementation("io.github.java-diff-utils:java-diff-utils:${rootProject.extra["jdu.version"]}")
2424

@@ -45,6 +45,8 @@ dependencies {
4545
if(os.isMacOsX) implementation("com.microsoft.onnxruntime:onnxruntime:${rootProject.extra["onnx.version"]}")
4646
else implementation("com.microsoft.onnxruntime:onnxruntime_gpu:${rootProject.extra["onnx.version"]}")
4747

48+
implementation("com.github.ben-manes.caffeine:caffeine:${rootProject.extra["caffeine.version"]}")
49+
4850
testImplementation(kotlin("test"))
4951
}
5052

core/src/main/kotlin/dev/paulee/core/data/DataServiceImpl.kt

Lines changed: 44 additions & 59 deletions
Original file line number | Diff line number | Diff line change
@@ -1,15 +1,13 @@
11
package dev.paulee.core.data
22

3-
import dev.paulee.api.data.DataInfo
4-
import dev.paulee.api.data.IDataService
5-
import dev.paulee.api.data.PreFilter
6-
import dev.paulee.api.data.VariantMapping
3+
import com.github.benmanes.caffeine.cache.Cache
4+
import com.github.benmanes.caffeine.cache.Caffeine
5+
import dev.paulee.api.data.*
76
import dev.paulee.api.data.provider.IStorageProvider
87
import dev.paulee.api.data.provider.ProviderStatus
98
import dev.paulee.api.data.provider.QueryOrder
109
import dev.paulee.api.internal.Embedding
1110
import dev.paulee.core.data.analysis.Indexer
12-
import dev.paulee.core.data.io.BufferedCSVReader
1311
import dev.paulee.core.data.model.DataPool
1412
import dev.paulee.core.data.provider.EmbeddingProvider
1513
import dev.paulee.core.data.provider.StorageProvider
@@ -19,19 +17,22 @@ import kotlinx.coroutines.withContext
1917
import org.slf4j.LoggerFactory.getLogger
2018
import java.io.IOException
2119
import java.nio.file.Path
20+
import java.time.Duration
2221
import kotlin.coroutines.cancellation.CancellationException
2322
import kotlin.io.path.*
2423
import kotlin.math.ceil
2524

25+
typealias QueryKey = Triple<Int, String, QueryOrder?>
26+
2627
typealias PageResult = Pair<List<Map<String, String>>, Map<String, List<Map<String, String>>>>
2728

2829
object DataServiceImpl : IDataService {
2930

3031
private val logger = getLogger(DataServiceImpl::class.java)
3132

32-
private const val PAGE_SIZE = 50
33+
private const val PAGE_SIZE = 100
3334

34-
private const val CSV_READER_BATCH_SIZE = 300
35+
private const val BATCH_SIZE = 1000
3536

3637
private val variantPattern = Regex("@([^:]+):(\\S+)")
3738

@@ -41,11 +42,10 @@ object DataServiceImpl : IDataService {
4142

4243
private var currentField: String? = null
4344

44-
private val pageCache = object : LinkedHashMap<Triple<Int, String, QueryOrder?>, PageResult>(6, 0.75f, true) {
45-
override fun removeEldestEntry(eldest: MutableMap.MutableEntry<Triple<Int, String, QueryOrder?>, PageResult>?): Boolean {
46-
return size > 6
47-
}
48-
}
45+
private val pageCache: Cache<QueryKey, PageResult> = Caffeine.newBuilder()
46+
.maximumSize(32)
47+
.expireAfterAccess(Duration.ofDays(3))
48+
.build()
4949

5050
private val storageProvider = mutableMapOf<String, IStorageProvider>()
5151

@@ -60,11 +60,13 @@ object DataServiceImpl : IDataService {
6060
try {
6161
val poolPath = FileService.dataDir.resolve(dataInfo.name)
6262

63-
val storageProvider = StorageProvider.of(dataInfo.storageType)
63+
val currentProvider = StorageProvider.of(dataInfo.storageType)
6464

65-
val initStatus = storageProvider.init(dataInfo, poolPath)
66-
67-
if (initStatus != ProviderStatus.SUCCESS) return@withContext initStatus == ProviderStatus.EXISTS
65+
when (currentProvider.init(dataInfo, poolPath)) {
66+
ProviderStatus.Success -> Unit
67+
ProviderStatus.Exists -> return@withContext true
68+
else -> return@withContext false
69+
}
6870

6971
dataInfoToString(dataInfo)?.let { json ->
7072
poolPath.resolve("info.json").writeText(json)
@@ -75,62 +77,45 @@ object DataServiceImpl : IDataService {
7577
DataPool(
7678
indexer = Indexer(poolPath.resolve("index"), dataInfo),
7779
dataInfo = dataInfo,
78-
storageProvider = storageProvider
80+
storageProvider = currentProvider
7981
)
8082
}.getOrElse { e ->
8183
logger.error("Exception: Failed to create data pool.", e)
8284
return@withContext false
8385
}
8486

85-
val totalBatches = dataInfo.sources.sumOf { source ->
86-
val sourcePath =
87-
FileService.dataDir.resolve(source.name.let { if (it.endsWith(".csv")) it else "$it.csv" })
87+
val sourcesWithIndex = dataInfo.sources.filter { it.fields.any { field -> field is IndexField } }
8888

89-
if (sourcePath.exists()) BufferedCSVReader.estimateBatches(sourcePath, CSV_READER_BATCH_SIZE)
90-
else 0
91-
}
89+
val totalBatches =
90+
((sourcesWithIndex.sumOf { currentProvider.count(it.name) } + BATCH_SIZE - 1) / BATCH_SIZE).toInt()
9291

9392
var processedBatches = 0
9493
var lastPercentage = 0
9594
onProgress(0)
9695

97-
dataInfo.sources.forEach { source ->
98-
val file = source.name
99-
100-
if (file.isEmpty()) {
101-
logger.warn("No data source provided for ${file}.")
102-
return@forEach
103-
}
96+
sourcesWithIndex
97+
.forEach { source ->
98+
val name = source.name
10499

105-
val sourcePath =
106-
FileService.dataDir.resolve(file.let { if (it.endsWith(".csv")) it else "$it.csv" })
100+
currentProvider.streamData(name)
101+
.chunked(BATCH_SIZE)
102+
.forEach { entries ->
103+
dataPool.indexer.indexEntries(name, entries)
107104

108-
if (!sourcePath.exists()) {
109-
logger.warn("Source file '$sourcePath' not found.")
110-
return@forEach
111-
}
105+
processedBatches++
112106

113-
val idGenerator = generateSequence(1L) { it + 1 }.iterator()
107+
val percentage =
108+
(if (totalBatches > 0) (processedBatches * 100) / totalBatches else 0)
114109

115-
BufferedCSVReader(sourcePath, batchSize = CSV_READER_BATCH_SIZE).readLines { lines ->
116-
val entries = lines.map { line ->
117-
if (dataPool.hasIdentifier(file, line)) line
118-
else line + ("${file}_ag_id" to idGenerator.next().toString())
119-
}
120-
121-
dataPool.indexer.indexEntries(file, entries)
122-
storageProvider.insert(file, entries)
123-
124-
processedBatches++
125-
126-
val percentage = if (totalBatches > 0) (processedBatches * 100) / totalBatches else 0
127-
128-
if (percentage > lastPercentage) {
129-
onProgress(percentage)
130-
lastPercentage = percentage
131-
}
110+
if (percentage > lastPercentage) {
111+
onProgress(percentage)
112+
lastPercentage = percentage
113+
}
114+
}
132115
}
133-
}
116+
117+
dataPool.indexer.finish()
118+
EmbeddingProvider.finish()
134119

135120
dataPools[dataInfo.name] = dataPool
136121

@@ -170,7 +155,7 @@ object DataServiceImpl : IDataService {
170155

171156
val storageProvider = StorageProvider.of(dataInfo.storageType)
172157

173-
if (storageProvider.init(dataInfo, child) == ProviderStatus.EXISTS) {
158+
if (storageProvider.init(dataInfo, child) == ProviderStatus.Exists) {
174159
dataPools[dataInfo.name] =
175160
DataPool(Indexer(child.resolve("index"), dataInfo), dataInfo, storageProvider)
176161

@@ -247,7 +232,7 @@ object DataServiceImpl : IDataService {
247232

248233
val key = Triple(pageCount, query, order)
249234

250-
pageCache[key]?.let { return it }
235+
pageCache.getIfPresent(key)?.let { return it }
251236

252237
val dataPool = this.dataPools[this.currentPool] ?: return Pair(emptyList(), emptyMap())
253238

@@ -284,7 +269,7 @@ object DataServiceImpl : IDataService {
284269

285270
val result = PageResult(entries, links)
286271

287-
pageCache[key] = result
272+
pageCache.put(key, result)
288273

289274
return result
290275
}
@@ -323,7 +308,7 @@ object DataServiceImpl : IDataService {
323308
return null
324309
}
325310

326-
provider.init(dataInfo, path, true)
311+
provider.init(dataInfo, path)
327312

328313
return provider
329314
}

core/src/main/kotlin/dev/paulee/core/data/FileService.kt

Lines changed: 26 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -37,6 +37,31 @@ internal object FileService {
3737
}
3838
}
3939

40+
val isCuda12xInstalled: Boolean by lazy {
41+
if (OperatingSystem.isMacOS) return@lazy false
42+
43+
runCatching {
44+
val process = ProcessBuilder().apply {
45+
command("nvcc", "--version")
46+
redirectErrorStream(true)
47+
}.start()
48+
49+
val output = process.inputStream.bufferedReader().use { it.readText() }
50+
process.waitFor()
51+
52+
val versionRegex = Regex("""V(\d+)\.(\d+)\.(\d+)""")
53+
val matchResult = versionRegex.find(output)
54+
55+
if (matchResult != null) {
56+
val majorVersion = matchResult.groupValues[1].toIntOrNull() ?: 0
57+
58+
majorVersion == 12
59+
} else {
60+
false
61+
}
62+
}.getOrDefault(false)
63+
}
64+
4065
val appDir: Path get() = ensureDir(".textexplorer", true)
4166

4267
val pluginsDir: Path get() = ensureDir("plugins")
@@ -50,7 +75,7 @@ internal object FileService {
5075
private val mapper = jacksonObjectMapper().apply { enable(SerializationFeature.INDENT_OUTPUT) }
5176

5277
init {
53-
logger.info("Operating system: ${OperatingSystem.current}")
78+
logger.info("Operating system: ${OperatingSystem.current} | CUDA: $isCuda12xInstalled")
5479
}
5580

5681
fun toJson(dataInfo: DataInfo): String? = runCatching { this.mapper.writeValueAsString(dataInfo) }

0 commit comments

Comments
 (0)