Skip to content

Commit 56073b6

Browse files
authored
Merge pull request #38 from Paulanerus/dev
Dev
2 parents 26087e7 + 58576c4 commit 56073b6

8 files changed

Lines changed: 283 additions & 45 deletions

File tree

CITATION.cff

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,11 @@ authors:
1010
repository-code: 'https://github.com/Paulanerus/TextExplorer'
1111
url: 'https://paulee.dev/textvariantexplorer/'
1212
repository-artifact: >-
13-
https://github.com/Paulanerus/TextExplorer/releases/tag/v1.3.2
13+
https://github.com/Paulanerus/TextExplorer/releases/tag/v1.3.3
1414
abstract: >-
1515
A tool designed for the exploration, analysis, and
1616
comparison of textual data variants.
1717
license: GPL-3.0
1818
commit: b1dcc69
19-
version: 1.3.2
20-
date-released: '2025-08-19'
19+
version: 1.3.3
20+
date-released: '2025-08-22'

core/src/main/kotlin/dev/paulee/core/Utility.kt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package dev.paulee.core
22

33
import org.slf4j.LoggerFactory.getLogger
4+
import java.security.MessageDigest
45

56
object GlobalExceptionHandler {
67

@@ -69,3 +70,8 @@ fun splitStr(str: String, delimiter: Char, quoteCharacters: Array<Char> = arrayO
6970
}
7071

7172
/**
 * Turns a data source name into a normalized identifier: everything from the last `.`
 * onwards is dropped (the whole name is kept when there is no `.`), and every space is
 * replaced by an underscore.
 */
fun normalizeDataSource(dataSource: String): String {
    val lastDot = dataSource.lastIndexOf('.')
    val stem = if (lastDot < 0) dataSource else dataSource.substring(0, lastDot)

    return stem.replace(' ', '_')
}
73+
74+
/**
 * Computes the SHA-1 digest of [input], encoded as UTF-8, and returns it as a lowercase
 * hexadecimal string (two hex digits per digest byte).
 */
fun sha1Hex(input: String): String {
    val hexDigits = "0123456789abcdef"
    val digest = MessageDigest.getInstance("SHA-1").digest(input.toByteArray(Charsets.UTF_8))

    return buildString(digest.size * 2) {
        for (byte in digest) {
            // Mask to 0..255 so negative bytes map to their unsigned two-digit form.
            val unsigned = byte.toInt() and 0xFF
            append(hexDigits[unsigned ushr 4])
            append(hexDigits[unsigned and 0x0F])
        }
    }
}

core/src/main/kotlin/dev/paulee/core/data/DataServiceImpl.kt

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -161,8 +161,7 @@ private class DataPool(val indexer: Indexer, val dataInfo: DataInfo, val storage
161161
indexedValues.add(query)
162162
}
163163

164-
//FIXME: "ids.take(10000).toSet()" is a temporary fix to prevent a silent crash with large result sets, due to bad pagination and will be fixed with v2.
165-
return IndexSearchResult(ids.take(10000).toSet(), token, indexedValues)
164+
return IndexSearchResult(ids, token, indexedValues)
166165
}
167166

168167
fun hasIdentifier(name: String, entries: Map<String, String>): Boolean {

core/src/main/kotlin/dev/paulee/core/data/io/BufferedCSVReader.kt

Lines changed: 31 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -4,23 +4,25 @@ import dev.paulee.core.normalizeSourceName
44
import dev.paulee.core.splitStr
55
import org.slf4j.LoggerFactory.getLogger
66
import java.nio.file.Path
7+
import java.util.ArrayList
8+
import java.util.HashMap
79
import kotlin.io.path.bufferedReader
810

9-
internal class BufferedCSVReader(private val path: Path, private val delimiter: Char = ',') {
11+
class BufferedCSVReader(private val path: Path, private val delimiter: Char = ',', val batchSize: Int = 300) {
1012

1113
private val logger = getLogger(BufferedCSVReader::class.java)
1214

13-
private var errorCount: Long = 0
14-
1515
private var reader = path.bufferedReader()
1616

1717
private var headSize: Int = -1
1818

19-
fun readLines(callback: (List<Map<String, String>>) -> Unit) {
20-
val batch = mutableListOf<Map<String, String>>()
19+
fun readLines(callback: (List<HashMap<String, String>>) -> Unit) {
20+
val batch = ArrayList<HashMap<String, String>>(batchSize)
21+
22+
val builder = StringBuilder(1024)
2123

2224
reader.use {
23-
val head = this.readLine()
25+
val head = this.readLine(builder)
2426

2527
if (head == null) {
2628
this.logger.error("Could not read Head ($path).")
@@ -29,61 +31,61 @@ internal class BufferedCSVReader(private val path: Path, private val delimiter:
2931

3032
val header = splitStr(head, delimiter).map { normalizeSourceName(it) }
3133

32-
this.headSize = header.size
34+
this.headSize = countDelimiter(head)
3335

34-
var line: String? = this.readLine()
36+
var line: String? = this.readLine(builder)
3537
while (line != null) {
3638
val split = splitStr(line, delimiter)
3739

3840
if (split.size == this.headSize) {
39-
val headToValue = mutableMapOf<String, String>()
41+
val headToValue = HashMap<String, String>(this.headSize)
4042

4143
split.forEachIndexed { index, entry -> headToValue[header[index]] = entry }
4244

4345
batch.add(headToValue)
44-
} else {
45-
errorCount++
46-
this.logger.warn("Line mismatch (Head: $headSize, Line: ${split.size}, error count: $errorCount): $line")
47-
}
4846

49-
if (batch.size == 100) callback(batch).also { batch.clear() }
47+
if (batch.size == batchSize) {
48+
callback(batch)
49+
batch.clear()
50+
}
51+
}
5052

51-
line = this.readLine()
53+
line = this.readLine(builder)
5254
}
5355

5456
if (batch.isNotEmpty()) callback(batch)
5557
}
5658
}
5759

58-
/**
 * Reads the next logical CSV record from [reader].
 *
 * While the header has not been processed yet ([headSize] is `-1`), or when the physical line
 * already has exactly [headSize] fields according to [countDelimiter], the line is returned
 * as-is. Otherwise the record is treated as continuing on the following physical line(s),
 * which are appended to [builder] (without any separator) until the field count reaches
 * [headSize].
 *
 * @param builder scratch buffer reused between calls; its previous content is discarded.
 * @return the record, or `null` when the first read hits the end of the input or fails.
 *         A record that is still incomplete when the input ends (or a read fails) is returned
 *         as far as it was read.
 */
private fun readLine(builder: StringBuilder): String? {
    val line = runCatching { this.reader.readLine() }
        .getOrElse { e ->
            this.logger.error("Exception: Failed to read line.", e)
            null
        } ?: return null

    if (this.headSize == -1 || countDelimiter(line) == this.headSize) return line

    builder.clear()
    builder.append(line)

    while (countDelimiter(builder) < this.headSize) {
        // Stop at end of input or on a read failure: appending nothing would never raise the
        // field count, so the loop would otherwise spin forever on a truncated last record.
        val continuation = runCatching { this.reader.readLine() }
            .getOrElse { e ->
                this.logger.error("Exception: Failed to read line (while).", e)
                null
            } ?: break

        builder.append(continuation)
    }

    return builder.toString()
}
7880

79-
private fun getDelimiterCount(str: String): Int {
80-
var amount = 0
81+
private fun countDelimiter(sequence: CharSequence): Int {
82+
var amount = 1
8183

8284
var insideQuotes = false
83-
str.forEach {
84-
if (it == '"') insideQuotes = !insideQuotes
85+
for (char in sequence) {
86+
if (char == '"') insideQuotes = !insideQuotes
8587

86-
if (it == delimiter && !insideQuotes) amount++
88+
if (char == delimiter && !insideQuotes) amount++
8789
}
8890

8991
return amount

core/src/main/kotlin/dev/paulee/core/data/sql/Database.kt

Lines changed: 64 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,13 @@ import dev.paulee.api.data.Source
55
import dev.paulee.api.data.UniqueField
66
import dev.paulee.api.data.provider.QueryOrder
77
import dev.paulee.core.normalizeDataSource
8+
import dev.paulee.core.sha1Hex
89
import org.slf4j.LoggerFactory.getLogger
910
import java.io.Closeable
1011
import java.nio.file.Path
1112
import java.sql.Connection
1213
import java.sql.DriverManager
14+
import java.sql.Types
1315
import kotlin.io.path.createDirectories
1416

1517
private enum class ColumnType {
@@ -40,6 +42,8 @@ private class Table(val name: String, columns: List<Column>) {
4042

4143
val columns = listOf(primaryKey) + columns.filter { !it.primary }
4244

45+
val tempTables = mutableMapOf<String, String>()
46+
4347
fun createIfNotExists(connection: Connection) {
4448
connection.createStatement().use {
4549
it.execute("CREATE TABLE IF NOT EXISTS $name (${columns.joinToString(", ")})")
@@ -86,7 +90,7 @@ private class Table(val name: String, columns: List<Column>) {
8690
append("SELECT * FROM ")
8791
append(name)
8892

89-
append(buildWhereClause(whereClause))
93+
append(buildWhereClause(connection, whereClause))
9094

9195
order?.takeIf { it.first.isNotBlank() }?.let {
9296
append(" ORDER BY ")
@@ -122,7 +126,7 @@ private class Table(val name: String, columns: List<Column>) {
122126
append("SELECT COUNT(*) FROM ")
123127
append(name)
124128

125-
append(buildWhereClause(whereClause))
129+
append(buildWhereClause(connection, whereClause))
126130
}
127131

128132
return connection.createStatement().use { statement ->
@@ -162,7 +166,7 @@ private class Table(val name: String, columns: List<Column>) {
162166

163167
override fun toString(): String = "$name primary=${primaryKey}, columns={${columns.joinToString(", ")}}"
164168

165-
private fun buildWhereClause(whereClause: Map<String, List<String>>): String {
169+
private fun buildWhereClause(connection: Connection, whereClause: Map<String, List<String>>): String {
166170
if (whereClause.isEmpty()) return ""
167171

168172
val parts =
@@ -178,11 +182,22 @@ private class Table(val name: String, columns: List<Column>) {
178182

179183
append("$column = ${if (columnType == ColumnType.TEXT) "'${value.escapeLiteral()}'" else value}")
180184
} else {
181-
val inClause = nonWildcards.joinToString(
182-
", ", prefix = "IN (", postfix = ")"
183-
) { if (columnType == ColumnType.TEXT) "'${it.escapeLiteral()}'" else it }
184185

185-
append("$column $inClause")
186+
if(nonWildcards.size > 500){
187+
val hash = sha1Hex(nonWildcards.joinToString(""))
188+
189+
val tempQuery = tempTables.getOrPut(hash) {
190+
createAndUpdateTempTable(connection, hash, nonWildcards, columnType)
191+
}
192+
193+
append("$column IN ($tempQuery)")
194+
}else{
195+
val inClause = nonWildcards.joinToString(
196+
", ", prefix = "IN (", postfix = ")"
197+
) { if (columnType == ColumnType.TEXT) "'${it.escapeLiteral()}'" else it }
198+
199+
append("$column $inClause")
200+
}
186201
}
187202

188203
if (wildcards.isNotEmpty()) append(" OR ")
@@ -212,6 +227,48 @@ private class Table(val name: String, columns: List<Column>) {
212227
.replace("*", "%")
213228
.replace("?", "_")
214229
.escapeLiteral()
230+
231+
/**
 * Creates the temporary table `tmp_<hash>` with a single column `v` of the given [type] and
 * fills it with [values], so that a large value list can be matched through a sub-select
 * instead of a literal `IN (...)` list.
 *
 * Values are inserted in chunks of 500 using bound parameters. For [ColumnType.INTEGER] and
 * [ColumnType.REAL], values that cannot be parsed as a number are inserted as SQL `NULL`.
 *
 * The call is idempotent: an already existing table with the same name is emptied and
 * refilled, so a retry after a partially failed earlier attempt does not fail on the
 * `CREATE` statement.
 *
 * @param connection connection on which the temporary table is created.
 * @param hash suffix of the temporary table name; interpolated into the SQL unescaped.
 * @param values values to store in column `v`.
 * @param type column type of `v`; also selects how the parameters are bound.
 * @return the sub-select `SELECT v FROM tmp_<hash>`.
 */
private fun createAndUpdateTempTable(connection: Connection, hash: String, values: List<String>, type: ColumnType): String {
    val tempName = "tmp_$hash"

    connection.createStatement().use { statement ->
        // NOTE(review): assumes the SQL dialect accepts IF NOT EXISTS for temp tables
        // (SQLite and PostgreSQL do) - confirm against the driver in use.
        statement.execute("CREATE TEMP TABLE IF NOT EXISTS $tempName (v $type)")

        // Drop rows left behind by an earlier, possibly interrupted, fill.
        statement.execute("DELETE FROM $tempName")
    }

    values.chunked(500).forEach { chunk ->
        val placeholders = List(chunk.size) { "(?)" }.joinToString(", ")
        val sql = "INSERT INTO $tempName(v) VALUES $placeholders"

        connection.prepareStatement(sql).use { ps ->
            chunk.forEachIndexed { index, value ->
                val paramIndex = index + 1 // JDBC parameters are 1-based.

                when (type) {
                    ColumnType.TEXT -> ps.setString(paramIndex, value)

                    ColumnType.INTEGER -> {
                        val longVal = value.toLongOrNull()
                        if (longVal == null) ps.setNull(paramIndex, Types.INTEGER)
                        else ps.setLong(paramIndex, longVal)
                    }

                    ColumnType.REAL -> {
                        val dblVal = value.toDoubleOrNull()
                        if (dblVal == null) ps.setNull(paramIndex, Types.REAL)
                        else ps.setDouble(paramIndex, dblVal)
                    }
                }
            }

            ps.executeUpdate()
        }
    }

    return "SELECT v FROM $tempName"
}
215272
}
216273

217274
internal class Database(path: Path) : Closeable {

docs/source/content/installation.md

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,7 @@ You can always obtain the latest release from the following link:
1111

1212
## Example Data Set
1313

14-
An example data set is available for download at [Zenodo](https://zenodo.org/records/12723324).
15-
This is Version 3 of the set titled *A Corpus of Biblical Names in the Greek New Testament to Study the Additions, Omissions, and Variations across Different Manuscripts*.
14+
Download the example data set from [Zenodo](https://zenodo.org/records/15789063). It corresponds to Version 4 of the data set "A Corpus of Biblical Names in the Greek New Testament to Study the Additions, Omissions, and Variations across Different Manuscripts." To import it, use `GreekVariant4.json`, available on [GitHub](https://github.com/Paulanerus/TextExplorer/blob/master/example/GreekVariant4.json) (for more information, see [Data Import](usage.md)).
1615

1716
Additionally, a plugin for this data set can be downloaded from the [official release page](https://github.com/Paulanerus/TextExplorer/releases/latest) (demo.jar).
1817
Both plugins and data sets can be loaded directly within the application, as documented [here](usage.md).

0 commit comments

Comments
 (0)