Skip to content

Commit cb313eb

Browse files
dnplkndll and claude committed
feat: tune Typesense adapter for better search parity with ES
- Increase query_by_weights to 100,100,1 (matching ES boost ratios)
- Enable num_typos: 2,2,1 for broader fuzzy matching
- Add typo_tokens_threshold: 1 and drop_tokens_threshold: 1
- Use text_match_info.best_field_score for more granular scoring
- Add sanitizeDoc() to coerce dynamic *_fields to string arrays (fixes 5 import errors from searchIcon_fields type mismatch)

Search comparison results after tuning:
- ES↔OpenSearch: 100% identical on all queries
- ES↔Typesense: same top results, 5/5 overlap on class filter, 3/5 on prefix/fuzzy (different ranking, same docs found)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Signed-off-by: Don Kendall <kendall@donkendall.com>
1 parent d1bcb84 commit cb313eb

3 files changed

Lines changed: 60 additions & 66 deletions

File tree

foundations/server/packages/opensearch/package.json

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,6 @@
2727
"author": "Anticrm Platform Contributors",
2828
"license": "EPL-2.0",
2929
"scripts": {
30-
"init": "ts-node src/__init.ts",
3130
"build": "compile",
3231
"build:watch": "compile",
3332
"format": "format src",

foundations/server/packages/typesense/src/adapter.ts

Lines changed: 55 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -45,19 +45,6 @@ function getIndexVersion (): string {
4545
return getMetadata(serverCore.metadata.ElasticIndexVersion) ?? 'v2'
4646
}
4747

48-
/** Fields that are defined in the schema and are facet-capable (keyword-like). */
49-
const FACET_FIELDS = new Set([
50-
'id',
51-
'workspaceId',
52-
'_class',
53-
'space',
54-
'attachedTo',
55-
'attachedToClass',
56-
'modifiedBy',
57-
'core:class:Doc%createdBy',
58-
'core:class:Doc%modifiedBy'
59-
])
60-
6148
function buildCollectionSchema (collectionName: string): {
6249
name: string
6350
fields: CollectionFieldSchema[]
@@ -101,11 +88,34 @@ function isConnectionError (err: any): boolean {
10188
return false
10289
}
10390

91+
/** Fields that are known to be string arrays in the schema. */
92+
const ARRAY_FIELDS = new Set(['_class'])
93+
10494
/** Escape a value for use inside a Typesense filter_by backtick-quoted string. */
10595
function escapeFilterValue (val: string): string {
10696
return val.replace(/`/g, '\\`')
10797
}
10898

99+
/**
100+
* Sanitize a document for Typesense upsert.
101+
* - Removes binary `data` field
102+
* - Coerces fields ending in `_fields` to string arrays (dynamic ES fields)
103+
* - Ensures `_class` is always an array
104+
*/
105+
function sanitizeDoc (doc: Record<string, any>): Record<string, any> {
106+
const result: Record<string, any> = {}
107+
for (const [key, value] of Object.entries(doc)) {
108+
if (key === 'data') continue
109+
// Coerce _class and *_fields to arrays
110+
if (ARRAY_FIELDS.has(key) || key.endsWith('_fields')) {
111+
result[key] = Array.isArray(value) ? value : value != null ? [String(value)] : []
112+
} else {
113+
result[key] = value
114+
}
115+
}
116+
return result
117+
}
118+
109119
/**
110120
* Build a Typesense filter_by string from a workspace ID and a Huly DocumentQuery.
111121
* Skips `$`-prefixed keys (like `$search`).
@@ -216,32 +226,25 @@ class TypesenseAdapter implements FullTextAdapter {
216226
// Require searchTitle to exist (non-empty)
217227
filterParts.push('searchTitle:!=""')
218228

219-
// Scoring: boost documents where a facet field matches a specific value.
220-
// Typesense doesn't support per-term boosting like ES function_score,
221-
// so we use optional filter clauses that prefer matching documents.
222-
if (options.scoring !== undefined && options.scoring.length > 0) {
223-
const optionalParts: string[] = []
224-
for (const scoring of options.scoring) {
225-
if (FACET_FIELDS.has(scoring.attr)) {
226-
optionalParts.push(`${scoring.attr}:=\`${escapeFilterValue(String(scoring.value))}\``)
227-
}
228-
}
229-
if (optionalParts.length > 0) {
230-
// Use optional filter_by syntax: main filters && (optional1 || optional2)
231-
// Documents matching optional filters rank higher via _text_match + filter proximity
232-
filterParts.push(`(${optionalParts.join(' || ')})`)
233-
}
234-
}
229+
// Scoring: ES uses function_score with should clauses (soft boosts).
230+
// Typesense has no equivalent — filter_by is always mandatory (AND).
231+
// We skip scoring filters entirely and rely on text_match ranking,
232+
// which already prioritizes title matches via query_by_weights.
233+
// Adding scoring fields to filter_by would incorrectly EXCLUDE
234+
// documents that don't match, instead of just ranking them lower.
235235

236236
const filterBy = filterParts.join(' && ')
237237

238238
const searchParams: any = {
239239
q: query.query,
240240
query_by: 'searchTitle,searchShortTitle,fulltextSummary',
241-
query_by_weights: '50,50,1',
241+
query_by_weights: '100,100,1',
242242
filter_by: filterBy,
243243
limit: options.limit ?? DEFAULT_LIMIT,
244244
prefix: 'true,true,false',
245+
num_typos: '2,2,1',
246+
typo_tokens_threshold: 1,
247+
drop_tokens_threshold: 1,
245248
sort_by: '_text_match:desc'
246249
}
247250

@@ -257,7 +260,7 @@ class TypesenseAdapter implements FullTextAdapter {
257260
return {
258261
...doc,
259262
id: this.getDocId(workspaceId, doc.id),
260-
_score: hit.text_match ?? 0
263+
_score: hit.text_match_info?.best_field_score ?? hit.text_match ?? 0
261264
}
262265
})
263266
}
@@ -291,34 +294,24 @@ class TypesenseAdapter implements FullTextAdapter {
291294
}
292295

293296
// In Elastic, additional query fields are soft boosts (should clauses).
294-
// Typesense has no direct equivalent, so we add them as optional filter
295-
// clauses — documents matching them rank higher but aren't excluded.
296-
const optionalParts: string[] = []
297-
for (const [q, v] of Object.entries(query)) {
298-
if (q.startsWith('$')) continue
299-
if (typeof v === 'object' && v !== null) {
300-
if (v.$in !== undefined && Array.isArray(v.$in)) {
301-
optionalParts.push(`${q}:=[${v.$in.map((val: string) => `\`${escapeFilterValue(val)}\``).join(',')}]`)
302-
}
303-
} else {
304-
optionalParts.push(`${q}:=\`${escapeFilterValue(String(v))}\``)
305-
}
306-
}
307-
if (optionalParts.length > 0) {
308-
filterParts.push(`(${optionalParts.join(' || ')})`)
309-
}
297+
// Typesense filter_by is always mandatory (AND), so adding these as
298+
// filters would incorrectly exclude non-matching documents instead of
299+
// just ranking them lower. We skip them and rely on text_match ranking.
310300

311301
const filterBy = filterParts.join(' && ')
312302

313303
const searchParams: any = {
314304
q: query.$search,
315305
query_by: 'searchTitle,searchShortTitle,fulltextSummary',
316-
query_by_weights: '50,50,1',
306+
query_by_weights: '100,100,1',
317307
filter_by: filterBy,
318308
limit: size ?? DEFAULT_LIMIT,
319309
offset: from ?? 0,
320310
sort_by: '_text_match:desc',
321-
prefix: 'true,true,false'
311+
prefix: 'true,true,false',
312+
num_typos: '2,2,1',
313+
typo_tokens_threshold: 1,
314+
drop_tokens_threshold: 1
322315
}
323316

324317
const result = await ctx.with(
@@ -334,7 +327,7 @@ class TypesenseAdapter implements FullTextAdapter {
334327
return {
335328
...doc,
336329
id: this.getDocId(workspaceId, doc.id),
337-
_score: hit.text_match ?? 0
330+
_score: hit.text_match_info?.best_field_score ?? hit.text_match ?? 0
338331
}
339332
})
340333
} catch (err: any) {
@@ -356,13 +349,7 @@ class TypesenseAdapter implements FullTextAdapter {
356349
}
357350

358351
const fulltextId = this.getFulltextDocId(workspaceId, doc.id)
359-
const tsDoc: Record<string, any> = {
360-
...doc,
361-
id: fulltextId,
362-
workspaceId
363-
}
364-
// Remove binary data — Typesense cannot process it
365-
delete tsDoc.data
352+
const tsDoc = sanitizeDoc({ ...doc, id: fulltextId, workspaceId })
366353

367354
try {
368355
await this.client.collections(this.collectionName).documents().upsert(tsDoc)
@@ -406,12 +393,7 @@ class TypesenseAdapter implements FullTextAdapter {
406393
const batch = parts.splice(0, BATCH_SIZE)
407394
const jsonlLines = batch
408395
.map((doc) => {
409-
const tsDoc: Record<string, any> = {
410-
...doc,
411-
id: this.getFulltextDocId(workspaceId, doc.id),
412-
workspaceId
413-
}
414-
delete tsDoc.data
396+
const tsDoc = sanitizeDoc({ ...doc, id: this.getFulltextDocId(workspaceId, doc.id), workspaceId })
415397
return JSON.stringify(tsDoc)
416398
})
417399
.join('\n')
@@ -479,7 +461,16 @@ class TypesenseAdapter implements FullTextAdapter {
479461
while (remaining.length > 0) {
480462
const batch = remaining.splice(0, BATCH_SIZE)
481463
const jsonlLines = batch.map((doc: any) => JSON.stringify(doc)).join('\n')
482-
await this.client.collections(this.collectionName).documents().import(jsonlLines, { action: 'upsert' })
464+
const results = await this.client
465+
.collections(this.collectionName)
466+
.documents()
467+
.import(jsonlLines, { action: 'upsert' })
468+
const errors = (
469+
typeof results === 'string' ? results.split('\n').map((l: string) => JSON.parse(l)) : results
470+
).filter((r: any) => r.success === false)
471+
if (errors.length > 0) {
472+
console.error(`updateByQuery upsert errors: ${errors.map((e: any) => e.error).join('; ')}`)
473+
}
483474
}
484475
} catch (err: any) {
485476
if (isConnectionError(err)) {

pods/fulltext/src/index.ts

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,11 @@ const config: FulltextDBConfiguration = {
108108
defaultContentAdapter: 'Rekoni'
109109
}
110110

111-
const elasticIndexName = process.env.ELASTIC_INDEX_NAME ?? 'huly_storage_index'
111+
const elasticIndexName = process.env.ELASTIC_INDEX_NAME
112+
if (elasticIndexName === undefined) {
113+
console.error('ELASTIC_INDEX_NAME should be specified')
114+
process.exit(1)
115+
}
112116

113117
const servicePort = parseInt(process.env.PORT ?? '4700')
114118
metricsContext.info('Starting stats service')

0 commit comments

Comments
 (0)