Skip to content

Commit 1ef0c01

Browse files
authored
Sort prefix matches above other contain matches (#2163)
* Sort prefix matches above other contain matches
* Optimize fwdata sorting
1 parent 624ea9c commit 1ef0c01

7 files changed

Lines changed: 88 additions & 50 deletions

File tree

backend/FwLite/FwDataMiniLcmBridge/Api/Sorting.cs

Lines changed: 18 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -7,25 +7,32 @@ namespace FwDataMiniLcmBridge.Api;
77
internal static class Sorting
88
{
99
/// <summary>
10-
/// crude emulation of FTS search relevance
10+
/// Rough emulation of FTS search relevance. Headword matches come first, preferring
11+
/// prefix matches (e.g. when searching "tan" then "tanan" is before "matan"), then shorter, then alphabetical.
12+
/// See also: EntrySearchService.FilterAndRank for the FTS-based equivalent in LcmCrdt.
1113
/// </summary>
1214
public static IEnumerable<ILexEntry> ApplyRoughBestMatchOrder(this IEnumerable<ILexEntry> entries, SortOptions order, int sortWsHandle, string? query = null)
1315
{
16+
var projected = entries.Select(e => (Entry: e, Headword: e.LexEntryHeadword(sortWsHandle)));
1417
if (order.Ascending)
1518
{
16-
return entries
17-
.OrderByDescending(e => !string.IsNullOrEmpty(query) && (e.LexEntryHeadword(sortWsHandle)?.ContainsDiacriticMatch(query!) ?? false))
18-
.ThenBy(e => e.LexEntryHeadword(sortWsHandle)?.Length ?? 0)
19-
.ThenBy(e => e.LexEntryHeadword(sortWsHandle))
20-
.ThenBy(e => e.Id.Guid);
19+
return projected
20+
.OrderByDescending(x => !string.IsNullOrEmpty(query) && (x.Headword?.ContainsDiacriticMatch(query!) ?? false))
21+
.ThenByDescending(x => !string.IsNullOrEmpty(query) && (x.Headword?.StartsWithDiacriticMatch(query!) ?? false))
22+
.ThenBy(x => x.Headword?.Length ?? 0)
23+
.ThenBy(x => x.Headword)
24+
.ThenBy(x => x.Entry.Id.Guid)
25+
.Select(x => x.Entry);
2126
}
2227
else
2328
{
24-
return entries
25-
.OrderBy(e => !string.IsNullOrEmpty(query) && (e.LexEntryHeadword(sortWsHandle)?.ContainsDiacriticMatch(query!) ?? false))
26-
.ThenByDescending(e => e.LexEntryHeadword(sortWsHandle)?.Length ?? 0)
27-
.ThenByDescending(e => e.LexEntryHeadword(sortWsHandle))
28-
.ThenByDescending(e => e.Id.Guid);
29+
return projected
30+
.OrderBy(x => !string.IsNullOrEmpty(query) && (x.Headword?.ContainsDiacriticMatch(query!) ?? false))
31+
.ThenBy(x => !string.IsNullOrEmpty(query) && (x.Headword?.StartsWithDiacriticMatch(query!) ?? false))
32+
.ThenByDescending(x => x.Headword?.Length ?? 0)
33+
.ThenByDescending(x => x.Headword)
34+
.ThenByDescending(x => x.Entry.Id.Guid)
35+
.Select(x => x.Entry);
2936
}
3037
}
3138
}

backend/FwLite/LcmCrdt/Data/CustomSqliteFunctionInterceptor.cs

Lines changed: 25 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ namespace LcmCrdt.Data;
1010
public class CustomSqliteFunctionInterceptor : IDbConnectionInterceptor, IConnectionInterceptor
1111
{
1212
public const string ContainsFunction = "contains";
13+
public const string StartsWithFunction = "startsWith";
1314

1415
public void ConnectionOpened(DbConnection connection, ConnectionEndEventData eventData)
1516
{
@@ -49,15 +50,13 @@ private void ConnectionOpened(DbConnection connection)
4950
{
5051
if (connection is SqliteConnection sqliteConnection)
5152
{
52-
RegisterContainsFunction(sqliteConnection);
53+
RegisterCustomFunctions(sqliteConnection);
5354
}
5455
}
5556

56-
public static void RegisterContainsFunction(SqliteConnection sqliteConnection)
57+
public static void RegisterCustomFunctions(SqliteConnection sqliteConnection)
5758
{
58-
//creates a new function that can be used in queries
5959
sqliteConnection.CreateFunction(ContainsFunction,
60-
//in sqlite strings are byte arrays, so we can avoid allocating strings by using spans
6160
(byte[]? str, byte[]? value) =>
6261
{
6362
if (str is null || value is null) return false;
@@ -66,13 +65,29 @@ public static void RegisterContainsFunction(SqliteConnection sqliteConnection)
6665
Span<char> search = stackalloc char[Encoding.UTF8.GetCharCount(value)];
6766
Encoding.UTF8.GetChars(str, source);
6867
Encoding.UTF8.GetChars(value, search);
69-
return CultureInfo.InvariantCulture.CompareInfo.IndexOf(source,
70-
search,
71-
ContainsDiacritic(search)
72-
? CompareOptions.IgnoreCase
73-
: CompareOptions.IgnoreNonSpace | CompareOptions.IgnoreCase
74-
) >= 0;
68+
var options = DiacriticMatchOptions(search);
69+
return CultureInfo.InvariantCulture.CompareInfo.IndexOf(source, search, options) >= 0;
7570
});
71+
72+
sqliteConnection.CreateFunction(StartsWithFunction,
73+
(byte[]? str, byte[]? value) =>
74+
{
75+
if (str is null || value is null) return false;
76+
77+
Span<char> source = stackalloc char[Encoding.UTF8.GetCharCount(str)];
78+
Span<char> search = stackalloc char[Encoding.UTF8.GetCharCount(value)];
79+
Encoding.UTF8.GetChars(str, source);
80+
Encoding.UTF8.GetChars(value, search);
81+
var options = DiacriticMatchOptions(search);
82+
return CultureInfo.InvariantCulture.CompareInfo.IsPrefix(source, search, options);
83+
});
84+
}
85+
86+
private static CompareOptions DiacriticMatchOptions(in ReadOnlySpan<char> search)
87+
{
88+
return ContainsDiacritic(search)
89+
? CompareOptions.IgnoreCase
90+
: CompareOptions.IgnoreCase | CompareOptions.IgnoreNonSpace;
7691
}
7792

7893
private static bool ContainsDiacritic(in ReadOnlySpan<char> value)

backend/FwLite/LcmCrdt/Data/Sorting.cs

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,17 @@ namespace LcmCrdt.Data;
33
public static class Sorting
44
{
55
/// <summary>
6-
/// crude emulation of FTS search relevance
6+
/// Rough search relevance for when FTS is unavailable. Headword matches come first, preferring
7+
/// prefix matches (e.g. when searching "tan" then "tanan" is before "matan"), then shorter, then alphabetical.
8+
/// See also: <see cref="FullTextSearch.EntrySearchService.FilterAndRank"/> for the FTS-based equivalent.
79
/// </summary>
810
public static IQueryable<Entry> ApplyRoughBestMatchOrder(this IQueryable<Entry> entries, SortOptions order, string? query = null)
911
{
1012
if (order.Ascending)
1113
{
1214
return entries
1315
.OrderByDescending(e => !string.IsNullOrEmpty(query) && SqlHelpers.ContainsIgnoreCaseAccents(e.Headword(order.WritingSystem), query!))
16+
.ThenByDescending(e => !string.IsNullOrEmpty(query) && SqlHelpers.StartsWithIgnoreCaseAccents(e.Headword(order.WritingSystem), query!))
1417
.ThenBy(e => e.Headword(order.WritingSystem).Length)
1518
.ThenBy(e => e.Headword(order.WritingSystem))
1619
.ThenBy(e => e.Id);
@@ -19,6 +22,7 @@ public static IQueryable<Entry> ApplyRoughBestMatchOrder(this IQueryable<Entry>
1922
{
2023
return entries
2124
.OrderBy(e => !string.IsNullOrEmpty(query) && SqlHelpers.ContainsIgnoreCaseAccents(e.Headword(order.WritingSystem), query!))
25+
.ThenBy(e => !string.IsNullOrEmpty(query) && SqlHelpers.StartsWithIgnoreCaseAccents(e.Headword(order.WritingSystem), query!))
2226
.ThenByDescending(e => e.Headword(order.WritingSystem).Length)
2327
.ThenByDescending(e => e.Headword(order.WritingSystem))
2428
.ThenByDescending(e => e.Id);

backend/FwLite/LcmCrdt/FullTextSearch/EntrySearchService.cs

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -34,32 +34,31 @@ public IQueryable<Entry> Filter(IQueryable<Entry> queryable, string query)
3434
return FilterInternal(queryable, query).Select(t => t.Entry);
3535
}
3636

37+
/// <summary>
38+
/// Filters and ranks entries using FTS. Headword matches come first, preferring prefix matches
39+
/// (e.g. when searching "tan" then "tanan" is before "matan"), then shorter headwords, then alphabetical.
40+
/// Non-headword matches (gloss, definition) fall back to FTS rank.
41+
/// See also: <see cref="Sorting.ApplyRoughBestMatchOrder"/> for the non-FTS equivalent.
42+
/// </summary>
3743
public IQueryable<Entry> FilterAndRank(IQueryable<Entry> queryable,
3844
string query,
3945
WritingSystemId wsId)
4046
{
4147
var filtered = FilterInternal(queryable, query);
4248
var ordered = filtered
43-
.OrderBy(t =>
44-
// First headword matches... (this allows headword matches to trump long text penalizations)
45-
t.HeadwordMatches
46-
// ...in order of length (e.g. so exact matches are first).
47-
? t.SearchRecord.Headword.Length
48-
// Everything else falls back to FTS rank.
49-
: int.MaxValue)
49+
.OrderByDescending(t => t.HeadwordMatches)
50+
.ThenByDescending(t => t.HeadwordPrefixMatches)
51+
.ThenBy(t => t.HeadwordMatches ? t.SearchRecord.Headword.Length : int.MaxValue)
5052
.ThenBy(t =>
51-
// For headword matches of identical length...
5253
t.HeadwordMatches
53-
// ...order by headword text (this prevents confusing results like "maap", "baap", "maap").
5454
? t.SearchRecord.Headword.CollateUnicode(wsId)
55-
// Everything else falls back to FTS rank.
5655
: string.Empty)
5756
.ThenBy(t => Sql.Ext.SQLite().Rank(t.SearchRecord)).ThenBy(t => t.Entry.Id);
5857

5958
return ordered.Select(t => t.Entry);
6059
}
6160

62-
private sealed record FilterProjection(Entry Entry, EntrySearchRecord SearchRecord, bool HeadwordMatches);
61+
private sealed record FilterProjection(Entry Entry, EntrySearchRecord SearchRecord, bool HeadwordMatches, bool HeadwordPrefixMatches);
6362

6463
private IQueryable<FilterProjection> FilterInternal(IQueryable<Entry> queryable, string query)
6564
{
@@ -74,7 +73,8 @@ where Sql.Ext.SQLite().Match(searchRecord, ftsString) &&
7473
|| entry.CitationForm.SearchValue(query)
7574
|| entry.Senses.Any(s => s.Gloss.SearchValue(query)))
7675
let headwordMatches = SqlHelpers.ContainsIgnoreCaseAccents(searchRecord.Headword, query)
77-
select new FilterProjection(entry, searchRecord, headwordMatches);
76+
let headwordPrefixMatches = SqlHelpers.StartsWithIgnoreCaseAccents(searchRecord.Headword, query)
77+
select new FilterProjection(entry, searchRecord, headwordMatches, headwordPrefixMatches);
7878
}
7979

8080
private static string ToFts5LiteralString(string query)

backend/FwLite/LcmCrdt/SqlHelpers.cs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,4 +33,7 @@ private static Expression<Func<MultiString, string, bool>> SearchValueExpression
3333

3434
[Sql.Expression(CustomSqliteFunctionInterceptor.ContainsFunction + "({0}, {1})")]
3535
public static bool ContainsIgnoreCaseAccents(string s, string search) => s.ContainsDiacriticMatch(search);
36+
37+
[Sql.Expression(CustomSqliteFunctionInterceptor.StartsWithFunction + "({0}, {1})")]
38+
public static bool StartsWithIgnoreCaseAccents(string s, string search) => s.StartsWithDiacriticMatch(search);
3639
}

backend/FwLite/MiniLcm.Tests/QueryEntryTestsBase.cs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -439,10 +439,10 @@ public async Task NegativeMatches(string searchTerm, string word)
439439
[InlineData("word1", "word1", "word1")]
440440
[InlineData("app", "app,apple,banana", "app,apple")]
441441
[InlineData("apple", "app,apple,banana", "apple")]
442-
[InlineData("att", "battery,att,attack,zatt,rap:pratt", "att,zatt,attack,battery,rap")]
442+
[InlineData("att", "battery,att,attack,zatt,rap:pratt", "att,attack,zatt,battery,rap")]
443443
[InlineData("a", "a,da,ma,aa,c:a,ti:a", "a,aa,da,ma,c,ti")]//test non fts search
444444
[InlineData("ap", "app,apple,banana", "app,apple")]//test non fts search
445-
[InlineData("at", "battery,att,attack,zatt,rap:pratt", "att,zatt,attack,battery,rap")] //test non fts search
445+
[InlineData("at", "battery,att,attack,zatt,rap:pratt", "att,attack,zatt,battery,rap")] //test non fts search
446446
// matching headwords trump glosses, even if the headword match is penalized for other long fields
447447
[InlineData("aap", "maap-aap,maap,liaap,aap:to-penalize-the-bm25-rank-this-gloss-is-very-very-very-very-very-very-very-very-very-long", "aap,maap,liaap,maap-aap")]
448448
// matching headwords of the same length are ordered alphabetically

backend/FwLite/MiniLcm/Culture/StringExtensions.cs

Lines changed: 23 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -5,27 +5,36 @@ namespace MiniLcm.Culture;
55

66
public static class StringExtensions
77
{
8-
public static bool Contains(this string str, string value, CultureInfo cultureInfo, CompareOptions comparison = CompareOptions.None)
8+
/// <summary>
9+
/// Checks if <paramref name="str"/> contains <paramref name="search"/>, ignoring case.
10+
/// Diacritics are also ignored unless the search string itself contains diacritics.
11+
/// </summary>
12+
public static bool ContainsDiacriticMatch(this string str, string search)
913
{
10-
return cultureInfo.CompareInfo.IndexOf(str, value, comparison) >= 0;
14+
var options = DiacriticMatchOptions(search);
15+
return CultureInfo.InvariantCulture.CompareInfo.Contains(str, search, options);
1116
}
1217

1318
/// <summary>
14-
/// searches a string for a match ignoring diacritics, but only when the search string does not contain diacritics
19+
/// Checks if <paramref name="str"/> starts with <paramref name="search"/>, ignoring case.
20+
/// Diacritics are also ignored unless the search string itself contains diacritics.
1521
/// </summary>
16-
/// <param name="str">source of the search</param>
17-
/// <param name="search">string to search for</param>
18-
public static bool ContainsDiacriticMatch(this string str, string search)
22+
public static bool StartsWithDiacriticMatch(this string str, string search)
1923
{
20-
if (ContainsDiacritic(search))
21-
{
22-
return Contains(str, search, CultureInfo.InvariantCulture, CompareOptions.IgnoreCase);
23-
}
24+
var options = DiacriticMatchOptions(search);
25+
return CultureInfo.InvariantCulture.CompareInfo.IsPrefix(str, search, options);
26+
}
2427

25-
return Contains(str,
26-
search,
27-
CultureInfo.InvariantCulture,
28-
CompareOptions.IgnoreCase | CompareOptions.IgnoreNonSpace);
28+
private static bool Contains(this CompareInfo compareInfo, string source, string value, CompareOptions options)
29+
{
30+
return compareInfo.IndexOf(source, value, options) >= 0;
31+
}
32+
33+
private static CompareOptions DiacriticMatchOptions(string search)
34+
{
35+
return ContainsDiacritic(search)
36+
? CompareOptions.IgnoreCase
37+
: CompareOptions.IgnoreCase | CompareOptions.IgnoreNonSpace;
2938
}
3039

3140
public static bool ContainsDiacritic(string value)

0 commit comments

Comments
 (0)