Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion backend/FwLite/FwDataMiniLcmBridge/Api/LcmHelpers.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
using System.Globalization;
using MiniLcm.Culture;
using MiniLcm.Models;
using SIL.LCModel;
using SIL.LCModel.Core.KernelInterfaces;
Expand All @@ -13,7 +14,7 @@ internal static bool SearchValue(this ITsMultiString multiString, string value)
{
var tsString = multiString.GetStringFromIndex(i, out var _);
if (string.IsNullOrEmpty(tsString.Text)) continue;
if (tsString.Text.Contains(value, StringComparison.InvariantCultureIgnoreCase))
if (tsString.Text.ContainsDiacriticMatch(value))
{
return true;
}
Expand Down
33 changes: 31 additions & 2 deletions backend/FwLite/LcmCrdt.Tests/MiniLcmTests/QueryEntryTests.cs
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
namespace LcmCrdt.Tests.MiniLcmTests;
using System.Diagnostics;
using Xunit.Abstractions;

public class QueryEntryTests : QueryEntryTestsBase
namespace LcmCrdt.Tests.MiniLcmTests;

public class QueryEntryTests(ITestOutputHelper outputHelper) : QueryEntryTestsBase
{
private readonly MiniLcmApiFixture _fixture = new();

Expand All @@ -11,6 +14,32 @@ protected override async Task<IMiniLcmApi> NewApi()
return api;
}



/// <summary>
/// Rough performance guard for entry search: bulk-creates <paramref name="count"/> entries whose
/// lexeme forms are random GUID strings, runs a search that matches none of them several times,
/// and asserts on the average time spent per entry.
/// </summary>
/// <param name="count">Number of entries to create before timing the search.</param>
[Theory]
[InlineData(50_000)]
[InlineData(100_000)]
public async Task QueryPerformanceTesting(int count)
{
    var seedEntries = AsyncEnumerable.Range(0, count)
        .Select(_ => new Entry { LexemeForm = { ["en"] = Guid.NewGuid().ToString() } });
    await _fixture.Api.BulkCreateEntries(seedEntries);

    const int testIterations = 10;
    var timingStart = Stopwatch.GetTimestamp();
    for (var iteration = 0; iteration < testIterations; iteration++)
    {
        // The search must not match anything: only the cost of evaluating the match is being measured.
        var matches = await Api.SearchEntries("asdfgbope").ToArrayAsync();
        matches.Should().BeEmpty();
    }

    var elapsed = Stopwatch.GetElapsedTime(timingStart);
    var queryTime = elapsed / testIterations;
    var timePerEntry = queryTime / count;
    outputHelper.WriteLine(
        $"Total query time: {queryTime.TotalMilliseconds}ms, time per entry: {timePerEntry.TotalMicroseconds}microseconds");

    // The original author measured about 3.9 microseconds per entry locally, so 10 leaves a margin.
    timePerEntry.TotalMicroseconds.Should().BeLessThan(10);
}

public override async Task DisposeAsync()
{
await base.DisposeAsync();
Expand Down
61 changes: 61 additions & 0 deletions backend/FwLite/LcmCrdt/Data/CustomSqliteFunctionInterceptor.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
using System.Data.Common;
using System.Globalization;
using System.Runtime.CompilerServices;
using System.Text;
using Microsoft.Data.Sqlite;
using Microsoft.EntityFrameworkCore.Diagnostics;
using MiniLcm.Culture;

namespace LcmCrdt.Data;

/// <summary>
/// Connection interceptor that registers the custom <c>contains</c> SQL function on each SQLite
/// connection when it is opened, so queries can do a case-insensitive substring match that also
/// ignores diacritics unless the search text itself contains one.
/// </summary>
public class CustomSqliteFunctionInterceptor : IDbConnectionInterceptor
{
    /// <summary>Name under which the function is registered with SQLite.</summary>
    public const string ContainsFunction = "contains";

    /// <summary>
    /// Largest buffer (in chars) that is placed on the stack. Longer values are decoded into a
    /// pooled array instead, because the text length comes from the database / the caller and
    /// an unbounded <c>stackalloc</c> can overflow the stack and take the process down.
    /// </summary>
    private const int MaxStackallocChars = 256;

    /// <summary>
    /// Registers <see cref="ContainsFunction"/> on the freshly opened connection.
    /// </summary>
    /// <param name="connection">The opened connection; must be a <see cref="SqliteConnection"/>.</param>
    /// <param name="eventData">Event data supplied by EF Core (unused).</param>
    /// <exception cref="InvalidCastException">When <paramref name="connection"/> is not a <see cref="SqliteConnection"/>.</exception>
    public void ConnectionOpened(DbConnection connection, ConnectionEndEventData eventData)
    {
        var sqliteConnection = (SqliteConnection)connection;
        //creates a new function that can be used in queries
        sqliteConnection.CreateFunction(ContainsFunction,
            //in sqlite strings are byte arrays, so we can avoid allocating strings by using spans
            (byte[]? str, byte[]? value) => ContainsIgnoreCaseAccents(str, value));
    }

    /// <summary>
    /// Implementation of the SQL function: decodes both UTF-8 arguments and reports whether
    /// <paramref name="value"/> occurs in <paramref name="str"/> under invariant-culture,
    /// case-insensitive comparison. Diacritics are ignored only when the search text has none.
    /// </summary>
    /// <param name="str">UTF-8 bytes of the text being searched, or null.</param>
    /// <param name="value">UTF-8 bytes of the text to search for, or null.</param>
    /// <returns><c>false</c> when either argument is null; otherwise whether a match was found.</returns>
    private static bool ContainsIgnoreCaseAccents(byte[]? str, byte[]? value)
    {
        if (str is null || value is null) return false;

        var sourceLength = Encoding.UTF8.GetCharCount(str);
        var searchLength = Encoding.UTF8.GetCharCount(value);

        char[]? rentedSource = null;
        char[]? rentedSearch = null;
        try
        {
            // Small values stay on the stack; anything larger is decoded into a pooled array.
            Span<char> source = sourceLength <= MaxStackallocChars
                ? stackalloc char[MaxStackallocChars]
                : (rentedSource = System.Buffers.ArrayPool<char>.Shared.Rent(sourceLength));
            Span<char> search = searchLength <= MaxStackallocChars
                ? stackalloc char[MaxStackallocChars]
                : (rentedSearch = System.Buffers.ArrayPool<char>.Shared.Rent(searchLength));

            // Both buffers may be larger than needed, so trim them to the decoded length.
            source = source[..sourceLength];
            search = search[..searchLength];
            Encoding.UTF8.GetChars(str, source);
            Encoding.UTF8.GetChars(value, search);

            var options = ContainsDiacritic(search)
                ? CompareOptions.IgnoreCase
                : CompareOptions.IgnoreNonSpace | CompareOptions.IgnoreCase;
            return CultureInfo.InvariantCulture.CompareInfo.IndexOf(source, search, options) >= 0;
        }
        finally
        {
            if (rentedSource is not null) System.Buffers.ArrayPool<char>.Shared.Return(rentedSource);
            if (rentedSearch is not null) System.Buffers.ArrayPool<char>.Shared.Return(rentedSearch);
        }
    }

    /// <summary>
    /// Reports whether <paramref name="value"/> contains a non-spacing mark once decomposed (FormD).
    /// </summary>
    private static bool ContainsDiacritic(ReadOnlySpan<char> value)
    {
        // This runs for every row that is scanned. ASCII text has no decomposition and no
        // non-spacing marks, so the common case can be answered without allocating a string.
        var isAscii = true;
        foreach (var ch in value)
        {
            if (ch > '\u007F')
            {
                isAscii = false;
                break;
            }
        }
        if (isAscii) return false;

        //todo we could maybe get rid of this normalization step if the text is already normalized
        //that would mean we could just iterate the value here rather than creating a new string
        foreach (var ch in new string(value).Normalize(NormalizationForm.FormD))
        {
            if (CharUnicodeInfo.GetUnicodeCategory(ch) == UnicodeCategory.NonSpacingMark)
            {
                return true;
            }
        }
        return false;
    }

    /// <summary>
    /// Async counterpart of <see cref="ConnectionOpened"/>; the registration itself is synchronous,
    /// so this runs it inline and returns a completed task.
    /// </summary>
    public Task ConnectionOpenedAsync(DbConnection connection,
        ConnectionEndEventData eventData,
        CancellationToken cancellationToken = default)
    {
        ConnectionOpened(connection, eventData);
        return Task.CompletedTask;
    }
}
2 changes: 1 addition & 1 deletion backend/FwLite/LcmCrdt/LcmCrdtDbContext.cs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ public class LcmCrdtDbContext(DbContextOptions<LcmCrdtDbContext> dbContextOption
public IQueryable<WritingSystem> WritingSystems => Set<WritingSystem>().AsNoTracking();
protected override void OnConfiguring(DbContextOptionsBuilder optionsBuilder)
{
optionsBuilder.AddInterceptors(setupCollationInterceptor);
optionsBuilder.AddInterceptors(setupCollationInterceptor, new CustomSqliteFunctionInterceptor());
}

protected override void OnModelCreating(ModelBuilder modelBuilder)
Expand Down
13 changes: 10 additions & 3 deletions backend/FwLite/LcmCrdt/SqlHelpers.cs
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
using System.Linq.Expressions;
using System.Globalization;
using System.Linq.Expressions;
using LcmCrdt.Data;
using LinqToDB;
using LinqToDB.DataProvider.SQLite;
using MiniLcm.Culture;

namespace LcmCrdt;

Expand All @@ -19,11 +23,14 @@ private static Expression<Func<MultiString, string, bool>> HasValueExpression()
[ExpressionMethod(nameof(SearchValueExpression))]
public static bool SearchValue(this MultiString ms, string search)
{
return ms.Values.Any(pair => pair.Value.Contains(search));
return ms.Values.Any(pair => pair.Value.ContainsDiacriticMatch(search));
}

private static Expression<Func<MultiString, string, bool>> SearchValueExpression()
{
return (ms, search) => Json.QueryValues(ms).Any(s => s.Contains(search));
return (ms, search) => Json.QueryValues(ms).Any(s => ContainsIgnoreCaseAccents(s, search));
}

/// <summary>
/// Substring test that ignores case, and ignores diacritics unless <paramref name="search"/>
/// contains one. The attribute maps the call to the SQL text <c>contains({0}, {1})</c>, using the
/// function name that <see cref="CustomSqliteFunctionInterceptor"/> registers on the connection.
/// The C# body is the equivalent in-memory check.
/// </summary>
/// <param name="s">Text to search in.</param>
/// <param name="search">Text to search for.</param>
[Sql.Expression(CustomSqliteFunctionInterceptor.ContainsFunction + "({0}, {1})")]
private static bool ContainsIgnoreCaseAccents(string s, string search)
{
    var isMatch = s.ContainsDiacriticMatch(search);
    return isMatch;
}
}
41 changes: 40 additions & 1 deletion backend/FwLite/MiniLcm.Tests/QueryEntryTestsBase.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
namespace MiniLcm.Tests;
using System.Text;

namespace MiniLcm.Tests;

public abstract class QueryEntryTestsBase : MiniLcmTestBase
{
Expand Down Expand Up @@ -211,4 +213,41 @@ public async Task CanFilterExampleSentenceText()
var results = await Api.GetEntries(new(Filter: new() { GridifyFilter = "Senses.ExampleSentences.Sentence[en]=*phone" })).ToArrayAsync();
results.Select(e => e.LexemeForm["en"]).Should().BeEquivalentTo(Banana);
}

/// <summary>
/// Creates an entry whose English lexeme form is <paramref name="word"/> and verifies that
/// searching for <paramref name="searchTerm"/> returns it. Covers case-insensitive matching and
/// unaccented search terms matching accented words.
/// </summary>
[Theory]
[InlineData("a", "a")]
[InlineData("a", "A")]
[InlineData("A", "Ã")]
[InlineData("ap", "apple")]
[InlineData("ap", "APPLE")]
[InlineData("ing", "walking")]
[InlineData("ing", "WALKING")]
[InlineData("Ãp", "Ãpple")]
[InlineData("Ãp", "ãpple")]
[InlineData("ap", "Ãpple")]
public async Task SuccessfulMatches(string searchTerm, string word)
{
    // Both inputs are put into decomposed form (FormD) before use.
    // Open question from the original author: should the search term be normalized internally instead?
    searchTerm = searchTerm.Normalize(NormalizationForm.FormD);
    word = word.Normalize(NormalizationForm.FormD);

    var entry = new Entry { LexemeForm = { ["en"] = word } };
    await Api.CreateEntry(entry);

    var matchedLexemes = await Api.SearchEntries(searchTerm)
        .Select(found => found.LexemeForm["en"])
        .ToArrayAsync();

    matchedLexemes.Should().Contain(word);
}

/// <summary>
/// Creates an entry whose English lexeme form is <paramref name="word"/> and verifies that
/// searching for <paramref name="searchTerm"/> does NOT return it.
/// </summary>
[Theory]
[InlineData("a", "b")]
[InlineData("ab", "b")]
[InlineData("Ã", "A")] // Accented should not match base
[InlineData("apple", "orange")] // Completely different words
[InlineData("É", "È")] // Different accents
public async Task NegativeMatches(string searchTerm, string word)
{
    // Both inputs are put into decomposed form (FormD) before use.
    // Open question from the original author: should the search term be normalized internally instead?
    searchTerm = searchTerm.Normalize(NormalizationForm.FormD);
    word = word.Normalize(NormalizationForm.FormD);

    var entry = new Entry { LexemeForm = { ["en"] = word } };
    await Api.CreateEntry(entry);

    var matchedLexemes = await Api.SearchEntries(searchTerm)
        .Select(found => found.LexemeForm["en"])
        .ToArrayAsync();

    matchedLexemes.Should().NotContain(word);
}
}
52 changes: 52 additions & 0 deletions backend/FwLite/MiniLcm/Culture/StringExtensions.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
using System.Globalization;
using System.Text;

namespace MiniLcm.Culture;

/// <summary>
/// Culture-aware comparison helpers for strings, including a substring search that only
/// distinguishes diacritics when the search text contains one.
/// </summary>
public static class StringExtensions
{
    /// <summary>
    /// Reports whether <paramref name="value"/> occurs in <paramref name="str"/> according to the
    /// comparison rules of <paramref name="cultureInfo"/>.
    /// </summary>
    /// <param name="str">source of the search</param>
    /// <param name="value">string to search for</param>
    /// <param name="cultureInfo">culture whose <see cref="CompareInfo"/> performs the search</param>
    /// <param name="comparison">comparison options passed through to the search</param>
    public static bool Contains(this string str, string value, CultureInfo cultureInfo, CompareOptions comparison = CompareOptions.None)
        => cultureInfo.CompareInfo.IndexOf(str, value, comparison) >= 0;

    /// <summary>
    /// searches a string for a match ignoring diacritics, but only when the search string does not contain diacritics
    /// </summary>
    /// <param name="str">source of the search</param>
    /// <param name="search">string to search for</param>
    public static bool ContainsDiacriticMatch(this string str, string search)
    {
        // Case is always ignored; non-spacing marks are ignored only for a search text without any.
        var options = CompareOptions.IgnoreCase;
        if (!ContainsDiacritic(search))
        {
            options |= CompareOptions.IgnoreNonSpace;
        }

        return CultureInfo.InvariantCulture.CompareInfo.IndexOf(str, search, options) >= 0;
    }

    /// <summary>
    /// Reports whether <paramref name="value"/>, once decomposed (FormD), contains a non-spacing mark.
    /// </summary>
    /// <param name="value">text to inspect</param>
    public static bool ContainsDiacritic(string value)
    {
        var decomposed = value.Normalize(NormalizationForm.FormD);
        for (var index = 0; index < decomposed.Length; index++)
        {
            if (CharUnicodeInfo.GetUnicodeCategory(decomposed[index]) == UnicodeCategory.NonSpacingMark)
            {
                return true;
            }
        }

        return false;
    }

    /// <summary>
    /// Reports whether the two strings compare as equal under the comparison rules of
    /// <paramref name="cultureInfo"/>.
    /// </summary>
    /// <param name="str">first string</param>
    /// <param name="value">second string</param>
    /// <param name="cultureInfo">culture whose <see cref="CompareInfo"/> performs the comparison</param>
    /// <param name="comparison">comparison options passed through to the comparison</param>
    public static bool Equals(this string str,
        string value,
        CultureInfo cultureInfo,
        CompareOptions comparison = CompareOptions.None)
        => cultureInfo.CompareInfo.Compare(str, value, comparison) == 0;
}
Loading